/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_analysis.h"
#include "brw_eu.h"
#include "brw_shader.h"
#include "brw_builder.h"
#include "brw_nir.h"
#include "brw_cfg.h"
#include "brw_rt.h"
#include "brw_private.h"
#include "intel_nir.h"
#include "shader_enums.h"
#include "dev/intel_debug.h"
#include "dev/intel_wa.h"
#include "compiler/glsl_types.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_math.h"
void
brw_shader::emit_urb_writes(const brw_reg &gs_vertex_count)
{
   int slot, urb_offset, length;
   int starting_urb_offset = 0;
   const struct brw_vue_prog_data *vue_prog_data =
      brw_vue_prog_data(this->prog_data);
   const struct intel_vue_map *vue_map = &vue_prog_data->vue_map;
   bool flush;
   brw_reg sources[8];
   brw_reg urb_handle;

   switch (stage) {
   case MESA_SHADER_VERTEX:
      urb_handle = vs_payload().urb_handles;
      break;
   case MESA_SHADER_TESS_EVAL:
      urb_handle = tes_payload().urb_output;
      break;
   case MESA_SHADER_GEOMETRY:
      urb_handle = gs_payload().urb_handles;
      break;
   default:
      UNREACHABLE("invalid stage");
   }

   const brw_builder bld = brw_builder(this);

   brw_reg per_slot_offsets;

   if (stage == MESA_SHADER_GEOMETRY) {
      const struct brw_gs_prog_data *gs_prog_data =
         brw_gs_prog_data(this->prog_data);

      /* We need to increment the Global Offset to skip over the control data
       * header and the extra "Vertex Count" field (1 HWord) at the beginning
       * of the VUE.  We're counting in OWords, so the units are doubled.
       */
      starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
      if (gs_prog_data->static_vertex_count == -1)
         starting_urb_offset += 2;
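      /* For example, a GS with a 1-HWord control data header and a dynamic
       * (non-static) vertex count skips 2 * 1 + 2 = 4 OWords before its
       * first vertex slot.
       */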

      /* The URB offset is in 128-bit units, so we need to multiply by 2 */
      const int output_vertex_size_owords =
         gs_prog_data->output_vertex_size_hwords * 2;

      /* On Xe2+ platforms the LSC can operate on DWord data elements with
       * byte offset granularity, so convert the per-slot offset to bytes
       * there (it is otherwise in OWord, i.e. 16-byte, units); keep it in
       * OWord units on previous platforms.
       */
      const int output_vertex_size = devinfo->ver >= 20 ?
         output_vertex_size_owords * 16 :
         output_vertex_size_owords;
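      /* E.g. a 2-HWord output vertex gives a per-vertex stride of 4 OWords
       * on pre-Xe2 parts and 4 * 16 = 64 bytes on Xe2+.
       */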

      if (gs_vertex_count.file == IMM) {
         per_slot_offsets = brw_imm_ud(output_vertex_size *
                                       gs_vertex_count.ud);
      } else {
         per_slot_offsets = bld.vgrf(BRW_TYPE_UD);
         bld.MUL(per_slot_offsets, gs_vertex_count,
                 brw_imm_ud(output_vertex_size));
      }
   }

   length = 0;
   urb_offset = starting_urb_offset;
   flush = false;

   /* SSO shaders can have VUE slots allocated which are never actually
    * written to, so ignore them when looking for the last (written) slot.
    */
   int last_slot = vue_map->num_slots - 1;
   while (last_slot > 0 &&
          (vue_map->slot_to_varying[last_slot] == BRW_VARYING_SLOT_PAD ||
           outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) {
      last_slot--;
   }

   bool urb_written = false;
   for (slot = 0; slot < vue_map->num_slots; slot++) {
      int varying = vue_map->slot_to_varying[slot];
      switch (varying) {
      case VARYING_SLOT_PSIZ: {
         /* The point size varying slot is the vue header and is always in the
          * vue map.  If anything in the header is going to be read back by HW,
          * we need to initialize it, in particular the viewport & layer
          * values.
          *
          * SKL PRMs, Volume 7: 3D-Media-GPGPU, Vertex URB Entry (VUE)
          * Formats:
          *
          *    "VUEs are written in two ways:
          *
          *     - At the top of the 3D Geometry pipeline, the VF's
          *       InputAssembly function creates VUEs and initializes them
          *       from data extracted from Vertex Buffers as well as
          *       internally generated data.
          *
          *     - VS, GS, HS and DS threads can compute, format, and write
          *       new VUEs as thread output."
          *
          *    "Software must ensure that any VUEs subject to readback by the
          *     3D pipeline start with a valid Vertex Header.  This extends to
          *     all VUEs with the following exceptions:
          *
          *     - If the VS function is enabled, the VF-written VUEs are not
          *       required to have Vertex Headers, as the VS-incoming
          *       vertices are guaranteed to be consumed by the VS (i.e.,
          *       the VS thread is responsible for overwriting the input
          *       vertex data).
          *
          *     - If the GS FF is enabled, neither VF-written VUEs nor VS
          *       thread-generated VUEs are required to have Vertex Headers,
          *       as the GS will consume all incoming vertices.
          *
          *     - If Rendering is disabled, VertexHeaders are not required
          *       anywhere."
          */
         brw_reg zero =
            retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
         bld.MOV(zero, brw_imm_ud(0u));
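
         /* The four sources queued below fill the vertex header DWords in
          * order: shading rate (or zero), render target array index (layer),
          * viewport index, and point size.
          */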
         if (vue_map->slots_valid & VARYING_BIT_PRIMITIVE_SHADING_RATE &&
             this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE].file != BAD_FILE) {
            sources[length++] = this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE];
         } else if (devinfo->has_coarse_pixel_primitive_and_cb) {
            uint32_t one_fp16 = 0x3C00;
            brw_reg one_by_one_fp16 =
               retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
            bld.MOV(one_by_one_fp16, brw_imm_ud((one_fp16 << 16) | one_fp16));
            sources[length++] = one_by_one_fp16;
         } else {
            sources[length++] = zero;
         }

         if (vue_map->slots_valid & VARYING_BIT_LAYER)
            sources[length++] = this->outputs[VARYING_SLOT_LAYER];
         else
            sources[length++] = zero;

         if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
            sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
         else
            sources[length++] = zero;

         if (vue_map->slots_valid & VARYING_BIT_PSIZ)
            sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
         else
            sources[length++] = zero;
         break;
      }
      case VARYING_SLOT_EDGE:
         UNREACHABLE("unexpected scalar vs output");
         break;

      default:
         /* gl_Position is always in the vue map, but isn't always written by
          * the shader.  Other varyings (clip distances) get added to the vue
          * map but don't always get written.  In those cases, the
          * corresponding this->output[] slot will be invalid and we can skip
          * the urb write for the varying.  If we've already queued up a vue
          * slot for writing we flush a mlen 5 urb write, otherwise we just
          * advance the urb_offset.
          */
         if (varying == BRW_VARYING_SLOT_PAD ||
             this->outputs[varying].file == BAD_FILE) {
            if (length > 0)
               flush = true;
            else
               urb_offset++;
            break;
         }

         int slot_offset = 0;

         /* When using Primitive Replication, there may be multiple slots
          * assigned to POS.
          */
         if (varying == VARYING_SLOT_POS)
            slot_offset = slot - vue_map->varying_to_slot[VARYING_SLOT_POS];

         for (unsigned i = 0; i < 4; i++) {
            sources[length++] = offset(this->outputs[varying], bld,
                                       i + (slot_offset * 4));
         }
         break;
      }

      const brw_builder abld = bld.annotate("URB write");

      /* If we've queued up 8 registers of payload (2 VUE slots), if this is
       * the last slot or if we need to flush (see BAD_FILE varying case
       * above), emit a URB write send now to flush out the data.
       */
      if (length == 8 || (length > 0 && slot == last_slot))
         flush = true;
      if (flush) {
         brw_reg srcs[URB_LOGICAL_NUM_SRCS];

         srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offsets;
         srcs[URB_LOGICAL_SRC_DATA] =
            retype(brw_allocate_vgrf_units(*this, (dispatch_width / 8) * length), BRW_TYPE_F);
         srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
         abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);

         brw_inst *inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
                                    srcs, ARRAY_SIZE(srcs));

         /* For Wa_1805992985 one needs an additional write at the end. */
         if (intel_needs_workaround(devinfo, 1805992985) && stage == MESA_SHADER_TESS_EVAL)
            inst->eot = false;
         else
            inst->eot = slot == last_slot && stage != MESA_SHADER_GEOMETRY;

         inst->offset = urb_offset;
         urb_offset = starting_urb_offset + slot + 1;
         length = 0;
         flush = false;
         urb_written = true;
      }
   }

   /* If we don't have any valid slots to write, just do a minimal urb write
    * send to terminate the shader.  This includes 1 slot of undefined data,
    * because it's invalid to write 0 data:
    *
    * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
    * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
    * Write Data Payload:
    *
    *    "The write data payload can be between 1 and 8 message phases long."
    */
   if (!urb_written) {
      /* For GS, just turn EmitVertex() into a no-op.  We don't want it to
       * end the thread, and emit_gs_thread_end() already emits a SEND with
       * EOT at the end of the program for us.
       */
      if (stage == MESA_SHADER_GEOMETRY)
         return;

      brw_reg uniform_urb_handle =
         retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
      brw_reg payload =
         retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);

      bld.exec_all().MOV(uniform_urb_handle, urb_handle);

      brw_reg srcs[URB_LOGICAL_NUM_SRCS];
      srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
      srcs[URB_LOGICAL_SRC_DATA] = payload;
      srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1);

      brw_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
                                srcs, ARRAY_SIZE(srcs));
      inst->eot = true;
      inst->offset = 1;
      return;
   }

   /* Wa_1805992985:
    *
    * GPU hangs on one of tessellation vkcts tests with DS not done.  The
    * send cycle, which is a urb write with an eot, must be 4 phases long and
    * all 8 lanes must be valid.
    */
   if (intel_needs_workaround(devinfo, 1805992985) && stage == MESA_SHADER_TESS_EVAL) {
      assert(dispatch_width == 8);
      brw_reg uniform_urb_handle = retype(brw_allocate_vgrf_units(*this, 1), BRW_TYPE_UD);
      brw_reg uniform_mask = retype(brw_allocate_vgrf_units(*this, 1), BRW_TYPE_UD);
      brw_reg payload = retype(brw_allocate_vgrf_units(*this, 4), BRW_TYPE_UD);

      /* Workaround requires all 8 channels (lanes) to be valid.  This is
       * understood to mean they all need to be alive.  First trick is to find
       * a live channel and copy its urb handle for all the other channels to
       * make sure all handles are valid.
       */
      bld.exec_all().MOV(uniform_urb_handle, bld.emit_uniformize(urb_handle));

      /* Second trick is to use a masked URB write where one can tell the HW
       * to actually write data only for selected channels even though all
       * are active.
       * Third trick is to take advantage of the must-be-zero (MBZ) area in
       * the very beginning of the URB.
       *
       * One masks data to be written only for the first channel and uses
       * offset zero explicitly to land the data in the MBZ area, avoiding
       * trashing any other part of the URB.
       *
       * Since the WA says that the write needs to be 4 phases long, one uses
       * 4 slots of data.  All are explicitly zeros in order to keep the MBZ
       * area written as zeros.
       */
      bld.exec_all().MOV(uniform_mask, brw_imm_ud(0x10000u));
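      /* 0x10000 has only bit 16 set; assuming the usual URB channel-mask
       * encoding with the per-DWord write enables in the high half of the
       * mask DWord, this enables just the first DWord of the slot.
       */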
      bld.exec_all().MOV(offset(payload, bld, 0), brw_imm_ud(0u));
      bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0u));
      bld.exec_all().MOV(offset(payload, bld, 2), brw_imm_ud(0u));
      bld.exec_all().MOV(offset(payload, bld, 3), brw_imm_ud(0u));

      brw_reg srcs[URB_LOGICAL_NUM_SRCS];
      srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
      srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = uniform_mask;
      srcs[URB_LOGICAL_SRC_DATA] = payload;
      srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(4);

      brw_inst *inst = bld.exec_all().emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
                                           reg_undef, srcs, ARRAY_SIZE(srcs));
      inst->eot = true;
      inst->offset = 0;
   }
}

void
brw_shader::emit_cs_terminate()
{
   const brw_builder ubld = brw_builder(this).exec_all();

   /* We can't directly send from g0, since sends with EOT have to use
    * g112-127.  So, copy it to a virtual register; the register allocator
    * will make sure it uses the appropriate register range.
    */
   struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_TYPE_UD);
   brw_reg payload =
      retype(brw_allocate_vgrf_units(*this, reg_unit(devinfo)), BRW_TYPE_UD);
   ubld.group(8 * reg_unit(devinfo), 0).MOV(payload, g0);

   /* Set the descriptor to "Dereference Resource" and "Root Thread" */
   unsigned desc = 0;

   /* Set Resource Select to "Do not dereference URB" on Gfx < 11.
    *
    * Note that even though the thread has a URB resource associated with it,
    * we set the "do not dereference URB" bit, because the URB resource is
    * managed by the fixed-function unit, so it will free it automatically.
    */
   if (devinfo->ver < 11)
      desc |= (1 << 4); /* Do not dereference URB */

   brw_reg srcs[4] = {
      brw_imm_ud(desc), /* desc */
      brw_imm_ud(0),    /* ex_desc */
      payload,          /* payload */
      brw_reg(),        /* payload2 */
   };

   brw_inst *send = ubld.emit(SHADER_OPCODE_SEND, reg_undef, srcs, 4);

   /* On Alchemist and later, send an EOT message to the message gateway to
    * terminate a compute shader.  For older GPUs, send to the thread spawner.
    */
   send->sfid = devinfo->verx10 >= 125 ? BRW_SFID_MESSAGE_GATEWAY
                                       : BRW_SFID_THREAD_SPAWNER;
   send->mlen = reg_unit(devinfo);
   send->eot = true;
}

brw_shader::brw_shader(const struct brw_compiler *compiler,
                       const struct brw_compile_params *params,
                       const brw_base_prog_key *key,
                       struct brw_stage_prog_data *prog_data,
                       const nir_shader *shader,
                       unsigned dispatch_width,
                       bool needs_register_pressure,
                       bool debug_enabled)
   : compiler(compiler), log_data(params->log_data),
     devinfo(compiler->devinfo), nir(shader),
     mem_ctx(params->mem_ctx),
     cfg(NULL), stage(shader->info.stage),
     debug_enabled(debug_enabled),
     key(key), prog_data(prog_data),
     live_analysis(this), regpressure_analysis(this),
     performance_analysis(this), idom_analysis(this), def_analysis(this),
     ip_ranges_analysis(this),
     needs_register_pressure(needs_register_pressure),
     dispatch_width(dispatch_width),
     max_polygons(0),
     api_subgroup_size(brw_nir_api_subgroup_size(shader, dispatch_width))
{
   init();
}

brw_shader::brw_shader(const struct brw_compiler *compiler,
                       const struct brw_compile_params *params,
                       const brw_wm_prog_key *key,
                       struct brw_wm_prog_data *prog_data,
                       const nir_shader *shader,
                       unsigned dispatch_width, unsigned max_polygons,
                       bool needs_register_pressure,
                       bool debug_enabled)
   : compiler(compiler), log_data(params->log_data),
     devinfo(compiler->devinfo), nir(shader),
     mem_ctx(params->mem_ctx),
     cfg(NULL), stage(shader->info.stage),
     debug_enabled(debug_enabled),
     key(&key->base), prog_data(&prog_data->base),
     live_analysis(this), regpressure_analysis(this),
     performance_analysis(this), idom_analysis(this), def_analysis(this),
     ip_ranges_analysis(this),
     needs_register_pressure(needs_register_pressure),
     dispatch_width(dispatch_width),
     max_polygons(max_polygons),
     api_subgroup_size(brw_nir_api_subgroup_size(shader, dispatch_width))
{
   init();
   assert(api_subgroup_size == 0 ||
          api_subgroup_size == 8 ||
          api_subgroup_size == 16 ||
          api_subgroup_size == 32);
}

void
brw_shader::init()
{
   this->max_dispatch_width = 32;

   this->failed = false;
   this->fail_msg = NULL;

   this->payload_ = NULL;
   this->source_depth_to_render_target = false;
   this->first_non_payload_grf = 0;

   this->uniforms = 0;
   this->last_scratch = 0;

   memset(&this->shader_stats, 0, sizeof(this->shader_stats));

   this->grf_used = 0;
   this->spilled_any_registers = false;

   this->phase = BRW_SHADER_PHASE_INITIAL;

   this->next_address_register_nr = 1;

   this->alloc.capacity = 0;
   this->alloc.sizes = NULL;
   this->alloc.count = 0;

   this->gs.control_data_bits_per_vertex = 0;
   this->gs.control_data_header_size_bits = 0;

   memset(&this->fs.per_primitive_offsets, -1,
          sizeof(this->fs.per_primitive_offsets));
}

brw_shader::~brw_shader()
{
   delete this->payload_;
}

void
brw_shader::vfail(const char *format, va_list va)
{
   char *msg;

   if (failed)
      return;

   failed = true;

   msg = ralloc_vasprintf(mem_ctx, format, va);
   msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
                         dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);

   this->fail_msg = msg;

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "%s", msg);
   }
}

void
brw_shader::fail(const char *format, ...)
{
   va_list va;

   va_start(va, format);
   vfail(format, va);
   va_end(va);
}

/**
 * Mark this program as impossible to compile with dispatch width greater
 * than n.
 *
 * During the SIMD8 compile (which happens first), we can detect and flag
 * things that are unsupported in SIMD16+ mode, so the compiler can skip the
 * SIMD16+ compile altogether.
 *
 * During a compile of dispatch width greater than n (if one happens anyway),
 * this just calls fail().
 */
void
brw_shader::limit_dispatch_width(unsigned n, const char *msg)
{
   if (dispatch_width > n) {
      fail("%s", msg);
   } else {
      max_dispatch_width = MIN2(max_dispatch_width, n);
      brw_shader_perf_log(compiler, log_data,
                          "Shader dispatch width limited to SIMD%d: %s\n",
                          n, msg);
   }
}

/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 * This brings in those uniform definitions.
 */
void
brw_shader::import_uniforms(brw_shader *v)
{
   this->uniforms = v->uniforms;
}

/* Bring in the per-primitive input offsets computed for an earlier compile
 * of this shader, mirroring what import_uniforms() does for uniforms.
 */
void
brw_shader::import_per_primitive_offsets(const int *per_primitive_offsets)
{
   memcpy(this->fs.per_primitive_offsets, per_primitive_offsets,
          sizeof(this->fs.per_primitive_offsets));
}

enum intel_barycentric_mode
brw_barycentric_mode(const struct brw_wm_prog_key *key,
                     nir_intrinsic_instr *intr)
{
   const glsl_interp_mode mode =
      (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);

   /* Barycentric modes don't make sense for flat inputs. */
   assert(mode != INTERP_MODE_FLAT);

   unsigned bary;
   switch (intr->intrinsic) {
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_at_offset:
      /* When per sample interpolation is dynamic, assume sample
       * interpolation.  We'll dynamically remap things so that the FS thread
       * payload is not affected.
       */
      bary = key->persample_interp == INTEL_SOMETIMES ?
             INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE :
             INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL;
      break;
   case nir_intrinsic_load_barycentric_centroid:
      bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID;
      break;
   case nir_intrinsic_load_barycentric_sample:
   case nir_intrinsic_load_barycentric_at_sample:
      bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE;
      break;
   default:
      UNREACHABLE("invalid intrinsic");
   }

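   /* This relies on intel_barycentric_mode listing the three nonperspective
    * modes immediately after their perspective counterparts, so a flat +3
    * converts one to the other (see the enum definition).
    */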
   if (mode == INTERP_MODE_NOPERSPECTIVE)
      bary += 3;

   return (enum intel_barycentric_mode) bary;
}

/**
 * Walk backwards from the end of the program looking for a URB write that
 * isn't in control flow, and mark it with EOT.
 *
 * Return true if successful or false if a separate EOT write is needed.
 */
bool
brw_shader::mark_last_urb_write_with_eot()
{
   brw_foreach_in_list_reverse(brw_inst, prev, &this->instructions) {
      if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
         prev->eot = true;

         /* Delete now dead instructions. */
         brw_foreach_in_list_reverse_safe(brw_exec_node, dead, &this->instructions) {
            if (dead == prev)
               break;
            dead->remove();
         }
         return true;
      } else if (prev->is_control_flow() || prev->has_side_effects()) {
         break;
      }
   }

   return false;
}
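
/* Round a component (DWord) count up to a whole number of GRFs.  For
 * example, 20 components occupy DIV_ROUND_UP(20, 8) = 3 registers when
 * reg_unit(devinfo) == 1 (32-byte GRFs) and DIV_ROUND_UP(20, 16) * 2 = 4
 * when reg_unit(devinfo) == 2 (64-byte GRFs).
 */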
static unsigned
round_components_to_whole_registers(const intel_device_info *devinfo,
                                    unsigned c)
{
   return DIV_ROUND_UP(c, 8 * reg_unit(devinfo)) * reg_unit(devinfo);
}

void
brw_shader::assign_curb_setup()
{
   unsigned uniform_push_length =
      round_components_to_whole_registers(devinfo, prog_data->nr_params);

   unsigned ubo_push_length = 0;
   unsigned ubo_push_start[4];
   for (int i = 0; i < 4; i++) {
      ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
      ubo_push_length += prog_data->ubo_ranges[i].length;

      assert(ubo_push_start[i] % (8 * reg_unit(devinfo)) == 0);
      assert(ubo_push_length % (1 * reg_unit(devinfo)) == 0);
   }
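
   /* For example, assuming reg_unit(devinfo) == 1 (32-byte GRFs),
    * uniform_push_length == 2 and UBO range lengths {3, 1, 0, 0}:
    * ubo_push_start becomes {16, 40, 48, 48} and ubo_push_length ends up
    * as 4 registers.
    */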

   prog_data->curb_read_length = uniform_push_length + ubo_push_length;
   if (stage == MESA_SHADER_FRAGMENT &&
       ((struct brw_wm_prog_key *)key)->null_push_constant_tbimr_workaround)
      prog_data->curb_read_length = MAX2(1, prog_data->curb_read_length);

   uint64_t used = 0;
   const bool pull_constants =
      devinfo->verx10 >= 125 &&
      (mesa_shader_stage_is_compute(stage) ||
       mesa_shader_stage_is_mesh(stage)) &&
      uniform_push_length;

   if (pull_constants) {
      const bool pull_constants_a64 =
         (gl_shader_stage_is_rt(stage) &&
          brw_bs_prog_data(prog_data)->uses_inline_push_addr) ||
         ((mesa_shader_stage_is_compute(stage) ||
           mesa_shader_stage_is_mesh(stage)) &&
          brw_cs_prog_data(prog_data)->uses_inline_push_addr);
      assert(devinfo->has_lsc);
      brw_builder ubld = brw_builder(this, 1).exec_all().at_start(cfg->first_block());

      brw_reg base_addr;
      if (pull_constants_a64) {
         /* The address of the push constants is at offset 0 in the inline
          * parameter.
          */
         base_addr =
            gl_shader_stage_is_rt(stage) ?
            retype(bs_payload().inline_parameter, BRW_TYPE_UQ) :
            retype(cs_payload().inline_parameter, BRW_TYPE_UQ);
      } else {
         /* The base offset for our push data is passed in as R0.0[31:6].  We
          * have to mask off the bottom 6 bits.
          */
         base_addr = ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
                              brw_imm_ud(INTEL_MASK(31, 6)));
      }

      /* On Gfx12-HP we load constants at the start of the program using A32
       * stateless messages.
       */
      for (unsigned i = 0; i < uniform_push_length;) {
         /* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
         unsigned num_regs = MIN2(uniform_push_length - i, 8);
         assert(num_regs > 0);
         num_regs = 1 << util_logbase2(num_regs);
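         /* Rounding down to a power of two means each load covers exactly
          * 1, 2, 4 or 8 whole GRFs.
          */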

         brw_reg addr;

         if (i != 0) {
            if (pull_constants_a64) {
               /* We need to do the carry manually as when this pass is run,
                * we're not expecting any 64bit ALUs.  Unfortunately all the
                * 64bit lowering is done in NIR.
                */
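               /* In other words: add the low DWords, detect wraparound with
                * an unsigned compare against the original low DWord, and
                * conditionally add 1 to the high DWord.
                */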
               addr = ubld.vgrf(BRW_TYPE_UQ);
               brw_reg addr_ldw = subscript(addr, BRW_TYPE_UD, 0);
               brw_reg addr_udw = subscript(addr, BRW_TYPE_UD, 1);
               brw_reg base_addr_ldw = subscript(base_addr, BRW_TYPE_UD, 0);
               brw_reg base_addr_udw = subscript(base_addr, BRW_TYPE_UD, 1);
               ubld.ADD(addr_ldw, base_addr_ldw, brw_imm_ud(i * REG_SIZE));
               ubld.CMP(ubld.null_reg_d(), addr_ldw, base_addr_ldw, BRW_CONDITIONAL_L);
               set_predicate(BRW_PREDICATE_NORMAL,
                             ubld.ADD(addr_udw, base_addr_udw, brw_imm_ud(1)));
               set_predicate_inv(BRW_PREDICATE_NORMAL, true,
                                 ubld.MOV(addr_udw, base_addr_udw));
            } else {
               addr = ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
            }
         } else {
            addr = base_addr;
         }

         brw_reg srcs[4] = {
            brw_imm_ud(0), /* desc */
            brw_imm_ud(0), /* ex_desc */
            addr,          /* payload */
            brw_reg(),     /* payload2 */
         };

         brw_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0),
                               BRW_TYPE_UD);
         brw_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);

         send->sfid = BRW_SFID_UGM;
         uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
                                      LSC_ADDR_SURFTYPE_FLAT,
                                      pull_constants_a64 ?
                                      LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32,
|
2024-02-29 20:51:50 +02:00
|
|
|
LSC_DATA_SIZE_D32,
|
|
|
|
|
num_regs * 8 /* num_channels */,
|
|
|
|
|
true /* transpose */,
|
|
|
|
|
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
|
2022-07-15 13:08:23 +03:00
|
|
|
send->header_size = 0;
|
2024-04-24 16:14:16 +03:00
|
|
|
send->mlen = lsc_msg_addr_len(
|
|
|
|
|
devinfo, pull_constants_a64 ?
|
|
|
|
|
LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32, 1);
|
2022-07-15 13:08:23 +03:00
|
|
|
send->size_written =
|
2022-09-28 16:17:02 -07:00
|
|
|
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
|
2024-04-24 16:14:16 +03:00
|
|
|
assert((payload().num_regs + i + send->size_written / REG_SIZE) <=
|
|
|
|
|
(payload().num_regs + prog_data->curb_read_length));
|
2020-05-04 16:17:58 -05:00
|
|
|
send->send_is_volatile = true;
|
|
|
|
|
|
2024-02-29 20:51:50 +02:00
|
|
|
send->src[0] = brw_imm_ud(desc |
|
|
|
|
|
brw_message_desc(devinfo,
|
|
|
|
|
send->mlen,
|
|
|
|
|
send->size_written / REG_SIZE,
|
|
|
|
|
send->header_size));
|
|
|
|
|
|
2020-05-04 16:17:58 -05:00
|
|
|
i += num_regs;
|
|
|
|
|
}
|
|
|
|
|
|
2024-12-06 20:52:05 -08:00
|
|
|
invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);
|
2020-05-04 16:17:58 -05:00
|
|
|
}
|
|
|
|
|
|
2010-08-26 16:39:41 -07:00
|
|
|
/* Map the offsets in the UNIFORM file to fixed HW regs. */
|
2024-12-07 00:23:07 -08:00
|
|
|
foreach_block_and_inst(block, brw_inst, inst, cfg) {
|
2014-03-17 10:39:43 -07:00
|
|
|
for (unsigned int i = 0; i < inst->sources; i++) {
|
2010-08-26 16:39:41 -07:00
|
|
|
if (inst->src[i].file == UNIFORM) {
|
2016-09-01 12:42:20 -07:00
|
|
|
int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
|
2014-03-11 14:35:27 -07:00
|
|
|
int constant_nr;
|
2016-11-29 05:20:20 -08:00
|
|
|
if (inst->src[i].nr >= UBO_START) {
|
|
|
|
|
/* constant_nr is in 32-bit units, the rest are in bytes */
|
|
|
|
|
constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
|
|
|
|
|
inst->src[i].offset / 4;
|
|
|
|
|
} else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
|
2025-01-01 23:52:33 -08:00
|
|
|
constant_nr = uniform_nr;
|
2014-03-11 14:35:27 -07:00
|
|
|
} else {
|
|
|
|
|
/* Section 5.11 of the OpenGL 4.1 spec says:
|
|
|
|
|
* "Out-of-bounds reads return undefined values, which include
|
|
|
|
|
* values from other variables of the active program or zero."
|
|
|
|
|
* Just return the first push constant.
|
|
|
|
|
*/
|
|
|
|
|
constant_nr = 0;
|
|
|
|
|
}
|
|
|
|
|
|
2020-04-03 20:20:53 -05:00
|
|
|
assert(constant_nr / 8 < 64);
|
|
|
|
|
used |= BITFIELD64_BIT(constant_nr / 8);
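/* e.g. with payload().num_regs == 2 and constant_nr == 13, the value is
 * read from GRF g3, dword subregister 5 (2 + 13/8 = 3, 13 % 8 = 5).
 */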
|
|
|
|
|
|
2022-08-19 12:40:20 -07:00
|
|
|
struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs +
|
2010-08-27 14:15:42 -07:00
|
|
|
constant_nr / 8,
|
|
|
|
|
constant_nr % 8);
|
2015-10-24 15:29:03 -07:00
|
|
|
brw_reg.abs = inst->src[i].abs;
|
|
|
|
|
brw_reg.negate = inst->src[i].negate;
|
2010-08-26 16:39:41 -07:00
|
|
|
|
2024-02-01 15:02:37 -08:00
|
|
|
/* The combination of is_scalar for load_uniform, copy prop, and
|
|
|
|
|
* lower_btd_logical_send can generate a MOV from a UNIFORM with
|
|
|
|
|
* exec size 2 and stride of 1.
|
|
|
|
|
*/
|
|
|
|
|
assert(inst->src[i].stride == 0 || inst->exec_size == 2);
|
2015-10-24 15:29:03 -07:00
|
|
|
inst->src[i] = byte_offset(
|
2013-12-08 04:57:08 +01:00
|
|
|
retype(brw_reg, inst->src[i].type),
|
2016-09-01 15:11:21 -07:00
|
|
|
inst->src[i].offset % 4);
|
2010-08-26 16:39:41 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2014-10-03 19:05:32 -07:00
|
|
|
|
2024-02-19 23:07:04 -08:00
|
|
|
uint64_t want_zero = used & prog_data->zero_push_reg;
|
2020-04-03 20:20:53 -05:00
|
|
|
if (want_zero) {
|
2025-04-02 16:12:45 -07:00
|
|
|
brw_builder ubld = brw_builder(this, 8).exec_all().at_start(cfg->first_block());
|
2020-04-03 20:20:53 -05:00
|
|
|
|
|
|
|
|
/* push_reg_mask_param is in 32-bit units */
|
2024-02-19 23:07:04 -08:00
|
|
|
unsigned mask_param = prog_data->push_reg_mask_param;
|
2022-08-19 12:40:20 -07:00
|
|
|
struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
|
|
|
|
|
mask_param % 8);
|
2020-04-03 20:20:53 -05:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg b32;
|
2020-04-03 20:20:53 -05:00
|
|
|
for (unsigned i = 0; i < 64; i++) {
|
|
|
|
|
if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
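/* Expand 16 bits of the push register mask into per-register channel
 * masks: the two SHLs place mask bit (i + j) in the sign bit of word
 * channel j, and the ASR by 15 replicates that bit across the channel,
 * so b32 ends up holding 0x0 or 0xffffffff per push register for the
 * AND below.
 */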
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg shifted = ubld.vgrf(BRW_TYPE_W, 2);
|
2020-04-03 20:20:53 -05:00
|
|
|
ubld.SHL(horiz_offset(shifted, 8),
|
2024-04-20 17:08:02 -07:00
|
|
|
byte_offset(retype(mask, BRW_TYPE_W), i / 8),
|
2020-04-03 20:20:53 -05:00
|
|
|
brw_imm_v(0x01234567));
|
|
|
|
|
ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
|
|
|
|
|
|
2024-12-29 15:41:04 -08:00
|
|
|
brw_builder ubld16 = ubld.group(16, 0);
|
2024-04-20 17:08:02 -07:00
|
|
|
b32 = ubld16.vgrf(BRW_TYPE_D);
|
2020-04-03 20:20:53 -05:00
|
|
|
ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (want_zero & BITFIELD64_BIT(i)) {
|
|
|
|
|
assert(i < prog_data->curb_read_length);
|
|
|
|
|
struct brw_reg push_reg =
|
2024-04-20 17:30:23 -07:00
|
|
|
retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D);
|
2020-04-03 20:20:53 -05:00
|
|
|
|
|
|
|
|
ubld.AND(push_reg, push_reg, component(b32, i % 16));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-12-06 20:52:05 -08:00
|
|
|
invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);
|
2020-04-03 20:20:53 -05:00
|
|
|
}
|
|
|
|
|
|
2014-10-03 19:05:32 -07:00
|
|
|
/* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
|
2022-08-19 12:40:20 -07:00
|
|
|
this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
|
2010-08-26 16:39:41 -07:00
|
|
|
}
|
|
|
|
|
|
2018-12-11 18:45:43 +01:00
|
|
|
/*
|
|
|
|
|
* Build up an array of indices into the urb_setup array that
|
|
|
|
|
* references the active entries of the urb_setup array.
|
|
|
|
|
* Used to accelerate walking the active entries of the urb_setup array
|
|
|
|
|
* on each upload.
|
|
|
|
|
*/
|
|
|
|
|
void
|
|
|
|
|
brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
|
|
|
|
|
{
|
2021-10-29 12:56:22 -07:00
|
|
|
/* TODO(mesh): Review usage of this in the context of Mesh; we may want to
|
|
|
|
|
* skip per-primitive attributes here.
|
|
|
|
|
*/
|
|
|
|
|
|
2018-12-11 18:45:43 +01:00
|
|
|
/* Make sure uint8_t is sufficient */
|
|
|
|
|
STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
|
|
|
|
|
uint8_t index = 0;
|
|
|
|
|
for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
|
|
|
|
|
if (wm_prog_data->urb_setup[attr] >= 0) {
|
|
|
|
|
wm_prog_data->urb_setup_attribs[index++] = attr;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
wm_prog_data->urb_setup_attribs_count = index;
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
void
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader::convert_attr_sources_to_hw_regs(brw_inst *inst)
|
2015-03-11 23:14:31 -07:00
|
|
|
{
|
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
|
|
|
|
if (inst->src[i].file == ATTR) {
|
2022-09-12 16:49:11 -07:00
|
|
|
assert(inst->src[i].nr == 0);
|
2022-08-19 12:40:20 -07:00
|
|
|
int grf = payload().num_regs +
|
2015-03-11 23:14:31 -07:00
|
|
|
prog_data->curb_read_length +
|
2016-09-01 12:42:20 -07:00
|
|
|
inst->src[i].offset / REG_SIZE;
|
2015-03-11 23:14:31 -07:00
|
|
|
|
2024-12-07 10:54:40 -08:00
|
|
|
/* As explained at brw_lower_vgrf_to_fixed_grf, from the Haswell PRM:
|
2016-03-23 12:20:05 +01:00
|
|
|
*
|
|
|
|
|
* VertStride must be used to cross GRF register boundaries. This
|
|
|
|
|
* rule implies that elements within a 'Width' cannot cross GRF
|
|
|
|
|
* boundaries.
|
|
|
|
|
*
|
|
|
|
|
* So, for registers that are large enough, we have to split the exec
|
|
|
|
|
* size in two and trust the compression state to sort it out.
|
|
|
|
|
*/
|
|
|
|
|
unsigned total_size = inst->exec_size *
|
|
|
|
|
inst->src[i].stride *
|
2024-04-21 00:57:59 -07:00
|
|
|
brw_type_size_bytes(inst->src[i].type);
|
2016-03-23 12:20:05 +01:00
|
|
|
|
|
|
|
|
assert(total_size <= 2 * REG_SIZE);
|
|
|
|
|
const unsigned exec_size =
|
|
|
|
|
(total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
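/* e.g. a 16-wide D-type source with stride 1 covers 16 * 4 = 64 bytes
 * (2 * REG_SIZE), so exec_size drops to 8 and the region built below
 * becomes <8;8,1>.
 */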
|
|
|
|
|
|
|
|
|
|
unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
|
2015-10-24 15:29:03 -07:00
|
|
|
struct brw_reg reg =
|
2015-03-11 23:14:31 -07:00
|
|
|
stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
|
2016-09-01 15:11:21 -07:00
|
|
|
inst->src[i].offset % REG_SIZE),
|
2016-03-23 12:20:05 +01:00
|
|
|
exec_size * inst->src[i].stride,
|
2015-11-11 22:37:53 -08:00
|
|
|
width, inst->src[i].stride);
|
2015-10-24 15:29:03 -07:00
|
|
|
reg.abs = inst->src[i].abs;
|
|
|
|
|
reg.negate = inst->src[i].negate;
|
|
|
|
|
|
|
|
|
|
inst->src[i] = reg;
|
2015-03-11 23:14:31 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-30 00:47:32 -07:00
|
|
|
int
|
|
|
|
|
brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
|
|
|
|
|
const brw_stage_prog_data *prog_data)
|
2017-09-29 12:22:48 -07:00
|
|
|
{
|
|
|
|
|
if (prog_data->nr_params == 0)
|
|
|
|
|
return -1;
|
|
|
|
|
|
2021-03-29 13:43:47 -07:00
|
|
|
if (devinfo->verx10 >= 125)
|
2020-06-16 23:06:25 -05:00
|
|
|
return -1;
|
|
|
|
|
|
2017-09-29 12:22:48 -07:00
|
|
|
/* The local thread id is always the last parameter in the list */
|
|
|
|
|
uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
|
2017-08-24 11:40:31 -07:00
|
|
|
if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
|
2017-09-29 12:22:48 -07:00
|
|
|
return prog_data->nr_params - 1;
|
|
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
|
2019-08-25 23:59:25 -07:00
|
|
|
uint32_t
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_fb_write_msg_control(const brw_inst *inst,
|
2019-08-25 23:59:25 -07:00
|
|
|
const struct brw_wm_prog_data *prog_data)
|
|
|
|
|
{
|
|
|
|
|
uint32_t mctl;
|
|
|
|
|
|
2024-02-19 19:41:48 -08:00
|
|
|
if (prog_data->dual_src_blend) {
|
2022-07-22 17:33:12 -07:00
|
|
|
assert(inst->exec_size < 32);
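/* Dual-source writes are sent as 8-wide halves; the message control
 * encodes whether this half covers subspans 0-1 or 2-3 of the 16-wide
 * group.
 */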
|
2019-08-25 23:59:25 -07:00
|
|
|
|
|
|
|
|
if (inst->group % 16 == 0)
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
|
|
|
|
|
else if (inst->group % 16 == 8)
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
|
|
|
|
|
else
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("Invalid dual-source FB write instruction group");
|
2019-08-25 23:59:25 -07:00
|
|
|
} else {
|
|
|
|
|
assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
|
|
|
|
|
|
|
|
|
|
if (inst->exec_size == 16)
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
|
|
|
|
|
else if (inst->exec_size == 8)
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
|
2022-07-22 17:33:12 -07:00
|
|
|
else if (inst->exec_size == 32)
|
|
|
|
|
mctl = XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE;
|
2019-08-25 23:59:25 -07:00
|
|
|
else
|
2025-07-23 09:17:35 +02:00
|
|
|
UNREACHABLE("Invalid FB write execution size");
|
2019-08-25 23:59:25 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return mctl;
|
|
|
|
|
}
|
|
|
|
|
|
2016-03-12 18:50:24 -08:00
|
|
|
void
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader::invalidate_analysis(brw_analysis_dependency_class c)
|
2016-03-12 18:50:24 -08:00
|
|
|
{
|
2016-03-13 16:25:57 -07:00
|
|
|
live_analysis.invalidate(c);
|
2016-03-13 16:35:49 -07:00
|
|
|
regpressure_analysis.invalidate(c);
|
2024-12-29 16:09:03 -08:00
|
|
|
performance_analysis.invalidate(c);
|
2024-02-19 22:25:16 -08:00
|
|
|
idom_analysis.invalidate(c);
|
2023-11-16 01:16:45 -08:00
|
|
|
def_analysis.invalidate(c);
|
2025-03-07 23:05:43 -08:00
|
|
|
ip_ranges_analysis.invalidate(c);
|
2016-03-12 18:50:24 -08:00
|
|
|
}
|
|
|
|
|
|
2023-08-06 15:46:12 +03:00
|
|
|
void
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_shader::debug_optimizer(const nir_shader *nir,
|
2023-08-14 16:59:17 -07:00
|
|
|
const char *pass_name,
|
2023-08-06 15:46:12 +03:00
|
|
|
int iteration, int pass_num) const
|
|
|
|
|
{
|
2025-05-16 23:28:04 +00:00
|
|
|
/* source_hash is not readily accessible in this context */
|
|
|
|
|
if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER, 0))
|
2023-08-06 15:46:12 +03:00
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
char *filename;
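/* The dump file is named <path>/<stage><width>-<shader>-<iter>-<pass#>-<pass>,
 * e.g. ./FS16-main-96-00-post_ra_alloc for a SIMD16 fragment shader.
 */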
|
2023-08-07 17:06:49 +03:00
|
|
|
int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
|
|
|
|
|
debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
|
2023-09-24 21:38:47 -07:00
|
|
|
_mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
|
2023-08-06 15:46:12 +03:00
|
|
|
iteration, pass_num, pass_name);
|
|
|
|
|
if (ret == -1)
|
|
|
|
|
return;
|
2024-12-07 09:53:31 -08:00
|
|
|
|
|
|
|
|
FILE *file = stderr;
|
|
|
|
|
if (__normal_user()) {
|
|
|
|
|
file = fopen(filename, "w");
|
|
|
|
|
if (!file)
|
|
|
|
|
file = stderr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
brw_print_instructions(*this, file);
|
|
|
|
|
|
|
|
|
|
if (file != stderr)
|
|
|
|
|
fclose(file);
|
|
|
|
|
|
2023-08-06 15:46:12 +03:00
|
|
|
free(filename);
|
|
|
|
|
}
|
|
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
static uint32_t
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_compute_max_register_pressure(brw_shader &s)
|
2023-02-03 17:02:28 +01:00
|
|
|
{
|
2024-12-06 21:20:58 -08:00
|
|
|
const brw_register_pressure &rp = s.regpressure_analysis.require();
|
2023-02-03 17:02:28 +01:00
|
|
|
uint32_t ip = 0, max_pressure = 0;
|
2024-12-07 00:23:07 -08:00
|
|
|
foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
|
2023-02-03 17:02:28 +01:00
|
|
|
max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
|
|
|
|
|
ip++;
|
|
|
|
|
}
|
|
|
|
|
return max_pressure;
|
|
|
|
|
}
|
|
|
|
|
|
2024-12-07 00:23:07 -08:00
|
|
|
static brw_inst **
|
2023-08-23 02:19:06 -07:00
|
|
|
save_instruction_order(const struct cfg_t *cfg)
|
|
|
|
|
{
|
|
|
|
|
/* Before we schedule anything, stash off the instruction order as an array
|
2024-12-07 00:23:07 -08:00
|
|
|
* of brw_inst *. This way, we can reset it between scheduling passes to
|
2023-08-23 02:19:06 -07:00
|
|
|
* prevent dependencies between the different scheduling modes.
|
|
|
|
|
*/
|
2025-03-11 10:48:00 -07:00
|
|
|
int num_insts = cfg->total_instructions;
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst **inst_arr = new brw_inst * [num_insts];
|
2023-08-23 02:19:06 -07:00
|
|
|
|
|
|
|
|
int ip = 0;
|
2024-12-07 00:23:07 -08:00
|
|
|
foreach_block_and_inst(block, brw_inst, inst, cfg) {
|
2023-08-23 02:19:06 -07:00
|
|
|
inst_arr[ip++] = inst;
|
|
|
|
|
}
|
|
|
|
|
assert(ip == num_insts);
|
|
|
|
|
|
|
|
|
|
return inst_arr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
2024-12-07 00:23:07 -08:00
|
|
|
restore_instruction_order(struct cfg_t *cfg, brw_inst **inst_arr)
|
2023-08-23 02:19:06 -07:00
|
|
|
{
|
2025-03-11 10:48:00 -07:00
|
|
|
ASSERTED int num_insts = cfg->total_instructions;
|
2023-08-23 02:19:06 -07:00
|
|
|
|
|
|
|
|
int ip = 0;
|
|
|
|
|
foreach_block (block, cfg) {
|
|
|
|
|
block->instructions.make_empty();
|
|
|
|
|
|
2025-03-11 13:20:09 -07:00
|
|
|
for (unsigned i = 0; i < block->num_instructions; i++)
|
|
|
|
|
block->instructions.push_tail(inst_arr[ip++]);
|
2023-08-23 02:19:06 -07:00
|
|
|
}
|
|
|
|
|
assert(ip == num_insts);
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-28 13:59:35 -08:00
|
|
|
/* Per-thread scratch space is a power-of-two multiple of 1KB. */
|
|
|
|
|
static inline unsigned
|
|
|
|
|
brw_get_scratch_size(int size)
|
|
|
|
|
{
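/* e.g. 3000 bytes of scratch rounds up to 4096, and anything at or
 * below 1KB still reserves 1024 bytes.
 */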
|
|
|
|
|
return MAX2(1024, util_next_power_of_two(size));
|
|
|
|
|
}
|
|
|
|
|
|
2014-11-13 16:28:19 -08:00
|
|
|
void
|
2024-12-07 10:25:45 -08:00
|
|
|
brw_allocate_registers(brw_shader &s, bool allow_spilling)
|
2014-11-13 16:28:19 -08:00
|
|
|
{
|
2024-07-12 16:55:33 -07:00
|
|
|
const struct intel_device_info *devinfo = s.devinfo;
|
|
|
|
|
const nir_shader *nir = s.nir;
|
2019-05-09 14:44:16 -05:00
|
|
|
bool allocated;
|
2014-11-13 16:28:19 -08:00
|
|
|
|
2024-12-06 22:39:15 -08:00
|
|
|
static const enum brw_instruction_scheduler_mode pre_modes[] = {
|
|
|
|
|
BRW_SCHEDULE_PRE,
|
|
|
|
|
BRW_SCHEDULE_PRE_NON_LIFO,
|
|
|
|
|
BRW_SCHEDULE_NONE,
|
|
|
|
|
BRW_SCHEDULE_PRE_LIFO,
|
2014-11-13 16:28:19 -08:00
|
|
|
};
|
|
|
|
|
|
2016-10-17 14:12:28 -07:00
|
|
|
static const char *scheduler_mode_name[] = {
|
2024-12-06 22:39:15 -08:00
|
|
|
[BRW_SCHEDULE_PRE] = "top-down",
|
|
|
|
|
[BRW_SCHEDULE_PRE_NON_LIFO] = "non-lifo",
|
|
|
|
|
[BRW_SCHEDULE_PRE_LIFO] = "lifo",
|
|
|
|
|
[BRW_SCHEDULE_POST] = "post",
|
|
|
|
|
[BRW_SCHEDULE_NONE] = "none",
|
2016-10-17 14:12:28 -07:00
|
|
|
};
|
|
|
|
|
|
intel/fs: Pick the lowest register pressure schedule when spilling
We try various pre-RA scheduler modes and see if any of them allow
us to register allocate without spilling. If all of them spill,
however, we were left on the last mode: LIFO. This is unfortunately
sometimes significantly worse than other modes (such as "none").
This patch makes us instead select the pre-RA scheduling mode that
gives the lowest register pressure estimate, if none of them manage
to avoid spilling. The hope is that this scheduling will spill the
least out of all of them.
fossil-db stats (on Alchemist) speak for themselves:
Totals:
Instrs: 197297092 -> 195326552 (-1.00%); split: -1.02%, +0.03%
Cycles: 14291286956 -> 14303502596 (+0.09%); split: -0.55%, +0.64%
Spill count: 190886 -> 129204 (-32.31%); split: -33.01%, +0.70%
Fill count: 361408 -> 225038 (-37.73%); split: -39.17%, +1.43%
Scratch Memory Size: 12935168 -> 10868736 (-15.98%); split: -16.08%, +0.10%
Totals from 1791 (0.27% of 668386) affected shaders:
Instrs: 7628929 -> 5658389 (-25.83%); split: -26.50%, +0.67%
Cycles: 719326691 -> 731542331 (+1.70%); split: -10.95%, +12.65%
Spill count: 110627 -> 48945 (-55.76%); split: -56.96%, +1.20%
Fill count: 221560 -> 85190 (-61.55%); split: -63.89%, +2.34%
Scratch Memory Size: 4471808 -> 2405376 (-46.21%); split: -46.51%, +0.30%
Improves performance when using XeSS in Cyberpunk 2077 by 90% on A770.
Improves performance of Borderlands 3 by 1.54% on A770.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24707>
2023-08-14 19:32:25 -07:00
|
|
|
uint32_t best_register_pressure = UINT32_MAX;
|
2024-12-06 22:39:15 -08:00
|
|
|
enum brw_instruction_scheduler_mode best_sched = BRW_SCHEDULE_NONE;
|
2023-08-14 19:32:25 -07:00
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
brw_opt_compact_virtual_grfs(s);
|
2023-03-17 09:42:31 +02:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
if (s.needs_register_pressure)
|
|
|
|
|
s.shader_stats.max_register_pressure = brw_compute_max_register_pressure(s);
|
2023-02-03 17:02:28 +01:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
s.debug_optimizer(nir, "pre_register_allocate", 90, 90);
|
2023-08-06 15:46:12 +03:00
|
|
|
|
2021-10-13 11:21:41 +02:00
|
|
|
bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
|
2016-05-16 14:30:25 -07:00
|
|
|
|
2021-11-09 19:03:19 -06:00
|
|
|
/* Before we schedule anything, stash off the instruction order as an array
|
2024-12-07 00:23:07 -08:00
|
|
|
* of brw_inst *. This way, we can reset it between scheduling passes to
|
2021-11-09 19:03:19 -06:00
|
|
|
* prevent dependencies between the different scheduling modes.
|
|
|
|
|
*/
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst **orig_order = save_instruction_order(s.cfg);
|
|
|
|
|
brw_inst **best_pressure_order = NULL;
|
2021-11-09 19:03:19 -06:00
|
|
|
|
2023-10-20 10:32:54 -07:00
|
|
|
void *scheduler_ctx = ralloc_context(NULL);
|
2024-12-06 22:39:15 -08:00
|
|
|
brw_instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx);
|
2023-10-20 10:32:54 -07:00
|
|
|
|
2014-11-13 16:28:19 -08:00
|
|
|
/* Try each scheduling heuristic to see if it can successfully register
|
|
|
|
|
* allocate without spilling. They should be ordered by decreasing
|
|
|
|
|
* performance but increasing likelihood of allocating.
|
|
|
|
|
*/
|
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
|
2024-12-06 22:39:15 -08:00
|
|
|
enum brw_instruction_scheduler_mode sched_mode = pre_modes[i];
|
2023-08-14 19:35:32 -07:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
brw_schedule_instructions_pre_ra(s, sched, sched_mode);
|
|
|
|
|
s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
|
2014-11-13 16:28:19 -08:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i);
|
2023-08-15 01:15:26 -07:00
|
|
|
|
2014-11-13 16:28:19 -08:00
|
|
|
if (0) {
|
2024-07-12 16:55:33 -07:00
|
|
|
brw_assign_regs_trivial(s);
|
2019-05-09 14:44:16 -05:00
|
|
|
allocated = true;
|
|
|
|
|
break;
|
2014-11-13 16:28:19 -08:00
|
|
|
}
|
2019-05-09 14:44:16 -05:00
|
|
|
|
|
|
|
|
/* We should only spill registers on the last scheduling. */
|
2024-07-12 16:55:33 -07:00
|
|
|
assert(!s.spilled_any_registers);
|
2019-05-09 14:44:16 -05:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
allocated = brw_assign_regs(s, false, spill_all);
|
2019-05-09 14:44:16 -05:00
|
|
|
if (allocated)
|
2014-11-13 16:28:19 -08:00
|
|
|
break;
|
2023-08-14 19:32:25 -07:00
|
|
|
|
|
|
|
|
/* Save the maximum register pressure */
|
2024-07-12 16:55:33 -07:00
|
|
|
uint32_t this_pressure = brw_compute_max_register_pressure(s);
|
2023-08-14 19:32:25 -07:00
|
|
|
|
|
|
|
|
if (0) {
|
|
|
|
|
fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
|
|
|
|
|
scheduler_mode_name[sched_mode], this_pressure);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this_pressure < best_register_pressure) {
|
|
|
|
|
best_register_pressure = this_pressure;
|
|
|
|
|
best_sched = sched_mode;
|
|
|
|
|
delete[] best_pressure_order;
|
2024-07-12 16:55:33 -07:00
|
|
|
best_pressure_order = save_instruction_order(s.cfg);
|
2023-08-14 19:32:25 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Reset back to the original order before trying the next mode */
|
2024-07-12 16:55:33 -07:00
|
|
|
restore_instruction_order(s.cfg, orig_order);
|
2025-03-11 13:20:09 -07:00
|
|
|
|
2024-12-06 20:52:05 -08:00
|
|
|
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);
|
2023-08-14 19:32:25 -07:00
|
|
|
}
|
|
|
|
|
|
2023-10-20 10:32:54 -07:00
|
|
|
ralloc_free(scheduler_ctx);
|
|
|
|
|
|
2023-08-14 19:32:25 -07:00
|
|
|
if (!allocated) {
|
|
|
|
|
if (0) {
|
|
|
|
|
fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
|
|
|
|
|
scheduler_mode_name[best_sched]);
|
|
|
|
|
}
|
2024-07-12 16:55:33 -07:00
|
|
|
restore_instruction_order(s.cfg, best_pressure_order);
|
|
|
|
|
s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
|
2023-08-14 19:32:25 -07:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
allocated = brw_assign_regs(s, allow_spilling, spill_all);
|
2014-11-13 16:28:19 -08:00
|
|
|
}
|
|
|
|
|
|
2023-08-23 02:19:06 -07:00
|
|
|
delete[] orig_order;
|
2023-08-14 19:32:25 -07:00
|
|
|
delete[] best_pressure_order;
|
2023-08-23 02:19:06 -07:00
|
|
|
|
2019-05-09 14:44:16 -05:00
|
|
|
if (!allocated) {
|
2024-07-12 16:55:33 -07:00
|
|
|
s.fail("Failure to register allocate. Reduce number of "
|
2020-05-19 14:37:44 -07:00
|
|
|
"live scalar values to avoid this.");
|
2024-07-12 16:55:33 -07:00
|
|
|
} else if (s.spilled_any_registers) {
|
|
|
|
|
brw_shader_perf_log(s.compiler, s.log_data,
|
2021-07-29 14:27:57 -07:00
|
|
|
"%s shader triggered register spilling. "
|
|
|
|
|
"Try reducing the number of live scalar "
|
|
|
|
|
"values to improve performance.\n",
|
2024-07-12 16:55:33 -07:00
|
|
|
_mesa_shader_stage_to_string(s.stage));
|
2014-11-13 16:28:19 -08:00
|
|
|
}
|
|
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
if (s.failed)
|
2014-11-13 16:28:19 -08:00
|
|
|
return;
|
|
|
|
|
|
2024-12-18 11:07:08 -08:00
|
|
|
int pass_num = 0;
|
|
|
|
|
|
|
|
|
|
s.debug_optimizer(nir, "post_ra_alloc", 96, pass_num++);
|
2024-03-14 16:28:56 +02:00
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
brw_opt_bank_conflicts(s);
|
2017-06-15 15:23:57 -07:00
|
|
|
|
2024-12-18 11:07:08 -08:00
|
|
|
s.debug_optimizer(nir, "bank_conflict", 96, pass_num++);
|
2024-03-14 16:28:56 +02:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
brw_schedule_instructions_post_ra(s);
|
2014-11-13 16:28:19 -08:00
|
|
|
|
2024-12-18 11:07:08 -08:00
|
|
|
s.debug_optimizer(nir, "post_ra_alloc_scheduling", 96, pass_num++);
|
2024-03-14 16:28:56 +02:00
|
|
|
|
2024-04-04 16:03:34 -07:00
|
|
|
/* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
|
|
|
|
|
* of part of assign_regs since both bank conflicts optimization and post
|
|
|
|
|
* RA scheduling take advantage of distinguishing references to registers
|
|
|
|
|
* that were allocated from references that were already fixed.
|
|
|
|
|
*
|
|
|
|
|
* TODO: Change the passes above, then move this lowering to be part of
|
|
|
|
|
* assign_regs.
|
|
|
|
|
*/
|
2024-12-06 11:37:57 -08:00
|
|
|
brw_lower_vgrfs_to_fixed_grfs(s);
|
2024-04-04 16:03:34 -07:00
|
|
|
|
2024-12-18 11:07:08 -08:00
|
|
|
s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, pass_num++);
|
2016-06-09 18:13:26 -07:00
|
|
|
|
2024-11-20 08:12:52 -08:00
|
|
|
if (s.devinfo->ver >= 30) {
|
|
|
|
|
brw_lower_send_gather(s);
|
|
|
|
|
s.debug_optimizer(nir, "lower_send_gather", 96, pass_num++);
|
|
|
|
|
}
|
|
|
|
|
|
2024-08-27 10:16:11 -07:00
|
|
|
brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_REGALLOC);
|
|
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
   if (s.last_scratch > 0) {
      /* We currently only support up to 2MB of scratch space.  If we need
       * to support more eventually, the documentation suggests that we
       * could allocate a larger buffer and partition it out ourselves.
       * We'd just have to undo the hardware's address calculation by
       * subtracting (FFTID * Per Thread Scratch Space) and then adding
       * FFTID * (Larger Per Thread Scratch Space).
       *
       * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
       * Thread Group Tracking > Local Memory/Scratch Space.
       */
      if (s.last_scratch <= devinfo->max_scratch_size_per_thread) {
         /* Take the max of any previously compiled variant of the shader.
          * In the case of bindless shaders with return parts, this will
          * also take the max of all parts.
          */
         s.prog_data->total_scratch = MAX2(brw_get_scratch_size(s.last_scratch),
                                           s.prog_data->total_scratch);
      } else {
         s.fail("Scratch space required is larger than supported");
      }
   }

   if (s.failed)
      return;

   brw_lower_scoreboard(s);

   s.debug_optimizer(nir, "scoreboard", 96, pass_num++);
}

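/* Total amount of push constant data needed to dispatch a compute shader:
 * one copy of the per-thread data for every thread plus a single copy of
 * the cross-thread data.
 *
 * Worked example with illustrative numbers (not from any particular
 * shader): 2 registers (64 bytes) of per-thread data, 1 register (32
 * bytes) of cross-thread data and 8 threads give 64 * 8 + 32 = 544 bytes.
 */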
unsigned
brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
                             unsigned threads)
{
   assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
   assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
   return cs_prog_data->push.per_thread.size * threads +
          cs_prog_data->push.cross_thread.size;
}

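/* Compute the dispatch parameters for a compute shader: the SIMD width to
 * use, the number of hardware threads per workgroup, and the execution
 * mask for the final (possibly partial) thread.  The workgroup size comes
 * from prog_data unless the caller passes override_local_size (used when
 * the size is only known at dispatch time).
 */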
struct intel_cs_dispatch_info
brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
                         const struct brw_cs_prog_data *prog_data,
                         const unsigned *override_local_size)
{
   struct intel_cs_dispatch_info info = {};

   const unsigned *sizes =
      override_local_size ? override_local_size :
                            prog_data->local_size;

   const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
   assert(simd >= 0 && simd < 3);

   info.group_size = sizes[0] * sizes[1] * sizes[2];
   info.simd_size = 8u << simd;
   info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);

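   /* Execution mask for the last, possibly partial, thread of the
    * workgroup.  Illustrative example: a 100-invocation workgroup at
    * SIMD32 needs DIV_ROUND_UP(100, 32) = 4 threads; the remainder is
    * 100 & 31 = 4, so right_mask = ~0u >> (32 - 4) = 0xf.  If the
    * workgroup size is a multiple of the SIMD width, the full mask is
    * used instead.
    */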
   const uint32_t remainder = info.group_size & (info.simd_size - 1);
   if (remainder > 0)
      info.right_mask = ~0u >> (32 - remainder);
   else
      info.right_mask = ~0u >> (32 - info.simd_size);

   return info;
}

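/* Advance the shader to the next compilation phase.  Phases must be
 * entered strictly in order, and the IR is validated on every transition.
 */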
void
brw_shader_phase_update(brw_shader &s, enum brw_shader_phase phase)
{
   assert(phase == s.phase + 1);
   s.phase = phase;
   brw_validate(s);
}

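/* Decide whether a shader should be printed for the given debug flag:
 * respect the intel_shader_dump_filter source-hash filter, and skip
 * internal shaders unless NIR_DEBUG(PRINT_INTERNAL) is set.
 */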
bool
brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag,
                        uint32_t source_hash)
{
   if (intel_shader_dump_filter && intel_shader_dump_filter != source_hash) {
      return false;
   }

   return INTEL_DEBUG(debug_flag) &&
          (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
}

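/* Reserve a new virtual register number, growing the size array on demand
 * (the capacity doubles, starting at 16 entries).  The size is recorded in
 * REG_SIZE units.
 */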
static unsigned
brw_allocate_vgrf_number(brw_shader &s, unsigned size_in_REGSIZE_units)
{
   assert(size_in_REGSIZE_units > 0);

   if (s.alloc.capacity <= s.alloc.count) {
      unsigned new_cap = MAX2(16, s.alloc.capacity * 2);
      s.alloc.sizes = rerzalloc(s.mem_ctx, s.alloc.sizes, unsigned,
                                s.alloc.capacity, new_cap);
      s.alloc.capacity = new_cap;
   }

   s.alloc.sizes[s.alloc.count] = size_in_REGSIZE_units;

   return s.alloc.count++;
}

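/* Allocate a virtual register large enough to hold count values of the
 * given type, rounded up to a whole number of hardware registers.
 *
 * Illustrative example: with REG_SIZE = 32 bytes and reg_unit() = 1,
 * 20 floats (80 bytes) round up to DIV_ROUND_UP(80, 32) = 3 registers;
 * with reg_unit() = 2 (e.g. platforms whose native GRF is 64 bytes) they
 * round up to DIV_ROUND_UP(80, 64) * 2 = 4.
 */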
brw_reg
brw_allocate_vgrf(brw_shader &s, brw_reg_type type, unsigned count)
{
   const unsigned unit = reg_unit(s.devinfo);
   const unsigned size = DIV_ROUND_UP(count * brw_type_size_bytes(type),
                                      unit * REG_SIZE) * unit;
   return retype(brw_allocate_vgrf_units(s, size), type);
}

brw_reg
brw_allocate_vgrf_units(brw_shader &s, unsigned units_of_REGSIZE)
{
   return brw_vgrf(brw_allocate_vgrf_number(s, units_of_REGSIZE), BRW_TYPE_UD);
}