mesa/src/intel/compiler/brw_shader.cpp

/*
* Copyright © 2010 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_analysis.h"
#include "brw_eu.h"
#include "brw_shader.h"
#include "brw_builder.h"
#include "brw_nir.h"
#include "brw_cfg.h"
#include "brw_rt.h"
#include "brw_private.h"
#include "intel_nir.h"
#include "shader_enums.h"
#include "dev/intel_debug.h"
#include "dev/intel_wa.h"
#include "compiler/glsl_types.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_math.h"
void
brw_shader::emit_urb_writes(const brw_reg &gs_vertex_count)
{
int slot, urb_offset, length;
int starting_urb_offset = 0;
const struct brw_vue_prog_data *vue_prog_data =
brw_vue_prog_data(this->prog_data);
const struct intel_vue_map *vue_map = &vue_prog_data->vue_map;
bool flush;
brw_reg sources[8];
brw_reg urb_handle;
switch (stage) {
case MESA_SHADER_VERTEX:
urb_handle = vs_payload().urb_handles;
break;
case MESA_SHADER_TESS_EVAL:
urb_handle = tes_payload().urb_output;
break;
case MESA_SHADER_GEOMETRY:
urb_handle = gs_payload().urb_handles;
break;
default:
UNREACHABLE("invalid stage");
}
const brw_builder bld = brw_builder(this);
brw_reg per_slot_offsets;
if (stage == MESA_SHADER_GEOMETRY) {
const struct brw_gs_prog_data *gs_prog_data =
brw_gs_prog_data(this->prog_data);
/* We need to increment the Global Offset to skip over the control data
* header and the extra "Vertex Count" field (1 HWord) at the beginning
* of the VUE. We're counting in OWords, so the units are doubled.
*/
starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
if (gs_prog_data->static_vertex_count == -1)
starting_urb_offset += 2;
/* The URB offset is in 128-bit units, so we need to multiply by 2 */
const int output_vertex_size_owords =
gs_prog_data->output_vertex_size_hwords * 2;
/* On Xe2+ platforms the LSC can operate on Dword data elements with byte
* offset granularity, so convert the per-slot offset to bytes (it is
* computed in OWords, i.e. 16-byte units); on earlier platforms keep the
* per-slot offset in OWord units.
*/
const int output_vertex_size = devinfo->ver >= 20 ?
output_vertex_size_owords * 16 :
output_vertex_size_owords;
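/* For example, assuming an output vertex size of 4 HWords: that is
* 8 OWords per vertex, so the per-slot step is 8 on pre-Xe2 platforms and
* 8 * 16 = 128 bytes on Xe2+. (Illustrative numbers only.)
*/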
if (gs_vertex_count.file == IMM) {
per_slot_offsets = brw_imm_ud(output_vertex_size *
gs_vertex_count.ud);
} else {
per_slot_offsets = bld.vgrf(BRW_TYPE_UD);
bld.MUL(per_slot_offsets, gs_vertex_count,
brw_imm_ud(output_vertex_size));
}
}
length = 0;
urb_offset = starting_urb_offset;
flush = false;
/* SSO shaders can have VUE slots allocated which are never actually
* written to, so ignore them when looking for the last (written) slot.
*/
int last_slot = vue_map->num_slots - 1;
while (last_slot > 0 &&
(vue_map->slot_to_varying[last_slot] == BRW_VARYING_SLOT_PAD ||
outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) {
last_slot--;
}
bool urb_written = false;
for (slot = 0; slot < vue_map->num_slots; slot++) {
int varying = vue_map->slot_to_varying[slot];
switch (varying) {
case VARYING_SLOT_PSIZ: {
/* The point size varying slot is in the vue header and is always in the
* vue map. If anything in the header is going to be read back by HW,
* we need to initialize it, in particular the viewport & layer
* values.
*
* SKL PRMs, Volume 7: 3D-Media-GPGPU, Vertex URB Entry (VUE)
* Formats:
*
* "VUEs are written in two ways:
*
* - At the top of the 3D Geometry pipeline, the VF's
* InputAssembly function creates VUEs and initializes them
* from data extracted from Vertex Buffers as well as
* internally generated data.
*
* - VS, GS, HS and DS threads can compute, format, and write
* new VUEs as thread output."
*
* "Software must ensure that any VUEs subject to readback by the
* 3D pipeline start with a valid Vertex Header. This extends to
* all VUEs with the following exceptions:
*
* - If the VS function is enabled, the VF-written VUEs are not
* required to have Vertex Headers, as the VS-incoming
* vertices are guaranteed to be consumed by the VS (i.e.,
* the VS thread is responsible for overwriting the input
* vertex data).
*
* - If the GS FF is enabled, neither VF-written VUEs nor VS
* thread-generated VUEs are required to have Vertex Headers,
* as the GS will consume all incoming vertices.
*
* - If Rendering is disabled, VertexHeaders are not required
* anywhere."
*/
brw_reg zero =
retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
bld.MOV(zero, brw_imm_ud(0u));
if (vue_map->slots_valid & VARYING_BIT_PRIMITIVE_SHADING_RATE &&
this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE].file != BAD_FILE) {
sources[length++] = this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE];
} else if (devinfo->has_coarse_pixel_primitive_and_cb) {
uint32_t one_fp16 = 0x3C00;
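/* 0x3C00 is 1.0 in half-float; packing it into both halves of the dword
* gives a 1x1 shading rate to use when the shader doesn't write
* VARYING_SLOT_PRIMITIVE_SHADING_RATE itself.
*/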
brw_reg one_by_one_fp16 =
retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
bld.MOV(one_by_one_fp16, brw_imm_ud((one_fp16 << 16) | one_fp16));
sources[length++] = one_by_one_fp16;
} else {
sources[length++] = zero;
}
if (vue_map->slots_valid & VARYING_BIT_LAYER)
sources[length++] = this->outputs[VARYING_SLOT_LAYER];
else
sources[length++] = zero;
if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
else
sources[length++] = zero;
if (vue_map->slots_valid & VARYING_BIT_PSIZ)
sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
else
sources[length++] = zero;
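/* Together these four entries fill the VUE header slot: the shading rate
* (or the zero/1x1 fallback), the layer (RT array index), the viewport
* index, and the point size, in that order.
*/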
break;
}
case VARYING_SLOT_EDGE:
UNREACHABLE("unexpected scalar vs output");
break;
default:
/* gl_Position is always in the vue map, but isn't always written by
* the shader. Other varyings (clip distances) get added to the vue
* map but don't always get written. In those cases, the
* corresponding this->outputs[] slot will be invalid and we can skip
* the urb write for the varying. If we've already queued up a vue
* slot for writing we flush a mlen 5 urb write, otherwise we just
* advance the urb_offset.
*/
if (varying == BRW_VARYING_SLOT_PAD ||
this->outputs[varying].file == BAD_FILE) {
if (length > 0)
flush = true;
else
urb_offset++;
break;
}
int slot_offset = 0;
/* When using Primitive Replication, there may be multiple slots
* assigned to POS.
*/
if (varying == VARYING_SLOT_POS)
slot_offset = slot - vue_map->varying_to_slot[VARYING_SLOT_POS];
for (unsigned i = 0; i < 4; i++) {
sources[length++] = offset(this->outputs[varying], bld,
i + (slot_offset * 4));
}
break;
}
const brw_builder abld = bld.annotate("URB write");
/* If we've queued up 8 registers of payload (2 VUE slots), if this is
* the last slot, or if we need to flush (see the BAD_FILE varying case
* above), emit a URB write send now to flush out the data.
*/
if (length == 8 || (length > 0 && slot == last_slot))
flush = true;
if (flush) {
brw_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offsets;
srcs[URB_LOGICAL_SRC_DATA] =
retype(brw_allocate_vgrf_units(*this, (dispatch_width / 8) * length), BRW_TYPE_F);
srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
brw_inst *inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
srcs, ARRAY_SIZE(srcs));
/* For Wa_1805992985 an additional write is needed at the end. */
if (intel_needs_workaround(devinfo, 1805992985) && stage == MESA_SHADER_TESS_EVAL)
inst->eot = false;
else
inst->eot = slot == last_slot && stage != MESA_SHADER_GEOMETRY;
inst->offset = urb_offset;
urb_offset = starting_urb_offset + slot + 1;
length = 0;
flush = false;
urb_written = true;
}
}
/* If we don't have any valid slots to write, just do a minimal urb write
* send to terminate the shader. This includes 1 slot of undefined data,
* because it's invalid to write 0 data:
*
* From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
* Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
* Write Data Payload:
*
* "The write data payload can be between 1 and 8 message phases long."
*/
if (!urb_written) {
/* For GS, just turn EmitVertex() into a no-op. We don't want it to
* end the thread, and emit_gs_thread_end() already emits a SEND with
* EOT at the end of the program for us.
*/
if (stage == MESA_SHADER_GEOMETRY)
return;
brw_reg uniform_urb_handle =
retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
brw_reg payload =
retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
bld.exec_all().MOV(uniform_urb_handle, urb_handle);
brw_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
srcs[URB_LOGICAL_SRC_DATA] = payload;
srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1);
brw_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
srcs, ARRAY_SIZE(srcs));
inst->eot = true;
inst->offset = 1;
return;
}
/* Wa_1805992985:
*
* GPU hangs on one of the tessellation VK-CTS tests with DS not done. The
* send cycle, which is a URB write with EOT, must be 4 phases long and
* all 8 lanes must be valid.
*/
if (intel_needs_workaround(devinfo, 1805992985) && stage == MESA_SHADER_TESS_EVAL) {
assert(dispatch_width == 8);
brw_reg uniform_urb_handle = retype(brw_allocate_vgrf_units(*this, 1), BRW_TYPE_UD);
brw_reg uniform_mask = retype(brw_allocate_vgrf_units(*this, 1), BRW_TYPE_UD);
brw_reg payload = retype(brw_allocate_vgrf_units(*this, 4), BRW_TYPE_UD);
/* The workaround requires all 8 channels (lanes) to be valid. This is
* understood to mean they all need to be alive. The first trick is to
* find a live channel and copy its URB handle to all the other channels
* to make sure all handles are valid.
*/
bld.exec_all().MOV(uniform_urb_handle, bld.emit_uniformize(urb_handle));
/* The second trick is to use a masked URB write, where one can tell the
* HW to actually write data only for selected channels even though all
* are active.
* The third trick is to take advantage of the must-be-zero (MBZ) area at
* the very beginning of the URB.
*
* One masks the data to be written only for the first channel and uses
* offset zero explicitly to land the data in the MBZ area, avoiding
* trashing any other part of the URB.
*
* Since the WA says that the write needs to be 4 phases long, one uses
* 4 slots of data. All are explicitly zeros in order to keep the MBZ
* area written as zeros.
*/
bld.exec_all().MOV(uniform_mask, brw_imm_ud(0x10000u));
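/* The per-channel write enables live in the upper 16 bits of the channel
* mask source, so 0x10000 presumably enables only channel 0, matching the
* "first channel" masking described above.
*/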
bld.exec_all().MOV(offset(payload, bld, 0), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 2), brw_imm_ud(0u));
bld.exec_all().MOV(offset(payload, bld, 3), brw_imm_ud(0u));
brw_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = uniform_mask;
srcs[URB_LOGICAL_SRC_DATA] = payload;
srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(4);
brw_inst *inst = bld.exec_all().emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
reg_undef, srcs, ARRAY_SIZE(srcs));
inst->eot = true;
inst->offset = 0;
}
}
void
brw_shader::emit_cs_terminate()
{
const brw_builder ubld = brw_builder(this).exec_all();
/* We can't directly send from g0, since sends with EOT have to use
* g112-127. So, copy it to a virtual register; the register allocator will
* make sure it uses the appropriate register range.
*/
struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_TYPE_UD);
brw_reg payload =
retype(brw_allocate_vgrf_units(*this, reg_unit(devinfo)), BRW_TYPE_UD);
ubld.group(8 * reg_unit(devinfo), 0).MOV(payload, g0);
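/* A group of 8 * reg_unit(devinfo) UD channels copies exactly one
* physical GRF worth of g0, whether the platform uses 32-byte or 64-byte
* registers.
*/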
/* Set the descriptor to "Dereference Resource" and "Root Thread" */
unsigned desc = 0;
/* Set Resource Select to "Do not dereference URB" on Gfx < 11.
*
* Note that even though the thread has a URB resource associated with it,
* we set the "do not dereference URB" bit, because the URB resource is
* managed by the fixed-function unit, so it will free it automatically.
*/
if (devinfo->ver < 11)
desc |= (1 << 4); /* Do not dereference URB */
brw_reg srcs[4] = {
brw_imm_ud(desc), /* desc */
brw_imm_ud(0), /* ex_desc */
payload, /* payload */
brw_reg(), /* payload2 */
};
brw_inst *send = ubld.emit(SHADER_OPCODE_SEND, reg_undef, srcs, 4);
/* On Alchemist and later, send an EOT message to the message gateway to
* terminate a compute shader. For older GPUs, send to the thread spawner.
*/
send->sfid = devinfo->verx10 >= 125 ? BRW_SFID_MESSAGE_GATEWAY
: BRW_SFID_THREAD_SPAWNER;
send->mlen = reg_unit(devinfo);
send->eot = true;
}
brw_shader::brw_shader(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const brw_base_prog_key *key,
struct brw_stage_prog_data *prog_data,
const nir_shader *shader,
unsigned dispatch_width,
bool needs_register_pressure,
bool debug_enabled)
: compiler(compiler), log_data(params->log_data),
devinfo(compiler->devinfo), nir(shader),
mem_ctx(params->mem_ctx),
cfg(NULL), stage(shader->info.stage),
debug_enabled(debug_enabled),
key(key), prog_data(prog_data),
live_analysis(this), regpressure_analysis(this),
performance_analysis(this), idom_analysis(this), def_analysis(this),
ip_ranges_analysis(this),
needs_register_pressure(needs_register_pressure),
dispatch_width(dispatch_width),
max_polygons(0),
api_subgroup_size(brw_nir_api_subgroup_size(shader, dispatch_width))
{
init();
}
brw_shader::brw_shader(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const brw_wm_prog_key *key,
struct brw_wm_prog_data *prog_data,
const nir_shader *shader,
unsigned dispatch_width, unsigned max_polygons,
bool needs_register_pressure,
bool debug_enabled)
: compiler(compiler), log_data(params->log_data),
devinfo(compiler->devinfo), nir(shader),
mem_ctx(params->mem_ctx),
cfg(NULL), stage(shader->info.stage),
debug_enabled(debug_enabled),
key(&key->base), prog_data(&prog_data->base),
live_analysis(this), regpressure_analysis(this),
performance_analysis(this), idom_analysis(this), def_analysis(this),
ip_ranges_analysis(this),
needs_register_pressure(needs_register_pressure),
dispatch_width(dispatch_width),
max_polygons(max_polygons),
api_subgroup_size(brw_nir_api_subgroup_size(shader, dispatch_width))
{
init();
assert(api_subgroup_size == 0 ||
api_subgroup_size == 8 ||
api_subgroup_size == 16 ||
api_subgroup_size == 32);
}
void
brw_shader::init()
{
this->max_dispatch_width = 32;
this->failed = false;
this->fail_msg = NULL;
this->payload_ = NULL;
this->source_depth_to_render_target = false;
this->first_non_payload_grf = 0;
this->uniforms = 0;
this->last_scratch = 0;
memset(&this->shader_stats, 0, sizeof(this->shader_stats));
this->grf_used = 0;
this->spilled_any_registers = false;
this->phase = BRW_SHADER_PHASE_INITIAL;
this->next_address_register_nr = 1;
this->alloc.capacity = 0;
this->alloc.sizes = NULL;
this->alloc.count = 0;
this->gs.control_data_bits_per_vertex = 0;
this->gs.control_data_header_size_bits = 0;
memset(&this->fs.per_primitive_offsets, -1,
sizeof(this->fs.per_primitive_offsets));
}
brw_shader::~brw_shader()
{
delete this->payload_;
}
void
brw_shader::vfail(const char *format, va_list va)
{
char *msg;
if (failed)
return;
failed = true;
msg = ralloc_vasprintf(mem_ctx, format, va);
msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);
this->fail_msg = msg;
if (unlikely(debug_enabled)) {
fprintf(stderr, "%s", msg);
}
}
void
brw_shader::fail(const char *format, ...)
{
va_list va;
va_start(va, format);
vfail(format, va);
va_end(va);
}
/**
* Mark this program as impossible to compile with dispatch width greater
* than n.
*
* During the SIMD8 compile (which happens first), we can detect and flag
* things that are unsupported in SIMD16+ mode, so the compiler can skip the
* SIMD16+ compile altogether.
*
* During a compile of dispatch width greater than n (if one happens anyway),
* this just calls fail().
*/
void
brw_shader::limit_dispatch_width(unsigned n, const char *msg)
{
if (dispatch_width > n) {
fail("%s", msg);
} else {
max_dispatch_width = MIN2(max_dispatch_width, n);
brw_shader_perf_log(compiler, log_data,
"Shader dispatch width limited to SIMD%d: %s\n",
n, msg);
}
}
/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
* This brings in those uniform definitions
*/
void
brw_shader::import_uniforms(brw_shader *v)
{
this->uniforms = v->uniforms;
}
/* Bring in per-primitive input offsets computed elsewhere (e.g. for
* another dispatch width of the same shader) so this variant uses the
* same per-primitive layout.
*/
void
brw_shader::import_per_primitive_offsets(const int *per_primitive_offsets)
{
memcpy(this->fs.per_primitive_offsets, per_primitive_offsets,
sizeof(this->fs.per_primitive_offsets));
}
enum intel_barycentric_mode
brw_barycentric_mode(const struct brw_wm_prog_key *key,
nir_intrinsic_instr *intr)
{
const glsl_interp_mode mode =
(enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);
/* Barycentric modes don't make sense for flat inputs. */
assert(mode != INTERP_MODE_FLAT);
unsigned bary;
switch (intr->intrinsic) {
case nir_intrinsic_load_barycentric_pixel:
case nir_intrinsic_load_barycentric_at_offset:
/* When per sample interpolation is dynamic, assume sample
* interpolation. We'll dynamically remap things so that the FS thread
* payload is not affected.
*/
bary = key->persample_interp == INTEL_SOMETIMES ?
INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE :
INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL;
break;
case nir_intrinsic_load_barycentric_centroid:
bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID;
break;
case nir_intrinsic_load_barycentric_sample:
case nir_intrinsic_load_barycentric_at_sample:
bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE;
break;
default:
UNREACHABLE("invalid intrinsic");
}
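/* The INTEL_BARYCENTRIC_NONPERSPECTIVE_* modes follow their perspective
* counterparts in the enum, three entries apart, so adding 3 converts a
* PERSPECTIVE_* value into the matching NONPERSPECTIVE_* one.
*/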
if (mode == INTERP_MODE_NOPERSPECTIVE)
bary += 3;
return (enum intel_barycentric_mode) bary;
}
/**
* Walk backwards from the end of the program looking for a URB write that
* isn't in control flow, and mark it with EOT.
*
* Return true if successful or false if a separate EOT write is needed.
*/
bool
brw_shader::mark_last_urb_write_with_eot()
{
brw_foreach_in_list_reverse(brw_inst, prev, &this->instructions) {
if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
prev->eot = true;
/* Delete now dead instructions. */
brw_foreach_in_list_reverse_safe(brw_exec_node, dead, &this->instructions) {
if (dead == prev)
break;
dead->remove();
}
return true;
} else if (prev->is_control_flow() || prev->has_side_effects()) {
break;
}
}
return false;
}
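/* Round a count of 32-bit components up to a whole number of physical
* registers, returned in reg_unit(devinfo) granularity. For example,
* assuming reg_unit() is 1 on 32-byte-GRF platforms and 2 on 64-byte-GRF
* ones, 20 components round up to 3 units (3 GRFs) in the former case and
* to 4 units (2 GRFs) in the latter.
*/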
static unsigned
round_components_to_whole_registers(const intel_device_info *devinfo,
unsigned c)
{
return DIV_ROUND_UP(c, 8 * reg_unit(devinfo)) * reg_unit(devinfo);
}
void
brw_shader::assign_curb_setup()
{
unsigned uniform_push_length =
round_components_to_whole_registers(devinfo, prog_data->nr_params);
unsigned ubo_push_length = 0;
unsigned ubo_push_start[4];
for (int i = 0; i < 4; i++) {
ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
ubo_push_length += prog_data->ubo_ranges[i].length;
assert(ubo_push_start[i] % (8 * reg_unit(devinfo)) == 0);
assert(ubo_push_length % (1 * reg_unit(devinfo)) == 0);
}
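/* The push constant space is laid out as the uniform block first, then up
* to four UBO ranges back to back. ubo_push_start[] is recorded in 32-bit
* components (hence the factor of 8 per register unit), while the
* *_push_length values count register units.
*/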
prog_data->curb_read_length = uniform_push_length + ubo_push_length;
if (stage == MESA_SHADER_FRAGMENT &&
((struct brw_wm_prog_key *)key)->null_push_constant_tbimr_workaround)
prog_data->curb_read_length = MAX2(1, prog_data->curb_read_length);
uint64_t used = 0;
const bool pull_constants =
devinfo->verx10 >= 125 &&
(mesa_shader_stage_is_compute(stage) ||
gl_shader_stage_is_mesh(stage)) &&
uniform_push_length;
if (pull_constants) {
const bool pull_constants_a64 =
(gl_shader_stage_is_rt(stage) &&
brw_bs_prog_data(prog_data)->uses_inline_push_addr) ||
((mesa_shader_stage_is_compute(stage) ||
gl_shader_stage_is_mesh(stage)) &&
brw_cs_prog_data(prog_data)->uses_inline_push_addr);
assert(devinfo->has_lsc);
brw_builder ubld = brw_builder(this, 1).exec_all().at_start(cfg->first_block());
brw_reg base_addr;
if (pull_constants_a64) {
/* The address of the push constants is at offset 0 in the inline
* parameter.
*/
base_addr =
gl_shader_stage_is_rt(stage) ?
retype(bs_payload().inline_parameter, BRW_TYPE_UQ) :
retype(cs_payload().inline_parameter, BRW_TYPE_UQ);
} else {
/* The base offset for our push data is passed in as R0.0[31:6]. We
* have to mask off the bottom 6 bits.
*/
base_addr = ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
brw_imm_ud(INTEL_MASK(31, 6)));
}
/* On Gfx12-HP we load constants at the start of the program using A32
* stateless messages.
*/
for (unsigned i = 0; i < uniform_push_length;) {
/* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
unsigned num_regs = MIN2(uniform_push_length - i, 8);
assert(num_regs > 0);
num_regs = 1 << util_logbase2(num_regs);
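/* util_logbase2 rounds num_regs down to a power of two (1, 2, 4 or 8),
* presumably so each message is a load size the transposed LSC path
* handles cleanly; any remainder is covered by subsequent iterations of
* this loop.
*/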
brw_reg addr;
if (i != 0) {
if (pull_constants_a64) {
/* We need to do the carry manually because, by the time this pass runs,
* no 64-bit ALU instructions are expected; all the 64-bit lowering has
* already been done in NIR.
*/
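/* The 64-bit add is open-coded: add the low dwords, compare the result
* against the original low dword to detect wraparound, then predicate an
* increment of the high dword on that carry.
*/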
addr = ubld.vgrf(BRW_TYPE_UQ);
brw_reg addr_ldw = subscript(addr, BRW_TYPE_UD, 0);
brw_reg addr_udw = subscript(addr, BRW_TYPE_UD, 1);
brw_reg base_addr_ldw = subscript(base_addr, BRW_TYPE_UD, 0);
brw_reg base_addr_udw = subscript(base_addr, BRW_TYPE_UD, 1);
ubld.ADD(addr_ldw, base_addr_ldw, brw_imm_ud(i * REG_SIZE));
ubld.CMP(ubld.null_reg_d(), addr_ldw, base_addr_ldw, BRW_CONDITIONAL_L);
set_predicate(BRW_PREDICATE_NORMAL,
ubld.ADD(addr_udw, base_addr_udw, brw_imm_ud(1)));
set_predicate_inv(BRW_PREDICATE_NORMAL, true,
ubld.MOV(addr_udw, base_addr_udw));
} else {
addr = ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
}
} else {
addr = base_addr;
}
brw_reg srcs[4] = {
brw_imm_ud(0), /* desc */
brw_imm_ud(0), /* ex_desc */
addr, /* payload */
brw_reg(), /* payload2 */
};
brw_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0),
BRW_TYPE_UD);
brw_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);
send->sfid = BRW_SFID_UGM;
uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
LSC_ADDR_SURFTYPE_FLAT,
pull_constants_a64 ?
LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32,
LSC_DATA_SIZE_D32,
num_regs * 8 /* num_channels */,
true /* transpose */,
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
send->header_size = 0;
send->mlen = lsc_msg_addr_len(
devinfo, pull_constants_a64 ?
LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32, 1);
send->size_written =
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
assert((payload().num_regs + i + send->size_written / REG_SIZE) <=
(payload().num_regs + prog_data->curb_read_length));
send->send_is_volatile = true;
send->src[0] = brw_imm_ud(desc |
brw_message_desc(devinfo,
send->mlen,
send->size_written / REG_SIZE,
send->header_size));
i += num_regs;
}
invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);
}
/* Map the offsets in the UNIFORM file to fixed HW regs. */
foreach_block_and_inst(block, brw_inst, inst, cfg) {
for (unsigned int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == UNIFORM) {
int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
int constant_nr;
if (inst->src[i].nr >= UBO_START) {
/* constant_nr is in 32-bit units, the rest are in bytes */
constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
inst->src[i].offset / 4;
} else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
constant_nr = uniform_nr;
} else {
/* Section 5.11 of the OpenGL 4.1 spec says:
* "Out-of-bounds reads return undefined values, which include
* values from other variables of the active program or zero."
* Just return the first push constant.
*/
constant_nr = 0;
}
assert(constant_nr / 8 < 64);
used |= BITFIELD64_BIT(constant_nr / 8);
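/* Illustrative mapping (example values, not from the original source): with
 * payload().num_regs == 2, a push constant at constant_nr == 13 lands in
 * GRF g3 (2 + 13 / 8) at dword subregister 5 (13 % 8), i.e. g3.5<0,1,0>.
 */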
struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs +
constant_nr / 8,
constant_nr % 8);
brw_reg.abs = inst->src[i].abs;
brw_reg.negate = inst->src[i].negate;
/* The combination of is_scalar for load_uniform, copy prop, and
* lower_btd_logical_send can generate a MOV from a UNIFORM with
* exec size 2 and stride of 1.
*/
assert(inst->src[i].stride == 0 || inst->exec_size == 2);
inst->src[i] = byte_offset(
retype(brw_reg, inst->src[i].type),
inst->src[i].offset % 4);
}
}
}
uint64_t want_zero = used & prog_data->zero_push_reg;
if (want_zero) {
brw_builder ubld = brw_builder(this, 8).exec_all().at_start(cfg->first_block());
/* push_reg_mask_param is in 32-bit units */
unsigned mask_param = prog_data->push_reg_mask_param;
struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
mask_param % 8);
brw_reg b32;
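/* Sketch of what the loop below does: each 16-bit word of the push register
 * mask is expanded into 16 dword lanes of all-ones/all-zeros in b32.  The
 * two SHLs (the first using the vector immediate 0x01234567) place mask bit
 * j in the sign bit of word lane j, and the ASR by 15 then smears that sign
 * bit across the whole lane.  component(b32, i % 16) is ANDed with each
 * pushed GRF, so registers whose mask bit is clear read back as zero.
 */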
for (unsigned i = 0; i < 64; i++) {
if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
brw_reg shifted = ubld.vgrf(BRW_TYPE_W, 2);
ubld.SHL(horiz_offset(shifted, 8),
byte_offset(retype(mask, BRW_TYPE_W), i / 8),
brw_imm_v(0x01234567));
ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
brw_builder ubld16 = ubld.group(16, 0);
b32 = ubld16.vgrf(BRW_TYPE_D);
ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
}
if (want_zero & BITFIELD64_BIT(i)) {
assert(i < prog_data->curb_read_length);
struct brw_reg push_reg =
retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D);
ubld.AND(push_reg, push_reg, component(b32, i % 16));
}
}
invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);
}
/* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
}
/*
 * Build up an array of indices into the urb_setup array that references
 * its active entries.  Used to accelerate walking the active entries of
 * urb_setup on each upload.
 */
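/* Illustrative example: if urb_setup[] marks only VARYING_SLOT_COL0 and
 * VARYING_SLOT_TEX0 as active, urb_setup_attribs becomes { COL0, TEX0 } and
 * urb_setup_attribs_count is 2.
 */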
void
brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
{
/* TODO(mesh): Review usage of this in the context of Mesh; we may want to
 * skip per-primitive attributes here.
 */
/* Make sure uint8_t is sufficient */
STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
uint8_t index = 0;
for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
if (wm_prog_data->urb_setup[attr] >= 0) {
wm_prog_data->urb_setup_attribs[index++] = attr;
}
}
wm_prog_data->urb_setup_attribs_count = index;
}
void
brw_shader::convert_attr_sources_to_hw_regs(brw_inst *inst)
{
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == ATTR) {
assert(inst->src[i].nr == 0);
int grf = payload().num_regs +
prog_data->curb_read_length +
inst->src[i].offset / REG_SIZE;
/* As explained at brw_lower_vgrf_to_fixed_grf, from the Haswell PRM:
*
* VertStride must be used to cross GRF register boundaries. This
* rule implies that elements within a 'Width' cannot cross GRF
* boundaries.
*
* So, for registers that are large enough, we have to split the exec
* size in two and trust the compression state to sort it out.
*/
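/* Worked example (illustrative): a SIMD16 float source with stride 1 covers
 * 16 * 1 * 4 = 64 bytes, i.e. two 32-byte GRFs, so exec_size is halved to 8
 * and the region built below becomes <8;8,1>.
 */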
unsigned total_size = inst->exec_size *
inst->src[i].stride *
brw_type_size_bytes(inst->src[i].type);
assert(total_size <= 2 * REG_SIZE);
const unsigned exec_size =
(total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
struct brw_reg reg =
stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
inst->src[i].offset % REG_SIZE),
exec_size * inst->src[i].stride,
width, inst->src[i].stride);
reg.abs = inst->src[i].abs;
reg.negate = inst->src[i].negate;
inst->src[i] = reg;
}
}
}
int
brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
const brw_stage_prog_data *prog_data)
{
if (prog_data->nr_params == 0)
return -1;
if (devinfo->verx10 >= 125)
return -1;
/* The local thread id is always the last parameter in the list */
uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
return prog_data->nr_params - 1;
return -1;
}
uint32_t
brw_fb_write_msg_control(const brw_inst *inst,
const struct brw_wm_prog_data *prog_data)
{
uint32_t mctl;
if (prog_data->dual_src_blend) {
assert(inst->exec_size < 32);
if (inst->group % 16 == 0)
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
else if (inst->group % 16 == 8)
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
else
UNREACHABLE("Invalid dual-source FB write instruction group");
} else {
assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
if (inst->exec_size == 16)
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
else if (inst->exec_size == 8)
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
else if (inst->exec_size == 32)
mctl = XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE;
else
UNREACHABLE("Invalid FB write execution size");
}
return mctl;
}
void
brw_shader::invalidate_analysis(brw_analysis_dependency_class c)
{
live_analysis.invalidate(c);
regpressure_analysis.invalidate(c);
performance_analysis.invalidate(c);
idom_analysis.invalidate(c);
def_analysis.invalidate(c);
ip_ranges_analysis.invalidate(c);
}
void
brw_shader::debug_optimizer(const nir_shader *nir,
const char *pass_name,
int iteration, int pass_num) const
{
/* source_hash is not readily accessible in this context */
if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER, 0))
return;
char *filename;
int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
_mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
iteration, pass_num, pass_name);
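/* The generated name has the form
 * <INTEL_SHADER_OPTIMIZER_PATH>/<stage><width>-<shader name>-<iteration>-<pass_num>-<pass name>,
 * e.g. "FS16-main-95-02-non-lifo" (the shader name is illustrative).
 */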
if (ret == -1)
return;
FILE *file = stderr;
if (__normal_user()) {
file = fopen(filename, "w");
if (!file)
file = stderr;
}
brw_print_instructions(*this, file);
if (file != stderr)
fclose(file);
free(filename);
}
static uint32_t
brw_compute_max_register_pressure(brw_shader &s)
{
const brw_register_pressure &rp = s.regpressure_analysis.require();
uint32_t ip = 0, max_pressure = 0;
foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
ip++;
}
return max_pressure;
}
static brw_inst **
save_instruction_order(const struct cfg_t *cfg)
{
/* Before we schedule anything, stash off the instruction order as an array
* of brw_inst *. This way, we can reset it between scheduling passes to
* prevent dependencies between the different scheduling modes.
*/
int num_insts = cfg->total_instructions;
brw_inst **inst_arr = new brw_inst * [num_insts];
int ip = 0;
foreach_block_and_inst(block, brw_inst, inst, cfg) {
inst_arr[ip++] = inst;
}
assert(ip == num_insts);
return inst_arr;
}
static void
restore_instruction_order(struct cfg_t *cfg, brw_inst **inst_arr)
{
ASSERTED int num_insts = cfg->total_instructions;
int ip = 0;
foreach_block (block, cfg) {
block->instructions.make_empty();
for (unsigned i = 0; i < block->num_instructions; i++)
block->instructions.push_tail(inst_arr[ip++]);
}
assert(ip == num_insts);
}
/* Per-thread scratch space is a power-of-two multiple of 1KB. */
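/* e.g. (illustrative) sizes 1..1024 map to 1024, 1025 maps to 2048, and
 * 4096 maps to 4096.
 */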
static inline unsigned
brw_get_scratch_size(int size)
{
return MAX2(1024, util_next_power_of_two(size));
}
void
brw_allocate_registers(brw_shader &s, bool allow_spilling)
{
const struct intel_device_info *devinfo = s.devinfo;
const nir_shader *nir = s.nir;
bool allocated;
static const enum brw_instruction_scheduler_mode pre_modes[] = {
BRW_SCHEDULE_PRE,
BRW_SCHEDULE_PRE_NON_LIFO,
BRW_SCHEDULE_NONE,
BRW_SCHEDULE_PRE_LIFO,
};
static const char *scheduler_mode_name[] = {
[BRW_SCHEDULE_PRE] = "top-down",
[BRW_SCHEDULE_PRE_NON_LIFO] = "non-lifo",
[BRW_SCHEDULE_PRE_LIFO] = "lifo",
[BRW_SCHEDULE_POST] = "post",
[BRW_SCHEDULE_NONE] = "none",
};
uint32_t best_register_pressure = UINT32_MAX;
enum brw_instruction_scheduler_mode best_sched = BRW_SCHEDULE_NONE;
brw_opt_compact_virtual_grfs(s);
if (s.needs_register_pressure)
s.shader_stats.max_register_pressure = brw_compute_max_register_pressure(s);
s.debug_optimizer(nir, "pre_register_allocate", 90, 90);
bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
/* Before we schedule anything, stash off the instruction order as an array
* of brw_inst *. This way, we can reset it between scheduling passes to
* prevent dependencies between the different scheduling modes.
*/
brw_inst **orig_order = save_instruction_order(s.cfg);
brw_inst **best_pressure_order = NULL;
void *scheduler_ctx = ralloc_context(NULL);
brw_instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx);
/* Try each scheduling heuristic to see if it can successfully register
* allocate without spilling. They should be ordered by decreasing
* performance but increasing likelihood of allocating.
*/
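/* If every mode below ends up spilling, the schedule with the lowest
 * estimated register pressure (stashed in best_pressure_order) is the one
 * used for the final, spilling allocation.
 */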
for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
enum brw_instruction_scheduler_mode sched_mode = pre_modes[i];
brw_schedule_instructions_pre_ra(s, sched, sched_mode);
s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i);
if (0) {
brw_assign_regs_trivial(s);
allocated = true;
break;
}
/* We should only spill registers on the last scheduling. */
assert(!s.spilled_any_registers);
allocated = brw_assign_regs(s, false, spill_all);
if (allocated)
break;
      /* Save the maximum register pressure */
      uint32_t this_pressure = brw_compute_max_register_pressure(s);
      if (0) {
         fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
                 scheduler_mode_name[sched_mode], this_pressure);
      }

      if (this_pressure < best_register_pressure) {
         best_register_pressure = this_pressure;
         best_sched = sched_mode;
         delete[] best_pressure_order;
         best_pressure_order = save_instruction_order(s.cfg);
      }

      /* Reset back to the original order before trying the next mode */
      restore_instruction_order(s.cfg, orig_order);
      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);
   }

   ralloc_free(scheduler_ctx);
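   /* None of the scheduling modes allocated without spilling.  Fall back to
    * the mode that produced the lowest estimated register pressure and retry
    * allocation, this time allowing the allocator to spill (when spilling is
    * permitted for this shader).
    */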
   if (!allocated) {
      if (0) {
         fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
                 scheduler_mode_name[best_sched]);
      }
      restore_instruction_order(s.cfg, best_pressure_order);
      s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
      allocated = brw_assign_regs(s, allow_spilling, spill_all);
   }

   delete[] orig_order;
   delete[] best_pressure_order;
   if (!allocated) {
      s.fail("Failure to register allocate. Reduce number of "
             "live scalar values to avoid this.");
   } else if (s.spilled_any_registers) {
      brw_shader_perf_log(s.compiler, s.log_data,
                          "%s shader triggered register spilling. "
                          "Try reducing the number of live scalar "
                          "values to improve performance.\n",
                          _mesa_shader_stage_to_string(s.stage));
   }

   if (s.failed)
      return;
   int pass_num = 0;

   s.debug_optimizer(nir, "post_ra_alloc", 96, pass_num++);
   brw_opt_bank_conflicts(s);
s.debug_optimizer(nir, "bank_conflict", 96, pass_num++);
brw_schedule_instructions_post_ra(s);
s.debug_optimizer(nir, "post_ra_alloc_scheduling", 96, pass_num++);
/* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
* of part of assign_regs since both bank conflicts optimization and post
* RA scheduling take advantage of distinguishing references to registers
* that were allocated from references that were already fixed.
*
* TODO: Change the passes above, then move this lowering to be part of
* assign_regs.
*/
brw_lower_vgrfs_to_fixed_grfs(s);
s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, pass_num++);
if (s.devinfo->ver >= 30) {
brw_lower_send_gather(s);
s.debug_optimizer(nir, "lower_send_gather", 96, pass_num++);
}
   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_REGALLOC);

   if (s.last_scratch > 0) {
      /* We currently only support up to 2MB of scratch space. If we
       * need to support more eventually, the documentation suggests
       * that we could allocate a larger buffer, and partition it out
       * ourselves. We'd just have to undo the hardware's address
       * calculation by subtracting (FFTID * Per Thread Scratch Space)
       * and then add FFTID * (Larger Per Thread Scratch Space).
       *
       * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
       * Thread Group Tracking > Local Memory/Scratch Space.
       */
      if (s.last_scratch <= devinfo->max_scratch_size_per_thread) {
         /* Take the max of any previously compiled variant of the shader.
          * In the case of bindless shaders with return parts, this will
          * also take the max of all parts.
          */
         s.prog_data->total_scratch = MAX2(brw_get_scratch_size(s.last_scratch),
                                           s.prog_data->total_scratch);
      } else {
         s.fail("Scratch space required is larger than supported");
      }
   }

   if (s.failed)
      return;

   brw_lower_scoreboard(s);

   s.debug_optimizer(nir, "scoreboard", 96, pass_num++);
}
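/* Total amount of push constant data for a compute shader dispatch: one
 * shared cross-thread block plus a per-thread block replicated for each of
 * the given number of threads.  Both blocks are required to be a whole
 * number of registers, as checked by the asserts below.
 */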
unsigned
brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
                             unsigned threads)
{
   assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
   assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
   return cs_prog_data->push.per_thread.size * threads +
          cs_prog_data->push.cross_thread.size;
}
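/* Derive the dispatch parameters for a compute workgroup: the selected SIMD
 * width, the number of HW threads needed to cover the workgroup, and the
 * execution mask for the final, possibly partial, thread.
 *
 * Example (values chosen purely for illustration): a 7x5x3 local size gives
 * a group_size of 105; if SIMD16 is selected that is DIV_ROUND_UP(105, 16)
 * = 7 threads, and the last thread has 105 & 15 = 9 active lanes, so
 * right_mask is ~0u >> (32 - 9) = 0x1ff.  When the group size is a multiple
 * of the SIMD width, right_mask covers all lanes.
 */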
struct intel_cs_dispatch_info
brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
                         const struct brw_cs_prog_data *prog_data,
                         const unsigned *override_local_size)
{
   struct intel_cs_dispatch_info info = {};

   const unsigned *sizes =
      override_local_size ? override_local_size :
                            prog_data->local_size;

   const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
   assert(simd >= 0 && simd < 3);

   info.group_size = sizes[0] * sizes[1] * sizes[2];
   info.simd_size = 8u << simd;
   info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);

   const uint32_t remainder = info.group_size & (info.simd_size - 1);
   if (remainder > 0)
      info.right_mask = ~0u >> (32 - remainder);
   else
      info.right_mask = ~0u >> (32 - info.simd_size);

   return info;
}
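/* Advance the shader to the next compilation phase.  Phases must be entered
 * strictly in order, and the IR is re-validated at each transition.
 */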
void
brw_shader_phase_update(brw_shader &s, enum brw_shader_phase phase)
{
   assert(phase == s.phase + 1);
   s.phase = phase;
   brw_validate(s);
}
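/* Decide whether a shader should be dumped for the given debug flag: honor
 * the source-hash dump filter when one is set, and skip internal shaders
 * unless NIR_DEBUG asks for them to be printed.
 */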
bool
brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag,
                        uint32_t source_hash)
{
   if (intel_shader_dump_filter && intel_shader_dump_filter != source_hash) {
      return false;
   }

   return INTEL_DEBUG(debug_flag) &&
          (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
}
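/* Reserve a new VGRF number of the given size (in REG_SIZE units), growing
 * the size array geometrically (doubling, with a minimum capacity of 16)
 * when it is full.
 */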
static unsigned
brw_allocate_vgrf_number(brw_shader &s, unsigned size_in_REGSIZE_units)
{
   assert(size_in_REGSIZE_units > 0);

   if (s.alloc.capacity <= s.alloc.count) {
      unsigned new_cap = MAX2(16, s.alloc.capacity * 2);
      s.alloc.sizes = rerzalloc(s.mem_ctx, s.alloc.sizes, unsigned,
                                s.alloc.capacity, new_cap);
      s.alloc.capacity = new_cap;
   }

   s.alloc.sizes[s.alloc.count] = size_in_REGSIZE_units;
   return s.alloc.count++;
}
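/* Allocate a VGRF large enough for `count` values of the given type,
 * rounded up to whole physical registers.  For example (illustrative
 * numbers only): a request whose byte size fits in a single REG_SIZE unit
 * occupies one unit where reg_unit() is 1, but rounds up to two units on
 * platforms where a physical GRF spans two allocation units.
 */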
brw_reg
brw_allocate_vgrf(brw_shader &s, brw_reg_type type, unsigned count)
{
   const unsigned unit = reg_unit(s.devinfo);
   const unsigned size = DIV_ROUND_UP(count * brw_type_size_bytes(type),
                                      unit * REG_SIZE) * unit;
   return retype(brw_allocate_vgrf_units(s, size), type);
}
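/* Raw allocation in REG_SIZE units; the returned register defaults to UD
 * and can be retyped by the caller.
 */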
brw_reg
brw_allocate_vgrf_units(brw_shader &s, unsigned units_of_REGSIZE)
{
   return brw_vgrf(brw_allocate_vgrf_number(s, units_of_REGSIZE), BRW_TYPE_UD);
}