mesa/src/intel/compiler/brw_fs_nir.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

6963 lines
252 KiB
C++
Raw Normal View History

/*
* Copyright © 2010 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "compiler/glsl/ir.h"
#include "brw_fs.h"
#include "brw_nir.h"
#include "brw_rt.h"
intel/fs: Improve discard_if code generation Previously we would blindly emit an sequence like: mov(1) f0.1<1>UW g1.14<0,1,0>UW ... cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */ (+f0.1) cmp.z.f0.1(16) null<1>D g7<8,8,1>D 0D The first move sets the flags based on the initial execution mask. Later discard sequences contain a predicated compare that can only remove more SIMD channels. Often times the only user of the result from the first compare is the second compare. Instead, generate a sequence like mov(1) f0.1<1>UW g1.14<0,1,0>UW ... cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */ (+f0.1) cmp.ge.f0.1(8) null<1>F g5<8,8,1>F 0x41700000F /* 15F */ If the results stored in g7 and f0.0 are not used, the comparison will be eliminated. This removes an instruction and potentially reduces register pressure. v2: Major re-write of the commit message (including fixing the assembly code). Suggested by Matt. All Gen8+ platforms had similar results. (Ice Lake shown) total instructions in shared programs: 17224434 -> 17198659 (-0.15%) instructions in affected programs: 2908125 -> 2882350 (-0.89%) helped: 18891 HURT: 5 helped stats (abs) min: 1 max: 12 x̄: 1.38 x̃: 1 helped stats (rel) min: 0.03% max: 25.00% x̄: 1.76% x̃: 1.02% HURT stats (abs) min: 9 max: 105 x̄: 51.40 x̃: 35 HURT stats (rel) min: 0.43% max: 4.92% x̄: 2.34% x̃: 1.56% 95% mean confidence interval for instructions value: -1.39 -1.34 95% mean confidence interval for instructions %-change: -1.79% -1.73% Instructions are helped. total cycles in shared programs: 361468458 -> 361170679 (-0.08%) cycles in affected programs: 38470116 -> 38172337 (-0.77%) helped: 16202 HURT: 1456 helped stats (abs) min: 1 max: 4473 x̄: 26.24 x̃: 18 helped stats (rel) min: <.01% max: 28.44% x̄: 2.90% x̃: 2.18% HURT stats (abs) min: 1 max: 5982 x̄: 87.51 x̃: 28 HURT stats (rel) min: <.01% max: 51.29% x̄: 5.48% x̃: 1.64% 95% mean confidence interval for cycles value: -18.24 -15.49 95% mean confidence interval for cycles %-change: -2.26% -2.14% Cycles are helped. total spills in shared programs: 12147 -> 12176 (0.24%) spills in affected programs: 175 -> 204 (16.57%) helped: 8 HURT: 5 total fills in shared programs: 25262 -> 25292 (0.12%) fills in affected programs: 269 -> 299 (11.15%) helped: 8 HURT: 5 Haswell total instructions in shared programs: 13530316 -> 13502647 (-0.20%) instructions in affected programs: 2507824 -> 2480155 (-1.10%) helped: 18859 HURT: 10 helped stats (abs) min: 1 max: 12 x̄: 1.48 x̃: 1 helped stats (rel) min: 0.03% max: 27.78% x̄: 2.38% x̃: 1.41% HURT stats (abs) min: 5 max: 39 x̄: 25.70 x̃: 31 HURT stats (rel) min: 0.22% max: 1.66% x̄: 1.09% x̃: 1.31% 95% mean confidence interval for instructions value: -1.49 -1.44 95% mean confidence interval for instructions %-change: -2.42% -2.34% Instructions are helped. total cycles in shared programs: 377865412 -> 377639034 (-0.06%) cycles in affected programs: 40169572 -> 39943194 (-0.56%) helped: 15550 HURT: 1938 helped stats (abs) min: 1 max: 2482 x̄: 25.67 x̃: 18 helped stats (rel) min: <.01% max: 37.77% x̄: 3.00% x̃: 2.25% HURT stats (abs) min: 1 max: 4862 x̄: 89.17 x̃: 35 HURT stats (rel) min: <.01% max: 67.67% x̄: 6.16% x̃: 2.75% 95% mean confidence interval for cycles value: -14.42 -11.47 95% mean confidence interval for cycles %-change: -2.05% -1.91% Cycles are helped. total spills in shared programs: 26769 -> 26814 (0.17%) spills in affected programs: 826 -> 871 (5.45%) helped: 9 HURT: 10 total fills in shared programs: 38383 -> 38425 (0.11%) fills in affected programs: 834 -> 876 (5.04%) helped: 9 HURT: 10 LOST: 5 GAINED: 10 Ivy Bridge total instructions in shared programs: 12079250 -> 12044139 (-0.29%) instructions in affected programs: 2409680 -> 2374569 (-1.46%) helped: 16135 HURT: 0 helped stats (abs) min: 1 max: 23 x̄: 2.18 x̃: 2 helped stats (rel) min: 0.07% max: 37.50% x̄: 2.72% x̃: 1.68% 95% mean confidence interval for instructions value: -2.21 -2.14 95% mean confidence interval for instructions %-change: -2.76% -2.67% Instructions are helped. total cycles in shared programs: 180116747 -> 179900405 (-0.12%) cycles in affected programs: 25439823 -> 25223481 (-0.85%) helped: 13817 HURT: 1499 helped stats (abs) min: 1 max: 1886 x̄: 26.40 x̃: 18 helped stats (rel) min: <.01% max: 38.84% x̄: 2.57% x̃: 1.97% HURT stats (abs) min: 1 max: 3684 x̄: 98.99 x̃: 52 HURT stats (rel) min: <.01% max: 97.01% x̄: 6.37% x̃: 3.42% 95% mean confidence interval for cycles value: -15.68 -12.57 95% mean confidence interval for cycles %-change: -1.77% -1.63% Cycles are helped. LOST: 8 GAINED: 10 Sandy Bridge total instructions in shared programs: 10878990 -> 10863659 (-0.14%) instructions in affected programs: 1806702 -> 1791371 (-0.85%) helped: 13023 HURT: 0 helped stats (abs) min: 1 max: 5 x̄: 1.18 x̃: 1 helped stats (rel) min: 0.07% max: 13.79% x̄: 1.65% x̃: 1.10% 95% mean confidence interval for instructions value: -1.18 -1.17 95% mean confidence interval for instructions %-change: -1.68% -1.62% Instructions are helped. total cycles in shared programs: 154082878 -> 153862810 (-0.14%) cycles in affected programs: 20199374 -> 19979306 (-1.09%) helped: 12048 HURT: 510 helped stats (abs) min: 1 max: 323 x̄: 20.57 x̃: 18 helped stats (rel) min: 0.03% max: 17.78% x̄: 2.05% x̃: 1.52% HURT stats (abs) min: 1 max: 448 x̄: 54.39 x̃: 16 HURT stats (rel) min: 0.02% max: 37.98% x̄: 4.13% x̃: 1.17% 95% mean confidence interval for cycles value: -17.97 -17.08 95% mean confidence interval for cycles %-change: -1.84% -1.75% Cycles are helped. LOST: 1 GAINED: 0 Iron Lake total instructions in shared programs: 8155075 -> 8142729 (-0.15%) instructions in affected programs: 949495 -> 937149 (-1.30%) helped: 5810 HURT: 0 helped stats (abs) min: 1 max: 8 x̄: 2.12 x̃: 2 helped stats (rel) min: 0.10% max: 16.67% x̄: 2.53% x̃: 1.85% 95% mean confidence interval for instructions value: -2.14 -2.11 95% mean confidence interval for instructions %-change: -2.59% -2.48% Instructions are helped. total cycles in shared programs: 188584610 -> 188549632 (-0.02%) cycles in affected programs: 17274446 -> 17239468 (-0.20%) helped: 3881 HURT: 90 helped stats (abs) min: 2 max: 168 x̄: 9.08 x̃: 6 helped stats (rel) min: <.01% max: 23.53% x̄: 0.83% x̃: 0.30% HURT stats (abs) min: 2 max: 10 x̄: 2.80 x̃: 2 HURT stats (rel) min: <.01% max: 0.60% x̄: 0.10% x̃: 0.07% 95% mean confidence interval for cycles value: -9.35 -8.27 95% mean confidence interval for cycles %-change: -0.85% -0.77% Cycles are helped. GM45 total instructions in shared programs: 5019308 -> 5013119 (-0.12%) instructions in affected programs: 489028 -> 482839 (-1.27%) helped: 2912 HURT: 0 helped stats (abs) min: 1 max: 8 x̄: 2.13 x̃: 2 helped stats (rel) min: 0.10% max: 16.67% x̄: 2.46% x̃: 1.81% 95% mean confidence interval for instructions value: -2.14 -2.11 95% mean confidence interval for instructions %-change: -2.54% -2.39% Instructions are helped. total cycles in shared programs: 129002592 -> 128977804 (-0.02%) cycles in affected programs: 12669152 -> 12644364 (-0.20%) helped: 2759 HURT: 37 helped stats (abs) min: 2 max: 168 x̄: 9.03 x̃: 4 helped stats (rel) min: <.01% max: 21.43% x̄: 0.75% x̃: 0.31% HURT stats (abs) min: 2 max: 10 x̄: 3.62 x̃: 4 HURT stats (rel) min: <.01% max: 0.41% x̄: 0.10% x̃: 0.04% 95% mean confidence interval for cycles value: -9.53 -8.20 95% mean confidence interval for cycles %-change: -0.79% -0.70% Cycles are helped. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Matt Turner <mattst88@gmail.com>
2019-05-20 17:25:01 -07:00
#include "brw_eu.h"
#include "nir.h"
#include "nir_intrinsics.h"
intel/fs: Generate better code for fsign multiplied by a value v2: Rebase on v2 changes in previous two commits. v3: Rebase on 85c35885b38 ("nir: Rework nir_src_as_alu_instr to not take a pointer"). shader-db results: Skylake and Broadwell had similar results. (Skylake shown) total instructions in shared programs: 15297100 -> 15282141 (-0.10%) instructions in affected programs: 956685 -> 941726 (-1.56%) helped: 4527 HURT: 0 helped stats (abs) min: 1 max: 221 x̄: 3.30 x̃: 2 helped stats (rel) min: 0.07% max: 10.53% x̄: 1.85% x̃: 1.37% 95% mean confidence interval for instructions value: -3.48 -3.12 95% mean confidence interval for instructions %-change: -1.88% -1.81% Instructions are helped. total cycles in shared programs: 372809551 -> 372597886 (-0.06%) cycles in affected programs: 13645512 -> 13433847 (-1.55%) helped: 4362 HURT: 125 helped stats (abs) min: 1 max: 2088 x̄: 50.73 x̃: 28 helped stats (rel) min: 0.01% max: 28.20% x̄: 2.77% x̃: 2.39% HURT stats (abs) min: 1 max: 1836 x̄: 76.90 x̃: 28 HURT stats (rel) min: <.01% max: 34.36% x̄: 3.03% x̃: 1.42% 95% mean confidence interval for cycles value: -50.98 -43.37 95% mean confidence interval for cycles %-change: -2.67% -2.55% Cycles are helped. total spills in shared programs: 23465 -> 23463 (<.01%) spills in affected programs: 42 -> 40 (-4.76%) helped: 1 HURT: 0 total fills in shared programs: 31766 -> 31763 (<.01%) fills in affected programs: 69 -> 66 (-4.35%) helped: 1 HURT: 0 Haswell total instructions in shared programs: 13839992 -> 13828311 (-0.08%) instructions in affected programs: 712503 -> 700822 (-1.64%) helped: 3477 HURT: 0 helped stats (abs) min: 1 max: 221 x̄: 3.36 x̃: 2 helped stats (rel) min: 0.07% max: 10.64% x̄: 1.96% x̃: 1.52% 95% mean confidence interval for instructions value: -3.58 -3.14 95% mean confidence interval for instructions %-change: -2.01% -1.92% Instructions are helped. total cycles in shared programs: 387026330 -> 386872483 (-0.04%) cycles in affected programs: 11329966 -> 11176119 (-1.36%) helped: 3307 HURT: 139 helped stats (abs) min: 2 max: 1776 x̄: 49.58 x̃: 18 helped stats (rel) min: 0.01% max: 20.38% x̄: 2.27% x̃: 1.79% HURT stats (abs) min: 1 max: 2314 x̄: 72.68 x̃: 20 HURT stats (rel) min: <.01% max: 33.99% x̄: 2.28% x̃: 0.96% 95% mean confidence interval for cycles value: -49.31 -39.98 95% mean confidence interval for cycles %-change: -2.15% -2.01% Cycles are helped. LOST: 1 GAINED: 0 Ivy Bridge total instructions in shared programs: 12045602 -> 12038463 (-0.06%) instructions in affected programs: 623837 -> 616698 (-1.14%) helped: 2498 HURT: 0 helped stats (abs) min: 1 max: 39 x̄: 2.86 x̃: 2 helped stats (rel) min: 0.05% max: 10.00% x̄: 1.30% x̃: 1.05% 95% mean confidence interval for instructions value: -2.96 -2.75 95% mean confidence interval for instructions %-change: -1.34% -1.26% Instructions are helped. total cycles in shared programs: 181025675 -> 180891323 (-0.07%) cycles in affected programs: 11329329 -> 11194977 (-1.19%) helped: 2439 HURT: 47 helped stats (abs) min: 1 max: 1565 x̄: 57.06 x̃: 26 helped stats (rel) min: 0.02% max: 24.56% x̄: 2.02% x̃: 1.64% HURT stats (abs) min: 1 max: 1269 x̄: 102.51 x̃: 43 HURT stats (rel) min: 0.11% max: 52.94% x̄: 4.15% x̃: 1.34% 95% mean confidence interval for cycles value: -59.91 -48.17 95% mean confidence interval for cycles %-change: -1.99% -1.82% Cycles are helped. Sandy Bridge, Iron Lake, and GM45 had similar results. (Sandy Bridge shown) total instructions in shared programs: 10896368 -> 10896339 (<.01%) instructions in affected programs: 3767 -> 3738 (-0.77%) helped: 17 HURT: 0 helped stats (abs) min: 1 max: 4 x̄: 1.71 x̃: 1 helped stats (rel) min: 0.13% max: 9.52% x̄: 3.58% x̃: 2.73% 95% mean confidence interval for instructions value: -2.27 -1.14 95% mean confidence interval for instructions %-change: -5.14% -2.03% Instructions are helped. total cycles in shared programs: 155091109 -> 155091021 (<.01%) cycles in affected programs: 47241 -> 47153 (-0.19%) helped: 15 HURT: 8 helped stats (abs) min: 2 max: 81 x̄: 15.73 x̃: 4 helped stats (rel) min: 0.03% max: 10.59% x̄: 1.55% x̃: 0.71% HURT stats (abs) min: 14 max: 32 x̄: 18.50 x̃: 17 HURT stats (rel) min: 0.32% max: 2.79% x̄: 2.43% x̃: 2.71% 95% mean confidence interval for cycles value: -14.59 6.93 95% mean confidence interval for cycles %-change: -1.41% 1.08% Inconclusive result (value mean confidence interval includes 0). Reviewed-by: Matt Turner <mattst88@gmail.com> [v2]
2018-06-25 19:55:31 -07:00
#include "nir_search_helpers.h"
#include "util/u_math.h"
#include "util/bitscan.h"
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
#include <vector>
using namespace brw;
void
fs_visitor::emit_nir_code()
{
emit_shader_float_controls_execution_mode();
/* emit the arrays used for inputs and outputs - load/store intrinsics will
* be converted to reads/writes of these arrays
*/
nir_setup_outputs();
nir_setup_uniforms();
nir_emit_system_values();
last_scratch = ALIGN(nir->scratch_size, 4) * dispatch_width;
nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));
bld.emit(SHADER_OPCODE_HALT_TARGET);
}
void
fs_visitor::nir_setup_outputs()
{
if (stage == MESA_SHADER_TESS_CTRL ||
stage == MESA_SHADER_TASK ||
stage == MESA_SHADER_MESH ||
stage == MESA_SHADER_FRAGMENT)
return;
unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
/* Calculate the size of output registers in a separate pass, before
* allocating them. With ARB_enhanced_layouts, multiple output variables
* may occupy the same slot, but have different type sizes.
*/
nir_foreach_shader_out_variable(var, nir) {
const int loc = var->data.driver_location;
const unsigned var_vec4s = nir_variable_count_slots(var, var->type);
vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
}
for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
if (vec4s[loc] == 0) {
loc++;
continue;
}
unsigned reg_size = vec4s[loc];
/* Check if there are any ranges that start within this range and extend
* past it. If so, include them in this allocation.
*/
for (unsigned i = 1; i < reg_size; i++) {
assert(i + loc < ARRAY_SIZE(vec4s));
reg_size = MAX2(vec4s[i + loc] + i, reg_size);
}
fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size);
for (unsigned i = 0; i < reg_size; i++) {
assert(loc + i < ARRAY_SIZE(outputs));
outputs[loc + i] = offset(reg, bld, 4 * i);
}
loc += reg_size;
}
}
void
fs_visitor::nir_setup_uniforms()
{
/* Only the first compile gets to set up uniforms. */
if (push_constant_loc)
return;
uniforms = nir->num_uniforms / 4;
if (gl_shader_stage_is_compute(stage) && devinfo->verx10 < 125) {
/* Add uniforms for builtins after regular NIR uniforms. */
assert(uniforms == prog_data->nr_params);
/* Subgroup ID must be the last uniform on the list. This will make
* easier later to split between cross thread and per thread
* uniforms.
*/
uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
*param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
uniforms++;
}
}
static bool
emit_system_values_block(nir_block *block, fs_visitor *v)
{
fs_reg *reg;
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_load_vertex_id:
case nir_intrinsic_load_base_vertex:
unreachable("should be lowered by nir_lower_system_values().");
case nir_intrinsic_load_vertex_id_zero_base:
case nir_intrinsic_load_is_indexed_draw:
case nir_intrinsic_load_first_vertex:
case nir_intrinsic_load_instance_id:
case nir_intrinsic_load_base_instance:
unreachable("should be lowered by brw_nir_lower_vs_inputs().");
break;
case nir_intrinsic_load_draw_id:
/* For Task/Mesh, draw_id will be handled later in
* nir_emit_mesh_task_intrinsic().
*/
if (!gl_shader_stage_is_mesh(v->stage))
unreachable("should be lowered by brw_nir_lower_vs_inputs().");
break;
case nir_intrinsic_load_invocation_id:
if (v->stage == MESA_SHADER_TESS_CTRL)
break;
assert(v->stage == MESA_SHADER_GEOMETRY);
reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
if (reg->file == BAD_FILE) {
const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
abld.SHR(iid, g1, brw_imm_ud(27u));
*reg = iid;
}
break;
case nir_intrinsic_load_sample_pos:
case nir_intrinsic_load_sample_pos_or_center:
assert(v->stage == MESA_SHADER_FRAGMENT);
reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
if (reg->file == BAD_FILE)
*reg = v->emit_samplepos_setup();
break;
case nir_intrinsic_load_sample_id:
assert(v->stage == MESA_SHADER_FRAGMENT);
reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
if (reg->file == BAD_FILE)
*reg = v->emit_sampleid_setup();
break;
case nir_intrinsic_load_sample_mask_in:
assert(v->stage == MESA_SHADER_FRAGMENT);
assert(v->devinfo->ver >= 7);
reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
if (reg->file == BAD_FILE)
*reg = v->emit_samplemaskin_setup();
break;
case nir_intrinsic_load_workgroup_id:
case nir_intrinsic_load_workgroup_id_zero_base:
if (gl_shader_stage_is_mesh(v->stage))
unreachable("should be lowered by nir_lower_compute_system_values().");
assert(gl_shader_stage_is_compute(v->stage));
reg = &v->nir_system_values[SYSTEM_VALUE_WORKGROUP_ID];
if (reg->file == BAD_FILE)
*reg = v->emit_work_group_id_setup();
break;
case nir_intrinsic_load_helper_invocation:
assert(v->stage == MESA_SHADER_FRAGMENT);
reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
if (reg->file == BAD_FILE) {
const fs_builder abld =
v->bld.annotate("gl_HelperInvocation", NULL);
/* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the
* pixel mask is in g1.7 of the thread payload.
*
* We move the per-channel pixel enable bit to the low bit of each
* channel by shifting the byte containing the pixel mask by the
* vector immediate 0x76543210UV.
*
* The region of <1,8,0> reads only 1 byte (the pixel masks for
* subspans 0 and 1) in SIMD8 and an additional byte (the pixel
* masks for 2 and 3) in SIMD16.
*/
fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) {
const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i);
hbld.SHR(offset(shifted, hbld, i),
stride(retype(brw_vec1_grf(1 + i, 7),
BRW_REGISTER_TYPE_UB),
1, 8, 0),
brw_imm_v(0x76543210));
}
/* A set bit in the pixel mask means the channel is enabled, but
* that is the opposite of gl_HelperInvocation so we need to invert
* the mask.
*
* The negate source-modifier bit of logical instructions on Gfx8+
* performs 1's complement negation, so we can use that instead of
* a NOT instruction.
*/
fs_reg inverted = negate(shifted);
if (v->devinfo->ver < 8) {
inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
abld.NOT(inverted, shifted);
}
/* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
* with 1 and negating.
*/
fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
abld.AND(anded, inverted, brw_imm_uw(1));
fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
*reg = dst;
}
break;
case nir_intrinsic_load_frag_shading_rate:
reg = &v->nir_system_values[SYSTEM_VALUE_FRAG_SHADING_RATE];
if (reg->file == BAD_FILE)
*reg = v->emit_shading_rate_setup();
break;
default:
break;
}
}
return true;
}
void
fs_visitor::nir_emit_system_values()
{
nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
nir_system_values[i] = fs_reg();
}
/* Always emit SUBGROUP_INVOCATION. Dead code will clean it up if we
* never end up using it.
*/
{
const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
reg = abld.vgrf(BRW_REGISTER_TYPE_UW);
abld.UNDEF(reg);
const fs_builder allbld8 = abld.group(8, 0).exec_all();
allbld8.MOV(reg, brw_imm_v(0x76543210));
if (dispatch_width > 8)
allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
if (dispatch_width > 16) {
const fs_builder allbld16 = abld.group(16, 0).exec_all();
allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
}
}
nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)nir);
nir_foreach_block(block, impl)
emit_system_values_block(block, this);
}
void
fs_visitor::nir_emit_impl(nir_function_impl *impl)
{
nir_ssa_values = rzalloc_array(mem_ctx, fs_reg, impl->ssa_alloc);
nir_resource_insts = rzalloc_array(mem_ctx, fs_inst *, impl->ssa_alloc);
nir_ssa_bind_infos = rzalloc_array(mem_ctx, struct brw_fs_bind_info, impl->ssa_alloc);
nir_resource_values = rzalloc_array(mem_ctx, fs_reg, impl->ssa_alloc);
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
nir_emit_cf_list(&impl->body);
}
void
fs_visitor::nir_emit_cf_list(exec_list *list)
{
exec_list_validate(list);
foreach_list_typed(nir_cf_node, node, node, list) {
switch (node->type) {
case nir_cf_node_if:
nir_emit_if(nir_cf_node_as_if(node));
break;
case nir_cf_node_loop:
nir_emit_loop(nir_cf_node_as_loop(node));
break;
case nir_cf_node_block:
nir_emit_block(nir_cf_node_as_block(node));
break;
default:
unreachable("Invalid CFG node block");
}
}
}
void
fs_visitor::nir_emit_if(nir_if *if_stmt)
{
intel/fs: Generate if instructions with inverted conditions Per-platform results were all over the place, so I have included all the results here. There is an important note at the bottom of the commit message. Skylake total instructions in shared programs: 15184683 -> 15184679 (<.01%) instructions in affected programs: 2786 -> 2782 (-0.14%) helped: 4 HURT: 0 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.05% max: 0.84% x̄: 0.44% x̃: 0.44% 95% mean confidence interval for instructions value: -1.00 -1.00 95% mean confidence interval for instructions %-change: -0.96% 0.07% Inconclusive result (%-change mean confidence interval includes 0). total cycles in shared programs: 370961367 -> 370961173 (<.01%) cycles in affected programs: 205867 -> 205673 (-0.09%) helped: 5 HURT: 1 helped stats (abs) min: 1 max: 149 x̄: 39.60 x̃: 16 helped stats (rel) min: 0.02% max: 1.05% x̄: 0.45% x̃: 0.55% HURT stats (abs) min: 4 max: 4 x̄: 4.00 x̃: 4 HURT stats (rel) min: 0.03% max: 0.03% x̄: 0.03% x̃: 0.03% 95% mean confidence interval for cycles value: -93.01 28.34 95% mean confidence interval for cycles %-change: -0.82% 0.08% Inconclusive result (value mean confidence interval includes 0). Broadwell total instructions in shared programs: 15465366 -> 15465362 (<.01%) instructions in affected programs: 2799 -> 2795 (-0.14%) helped: 4 HURT: 0 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.04% max: 0.84% x̄: 0.44% x̃: 0.44% 95% mean confidence interval for instructions value: -1.00 -1.00 95% mean confidence interval for instructions %-change: -0.96% 0.07% Inconclusive result (%-change mean confidence interval includes 0). total cycles in shared programs: 410938419 -> 410938531 (<.01%) cycles in affected programs: 566028 -> 566140 (0.02%) helped: 18 HURT: 17 helped stats (abs) min: 1 max: 16 x̄: 3.50 x̃: 1 helped stats (rel) min: <.01% max: 1.05% x̄: 0.13% x̃: <.01% HURT stats (abs) min: 1 max: 12 x̄: 10.29 x̃: 12 HURT stats (rel) min: <.01% max: 0.16% x̄: 0.08% x̃: 0.09% 95% mean confidence interval for cycles value: 0.31 6.09 95% mean confidence interval for cycles %-change: -0.10% 0.05% Inconclusive result (%-change mean confidence interval includes 0). Haswell total instructions in shared programs: 13749760 -> 13749759 (<.01%) instructions in affected programs: 2241 -> 2240 (-0.04%) helped: 1 HURT: 0 total cycles in shared programs: 385398913 -> 385398363 (<.01%) cycles in affected programs: 554914 -> 554364 (-0.10%) helped: 31 HURT: 1 helped stats (abs) min: 1 max: 453 x̄: 18.00 x̃: 6 helped stats (rel) min: <.01% max: 0.25% x̄: 0.03% x̃: 0.05% HURT stats (abs) min: 8 max: 8 x̄: 8.00 x̃: 8 HURT stats (rel) min: 0.06% max: 0.06% x̄: 0.06% x̃: 0.06% 95% mean confidence interval for cycles value: -45.88 11.51 95% mean confidence interval for cycles %-change: -0.05% -0.02% Inconclusive result (value mean confidence interval includes 0). Ivy Bridge total cycles in shared programs: 180663626 -> 180663881 (<.01%) cycles in affected programs: 472350 -> 472605 (0.05%) helped: 15 HURT: 30 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: <.01% max: <.01% x̄: <.01% x̃: <.01% HURT stats (abs) min: 8 max: 10 x̄: 9.00 x̃: 9 HURT stats (rel) min: 0.06% max: 0.14% x̄: 0.10% x̃: 0.10% 95% mean confidence interval for cycles value: 4.21 7.12 95% mean confidence interval for cycles %-change: 0.05% 0.08% Cycles are HURT. Sandy Bridge total cycles in shared programs: 154568664 -> 154569225 (<.01%) cycles in affected programs: 356486 -> 357047 (0.16%) helped: 1 HURT: 31 helped stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 helped stats (rel) min: 0.02% max: 0.02% x̄: 0.02% x̃: 0.02% HURT stats (abs) min: 4 max: 33 x̄: 18.16 x̃: 8 HURT stats (rel) min: 0.05% max: 0.23% x̄: 0.14% x̃: 0.10% 95% mean confidence interval for cycles value: 12.19 22.87 95% mean confidence interval for cycles %-change: 0.10% 0.16% Cycles are HURT. Iron Lake total instructions in shared programs: 8206589 -> 8206565 (<.01%) instructions in affected programs: 3024 -> 3000 (-0.79%) helped: 12 HURT: 0 helped stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 helped stats (rel) min: 0.75% max: 0.83% x̄: 0.80% x̃: 0.80% 95% mean confidence interval for instructions value: -2.00 -2.00 95% mean confidence interval for instructions %-change: -0.82% -0.77% Instructions are helped. total cycles in shared programs: 187657428 -> 187656228 (<.01%) cycles in affected programs: 95748 -> 94548 (-1.25%) helped: 12 HURT: 0 helped stats (abs) min: 80 max: 120 x̄: 100.00 x̃: 100 helped stats (rel) min: 1.00% max: 1.66% x̄: 1.27% x̃: 1.21% 95% mean confidence interval for cycles value: -113.27 -86.73 95% mean confidence interval for cycles %-change: -1.43% -1.11% Cycles are helped. GM45 total instructions in shared programs: 5037569 -> 5037557 (<.01%) instructions in affected programs: 1521 -> 1509 (-0.79%) helped: 6 HURT: 0 helped stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 helped stats (rel) min: 0.75% max: 0.83% x̄: 0.79% x̃: 0.79% 95% mean confidence interval for instructions value: -2.00 -2.00 95% mean confidence interval for instructions %-change: -0.83% -0.75% Instructions are helped. total cycles in shared programs: 128101478 -> 128100758 (<.01%) cycles in affected programs: 52746 -> 52026 (-1.37%) helped: 6 HURT: 0 helped stats (abs) min: 120 max: 120 x̄: 120.00 x̃: 120 helped stats (rel) min: 1.16% max: 1.66% x̄: 1.41% x̃: 1.41% 95% mean confidence interval for cycles value: -120.00 -120.00 95% mean confidence interval for cycles %-change: -1.70% -1.12% Cycles are helped. This change has almost no effect right now. However, removing this patch (but leaving the patch "nir/algebraic: Replace a bcsel of a b2f with a b2f(!(a || b))") after adding a patch that removes !(a < b) -> (a >= b) optimizations (like https://patchwork.freedesktop.org/patch/264787/) has the following results on Skylake: Skylake total instructions in shared programs: 15071022 -> 15089710 (0.12%) instructions in affected programs: 1022219 -> 1040907 (1.83%) helped: 1 HURT: 3937 helped stats (abs) min: 41 max: 41 x̄: 41.00 x̃: 41 helped stats (rel) min: 1.01% max: 1.01% x̄: 1.01% x̃: 1.01% HURT stats (abs) min: 1 max: 256 x̄: 4.76 x̃: 4 HURT stats (rel) min: 0.05% max: 11.18% x̄: 2.59% x̃: 2.60% 95% mean confidence interval for instructions value: 4.56 4.93 95% mean confidence interval for instructions %-change: 2.54% 2.64% Instructions are HURT. total cycles in shared programs: 369777134 -> 370092923 (0.09%) cycles in affected programs: 17516573 -> 17832362 (1.80%) helped: 115 HURT: 3624 helped stats (abs) min: 1 max: 1721 x̄: 81.18 x̃: 28 helped stats (rel) min: <.01% max: 10.74% x̄: 1.24% x̃: 0.65% HURT stats (abs) min: 1 max: 12640 x̄: 89.71 x̃: 54 HURT stats (rel) min: <.01% max: 28.24% x̄: 4.72% x̃: 4.52% 95% mean confidence interval for cycles value: 75.21 93.71 95% mean confidence interval for cycles %-change: 4.43% 4.64% Cycles are HURT. total spills in shared programs: 9450 -> 9442 (-0.08%) spills in affected programs: 166 -> 158 (-4.82%) helped: 2 HURT: 0 total fills in shared programs: 21115 -> 21094 (-0.10%) fills in affected programs: 438 -> 417 (-4.79%) helped: 2 HURT: 0 LOST: 1 GAINED: 0 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-12-03 12:06:50 -08:00
bool invert;
fs_reg cond_reg;
/* If the condition has the form !other_condition, use other_condition as
* the source, but invert the predicate on the if instruction.
*/
nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
intel/fs: Generate if instructions with inverted conditions Per-platform results were all over the place, so I have included all the results here. There is an important note at the bottom of the commit message. Skylake total instructions in shared programs: 15184683 -> 15184679 (<.01%) instructions in affected programs: 2786 -> 2782 (-0.14%) helped: 4 HURT: 0 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.05% max: 0.84% x̄: 0.44% x̃: 0.44% 95% mean confidence interval for instructions value: -1.00 -1.00 95% mean confidence interval for instructions %-change: -0.96% 0.07% Inconclusive result (%-change mean confidence interval includes 0). total cycles in shared programs: 370961367 -> 370961173 (<.01%) cycles in affected programs: 205867 -> 205673 (-0.09%) helped: 5 HURT: 1 helped stats (abs) min: 1 max: 149 x̄: 39.60 x̃: 16 helped stats (rel) min: 0.02% max: 1.05% x̄: 0.45% x̃: 0.55% HURT stats (abs) min: 4 max: 4 x̄: 4.00 x̃: 4 HURT stats (rel) min: 0.03% max: 0.03% x̄: 0.03% x̃: 0.03% 95% mean confidence interval for cycles value: -93.01 28.34 95% mean confidence interval for cycles %-change: -0.82% 0.08% Inconclusive result (value mean confidence interval includes 0). Broadwell total instructions in shared programs: 15465366 -> 15465362 (<.01%) instructions in affected programs: 2799 -> 2795 (-0.14%) helped: 4 HURT: 0 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.04% max: 0.84% x̄: 0.44% x̃: 0.44% 95% mean confidence interval for instructions value: -1.00 -1.00 95% mean confidence interval for instructions %-change: -0.96% 0.07% Inconclusive result (%-change mean confidence interval includes 0). total cycles in shared programs: 410938419 -> 410938531 (<.01%) cycles in affected programs: 566028 -> 566140 (0.02%) helped: 18 HURT: 17 helped stats (abs) min: 1 max: 16 x̄: 3.50 x̃: 1 helped stats (rel) min: <.01% max: 1.05% x̄: 0.13% x̃: <.01% HURT stats (abs) min: 1 max: 12 x̄: 10.29 x̃: 12 HURT stats (rel) min: <.01% max: 0.16% x̄: 0.08% x̃: 0.09% 95% mean confidence interval for cycles value: 0.31 6.09 95% mean confidence interval for cycles %-change: -0.10% 0.05% Inconclusive result (%-change mean confidence interval includes 0). Haswell total instructions in shared programs: 13749760 -> 13749759 (<.01%) instructions in affected programs: 2241 -> 2240 (-0.04%) helped: 1 HURT: 0 total cycles in shared programs: 385398913 -> 385398363 (<.01%) cycles in affected programs: 554914 -> 554364 (-0.10%) helped: 31 HURT: 1 helped stats (abs) min: 1 max: 453 x̄: 18.00 x̃: 6 helped stats (rel) min: <.01% max: 0.25% x̄: 0.03% x̃: 0.05% HURT stats (abs) min: 8 max: 8 x̄: 8.00 x̃: 8 HURT stats (rel) min: 0.06% max: 0.06% x̄: 0.06% x̃: 0.06% 95% mean confidence interval for cycles value: -45.88 11.51 95% mean confidence interval for cycles %-change: -0.05% -0.02% Inconclusive result (value mean confidence interval includes 0). Ivy Bridge total cycles in shared programs: 180663626 -> 180663881 (<.01%) cycles in affected programs: 472350 -> 472605 (0.05%) helped: 15 HURT: 30 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: <.01% max: <.01% x̄: <.01% x̃: <.01% HURT stats (abs) min: 8 max: 10 x̄: 9.00 x̃: 9 HURT stats (rel) min: 0.06% max: 0.14% x̄: 0.10% x̃: 0.10% 95% mean confidence interval for cycles value: 4.21 7.12 95% mean confidence interval for cycles %-change: 0.05% 0.08% Cycles are HURT. Sandy Bridge total cycles in shared programs: 154568664 -> 154569225 (<.01%) cycles in affected programs: 356486 -> 357047 (0.16%) helped: 1 HURT: 31 helped stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 helped stats (rel) min: 0.02% max: 0.02% x̄: 0.02% x̃: 0.02% HURT stats (abs) min: 4 max: 33 x̄: 18.16 x̃: 8 HURT stats (rel) min: 0.05% max: 0.23% x̄: 0.14% x̃: 0.10% 95% mean confidence interval for cycles value: 12.19 22.87 95% mean confidence interval for cycles %-change: 0.10% 0.16% Cycles are HURT. Iron Lake total instructions in shared programs: 8206589 -> 8206565 (<.01%) instructions in affected programs: 3024 -> 3000 (-0.79%) helped: 12 HURT: 0 helped stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 helped stats (rel) min: 0.75% max: 0.83% x̄: 0.80% x̃: 0.80% 95% mean confidence interval for instructions value: -2.00 -2.00 95% mean confidence interval for instructions %-change: -0.82% -0.77% Instructions are helped. total cycles in shared programs: 187657428 -> 187656228 (<.01%) cycles in affected programs: 95748 -> 94548 (-1.25%) helped: 12 HURT: 0 helped stats (abs) min: 80 max: 120 x̄: 100.00 x̃: 100 helped stats (rel) min: 1.00% max: 1.66% x̄: 1.27% x̃: 1.21% 95% mean confidence interval for cycles value: -113.27 -86.73 95% mean confidence interval for cycles %-change: -1.43% -1.11% Cycles are helped. GM45 total instructions in shared programs: 5037569 -> 5037557 (<.01%) instructions in affected programs: 1521 -> 1509 (-0.79%) helped: 6 HURT: 0 helped stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 helped stats (rel) min: 0.75% max: 0.83% x̄: 0.79% x̃: 0.79% 95% mean confidence interval for instructions value: -2.00 -2.00 95% mean confidence interval for instructions %-change: -0.83% -0.75% Instructions are helped. total cycles in shared programs: 128101478 -> 128100758 (<.01%) cycles in affected programs: 52746 -> 52026 (-1.37%) helped: 6 HURT: 0 helped stats (abs) min: 120 max: 120 x̄: 120.00 x̃: 120 helped stats (rel) min: 1.16% max: 1.66% x̄: 1.41% x̃: 1.41% 95% mean confidence interval for cycles value: -120.00 -120.00 95% mean confidence interval for cycles %-change: -1.70% -1.12% Cycles are helped. This change has almost no effect right now. However, removing this patch (but leaving the patch "nir/algebraic: Replace a bcsel of a b2f with a b2f(!(a || b))") after adding a patch that removes !(a < b) -> (a >= b) optimizations (like https://patchwork.freedesktop.org/patch/264787/) has the following results on Skylake: Skylake total instructions in shared programs: 15071022 -> 15089710 (0.12%) instructions in affected programs: 1022219 -> 1040907 (1.83%) helped: 1 HURT: 3937 helped stats (abs) min: 41 max: 41 x̄: 41.00 x̃: 41 helped stats (rel) min: 1.01% max: 1.01% x̄: 1.01% x̃: 1.01% HURT stats (abs) min: 1 max: 256 x̄: 4.76 x̃: 4 HURT stats (rel) min: 0.05% max: 11.18% x̄: 2.59% x̃: 2.60% 95% mean confidence interval for instructions value: 4.56 4.93 95% mean confidence interval for instructions %-change: 2.54% 2.64% Instructions are HURT. total cycles in shared programs: 369777134 -> 370092923 (0.09%) cycles in affected programs: 17516573 -> 17832362 (1.80%) helped: 115 HURT: 3624 helped stats (abs) min: 1 max: 1721 x̄: 81.18 x̃: 28 helped stats (rel) min: <.01% max: 10.74% x̄: 1.24% x̃: 0.65% HURT stats (abs) min: 1 max: 12640 x̄: 89.71 x̃: 54 HURT stats (rel) min: <.01% max: 28.24% x̄: 4.72% x̃: 4.52% 95% mean confidence interval for cycles value: 75.21 93.71 95% mean confidence interval for cycles %-change: 4.43% 4.64% Cycles are HURT. total spills in shared programs: 9450 -> 9442 (-0.08%) spills in affected programs: 166 -> 158 (-4.82%) helped: 2 HURT: 0 total fills in shared programs: 21115 -> 21094 (-0.10%) fills in affected programs: 438 -> 417 (-4.79%) helped: 2 HURT: 0 LOST: 1 GAINED: 0 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-12-03 12:06:50 -08:00
if (cond != NULL && cond->op == nir_op_inot) {
invert = true;
cond_reg = get_nir_src(cond->src[0].src);
cond_reg = offset(cond_reg, bld, cond->src[0].swizzle[0]);
intel/fs: Generate if instructions with inverted conditions Per-platform results were all over the place, so I have included all the results here. There is an important note at the bottom of the commit message. Skylake total instructions in shared programs: 15184683 -> 15184679 (<.01%) instructions in affected programs: 2786 -> 2782 (-0.14%) helped: 4 HURT: 0 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.05% max: 0.84% x̄: 0.44% x̃: 0.44% 95% mean confidence interval for instructions value: -1.00 -1.00 95% mean confidence interval for instructions %-change: -0.96% 0.07% Inconclusive result (%-change mean confidence interval includes 0). total cycles in shared programs: 370961367 -> 370961173 (<.01%) cycles in affected programs: 205867 -> 205673 (-0.09%) helped: 5 HURT: 1 helped stats (abs) min: 1 max: 149 x̄: 39.60 x̃: 16 helped stats (rel) min: 0.02% max: 1.05% x̄: 0.45% x̃: 0.55% HURT stats (abs) min: 4 max: 4 x̄: 4.00 x̃: 4 HURT stats (rel) min: 0.03% max: 0.03% x̄: 0.03% x̃: 0.03% 95% mean confidence interval for cycles value: -93.01 28.34 95% mean confidence interval for cycles %-change: -0.82% 0.08% Inconclusive result (value mean confidence interval includes 0). Broadwell total instructions in shared programs: 15465366 -> 15465362 (<.01%) instructions in affected programs: 2799 -> 2795 (-0.14%) helped: 4 HURT: 0 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.04% max: 0.84% x̄: 0.44% x̃: 0.44% 95% mean confidence interval for instructions value: -1.00 -1.00 95% mean confidence interval for instructions %-change: -0.96% 0.07% Inconclusive result (%-change mean confidence interval includes 0). total cycles in shared programs: 410938419 -> 410938531 (<.01%) cycles in affected programs: 566028 -> 566140 (0.02%) helped: 18 HURT: 17 helped stats (abs) min: 1 max: 16 x̄: 3.50 x̃: 1 helped stats (rel) min: <.01% max: 1.05% x̄: 0.13% x̃: <.01% HURT stats (abs) min: 1 max: 12 x̄: 10.29 x̃: 12 HURT stats (rel) min: <.01% max: 0.16% x̄: 0.08% x̃: 0.09% 95% mean confidence interval for cycles value: 0.31 6.09 95% mean confidence interval for cycles %-change: -0.10% 0.05% Inconclusive result (%-change mean confidence interval includes 0). Haswell total instructions in shared programs: 13749760 -> 13749759 (<.01%) instructions in affected programs: 2241 -> 2240 (-0.04%) helped: 1 HURT: 0 total cycles in shared programs: 385398913 -> 385398363 (<.01%) cycles in affected programs: 554914 -> 554364 (-0.10%) helped: 31 HURT: 1 helped stats (abs) min: 1 max: 453 x̄: 18.00 x̃: 6 helped stats (rel) min: <.01% max: 0.25% x̄: 0.03% x̃: 0.05% HURT stats (abs) min: 8 max: 8 x̄: 8.00 x̃: 8 HURT stats (rel) min: 0.06% max: 0.06% x̄: 0.06% x̃: 0.06% 95% mean confidence interval for cycles value: -45.88 11.51 95% mean confidence interval for cycles %-change: -0.05% -0.02% Inconclusive result (value mean confidence interval includes 0). Ivy Bridge total cycles in shared programs: 180663626 -> 180663881 (<.01%) cycles in affected programs: 472350 -> 472605 (0.05%) helped: 15 HURT: 30 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: <.01% max: <.01% x̄: <.01% x̃: <.01% HURT stats (abs) min: 8 max: 10 x̄: 9.00 x̃: 9 HURT stats (rel) min: 0.06% max: 0.14% x̄: 0.10% x̃: 0.10% 95% mean confidence interval for cycles value: 4.21 7.12 95% mean confidence interval for cycles %-change: 0.05% 0.08% Cycles are HURT. Sandy Bridge total cycles in shared programs: 154568664 -> 154569225 (<.01%) cycles in affected programs: 356486 -> 357047 (0.16%) helped: 1 HURT: 31 helped stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 helped stats (rel) min: 0.02% max: 0.02% x̄: 0.02% x̃: 0.02% HURT stats (abs) min: 4 max: 33 x̄: 18.16 x̃: 8 HURT stats (rel) min: 0.05% max: 0.23% x̄: 0.14% x̃: 0.10% 95% mean confidence interval for cycles value: 12.19 22.87 95% mean confidence interval for cycles %-change: 0.10% 0.16% Cycles are HURT. Iron Lake total instructions in shared programs: 8206589 -> 8206565 (<.01%) instructions in affected programs: 3024 -> 3000 (-0.79%) helped: 12 HURT: 0 helped stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 helped stats (rel) min: 0.75% max: 0.83% x̄: 0.80% x̃: 0.80% 95% mean confidence interval for instructions value: -2.00 -2.00 95% mean confidence interval for instructions %-change: -0.82% -0.77% Instructions are helped. total cycles in shared programs: 187657428 -> 187656228 (<.01%) cycles in affected programs: 95748 -> 94548 (-1.25%) helped: 12 HURT: 0 helped stats (abs) min: 80 max: 120 x̄: 100.00 x̃: 100 helped stats (rel) min: 1.00% max: 1.66% x̄: 1.27% x̃: 1.21% 95% mean confidence interval for cycles value: -113.27 -86.73 95% mean confidence interval for cycles %-change: -1.43% -1.11% Cycles are helped. GM45 total instructions in shared programs: 5037569 -> 5037557 (<.01%) instructions in affected programs: 1521 -> 1509 (-0.79%) helped: 6 HURT: 0 helped stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 helped stats (rel) min: 0.75% max: 0.83% x̄: 0.79% x̃: 0.79% 95% mean confidence interval for instructions value: -2.00 -2.00 95% mean confidence interval for instructions %-change: -0.83% -0.75% Instructions are helped. total cycles in shared programs: 128101478 -> 128100758 (<.01%) cycles in affected programs: 52746 -> 52026 (-1.37%) helped: 6 HURT: 0 helped stats (abs) min: 120 max: 120 x̄: 120.00 x̃: 120 helped stats (rel) min: 1.16% max: 1.66% x̄: 1.41% x̃: 1.41% 95% mean confidence interval for cycles value: -120.00 -120.00 95% mean confidence interval for cycles %-change: -1.70% -1.12% Cycles are helped. This change has almost no effect right now. However, removing this patch (but leaving the patch "nir/algebraic: Replace a bcsel of a b2f with a b2f(!(a || b))") after adding a patch that removes !(a < b) -> (a >= b) optimizations (like https://patchwork.freedesktop.org/patch/264787/) has the following results on Skylake: Skylake total instructions in shared programs: 15071022 -> 15089710 (0.12%) instructions in affected programs: 1022219 -> 1040907 (1.83%) helped: 1 HURT: 3937 helped stats (abs) min: 41 max: 41 x̄: 41.00 x̃: 41 helped stats (rel) min: 1.01% max: 1.01% x̄: 1.01% x̃: 1.01% HURT stats (abs) min: 1 max: 256 x̄: 4.76 x̃: 4 HURT stats (rel) min: 0.05% max: 11.18% x̄: 2.59% x̃: 2.60% 95% mean confidence interval for instructions value: 4.56 4.93 95% mean confidence interval for instructions %-change: 2.54% 2.64% Instructions are HURT. total cycles in shared programs: 369777134 -> 370092923 (0.09%) cycles in affected programs: 17516573 -> 17832362 (1.80%) helped: 115 HURT: 3624 helped stats (abs) min: 1 max: 1721 x̄: 81.18 x̃: 28 helped stats (rel) min: <.01% max: 10.74% x̄: 1.24% x̃: 0.65% HURT stats (abs) min: 1 max: 12640 x̄: 89.71 x̃: 54 HURT stats (rel) min: <.01% max: 28.24% x̄: 4.72% x̃: 4.52% 95% mean confidence interval for cycles value: 75.21 93.71 95% mean confidence interval for cycles %-change: 4.43% 4.64% Cycles are HURT. total spills in shared programs: 9450 -> 9442 (-0.08%) spills in affected programs: 166 -> 158 (-4.82%) helped: 2 HURT: 0 total fills in shared programs: 21115 -> 21094 (-0.10%) fills in affected programs: 438 -> 417 (-4.79%) helped: 2 HURT: 0 LOST: 1 GAINED: 0 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-12-03 12:06:50 -08:00
} else {
invert = false;
cond_reg = get_nir_src(if_stmt->condition);
}
/* first, put the condition into f0 */
fs_inst *inst = bld.MOV(bld.null_reg_d(),
intel/fs: Generate if instructions with inverted conditions Per-platform results were all over the place, so I have included all the results here. There is an important note at the bottom of the commit message. Skylake total instructions in shared programs: 15184683 -> 15184679 (<.01%) instructions in affected programs: 2786 -> 2782 (-0.14%) helped: 4 HURT: 0 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.05% max: 0.84% x̄: 0.44% x̃: 0.44% 95% mean confidence interval for instructions value: -1.00 -1.00 95% mean confidence interval for instructions %-change: -0.96% 0.07% Inconclusive result (%-change mean confidence interval includes 0). total cycles in shared programs: 370961367 -> 370961173 (<.01%) cycles in affected programs: 205867 -> 205673 (-0.09%) helped: 5 HURT: 1 helped stats (abs) min: 1 max: 149 x̄: 39.60 x̃: 16 helped stats (rel) min: 0.02% max: 1.05% x̄: 0.45% x̃: 0.55% HURT stats (abs) min: 4 max: 4 x̄: 4.00 x̃: 4 HURT stats (rel) min: 0.03% max: 0.03% x̄: 0.03% x̃: 0.03% 95% mean confidence interval for cycles value: -93.01 28.34 95% mean confidence interval for cycles %-change: -0.82% 0.08% Inconclusive result (value mean confidence interval includes 0). Broadwell total instructions in shared programs: 15465366 -> 15465362 (<.01%) instructions in affected programs: 2799 -> 2795 (-0.14%) helped: 4 HURT: 0 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.04% max: 0.84% x̄: 0.44% x̃: 0.44% 95% mean confidence interval for instructions value: -1.00 -1.00 95% mean confidence interval for instructions %-change: -0.96% 0.07% Inconclusive result (%-change mean confidence interval includes 0). total cycles in shared programs: 410938419 -> 410938531 (<.01%) cycles in affected programs: 566028 -> 566140 (0.02%) helped: 18 HURT: 17 helped stats (abs) min: 1 max: 16 x̄: 3.50 x̃: 1 helped stats (rel) min: <.01% max: 1.05% x̄: 0.13% x̃: <.01% HURT stats (abs) min: 1 max: 12 x̄: 10.29 x̃: 12 HURT stats (rel) min: <.01% max: 0.16% x̄: 0.08% x̃: 0.09% 95% mean confidence interval for cycles value: 0.31 6.09 95% mean confidence interval for cycles %-change: -0.10% 0.05% Inconclusive result (%-change mean confidence interval includes 0). Haswell total instructions in shared programs: 13749760 -> 13749759 (<.01%) instructions in affected programs: 2241 -> 2240 (-0.04%) helped: 1 HURT: 0 total cycles in shared programs: 385398913 -> 385398363 (<.01%) cycles in affected programs: 554914 -> 554364 (-0.10%) helped: 31 HURT: 1 helped stats (abs) min: 1 max: 453 x̄: 18.00 x̃: 6 helped stats (rel) min: <.01% max: 0.25% x̄: 0.03% x̃: 0.05% HURT stats (abs) min: 8 max: 8 x̄: 8.00 x̃: 8 HURT stats (rel) min: 0.06% max: 0.06% x̄: 0.06% x̃: 0.06% 95% mean confidence interval for cycles value: -45.88 11.51 95% mean confidence interval for cycles %-change: -0.05% -0.02% Inconclusive result (value mean confidence interval includes 0). Ivy Bridge total cycles in shared programs: 180663626 -> 180663881 (<.01%) cycles in affected programs: 472350 -> 472605 (0.05%) helped: 15 HURT: 30 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: <.01% max: <.01% x̄: <.01% x̃: <.01% HURT stats (abs) min: 8 max: 10 x̄: 9.00 x̃: 9 HURT stats (rel) min: 0.06% max: 0.14% x̄: 0.10% x̃: 0.10% 95% mean confidence interval for cycles value: 4.21 7.12 95% mean confidence interval for cycles %-change: 0.05% 0.08% Cycles are HURT. Sandy Bridge total cycles in shared programs: 154568664 -> 154569225 (<.01%) cycles in affected programs: 356486 -> 357047 (0.16%) helped: 1 HURT: 31 helped stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 helped stats (rel) min: 0.02% max: 0.02% x̄: 0.02% x̃: 0.02% HURT stats (abs) min: 4 max: 33 x̄: 18.16 x̃: 8 HURT stats (rel) min: 0.05% max: 0.23% x̄: 0.14% x̃: 0.10% 95% mean confidence interval for cycles value: 12.19 22.87 95% mean confidence interval for cycles %-change: 0.10% 0.16% Cycles are HURT. Iron Lake total instructions in shared programs: 8206589 -> 8206565 (<.01%) instructions in affected programs: 3024 -> 3000 (-0.79%) helped: 12 HURT: 0 helped stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 helped stats (rel) min: 0.75% max: 0.83% x̄: 0.80% x̃: 0.80% 95% mean confidence interval for instructions value: -2.00 -2.00 95% mean confidence interval for instructions %-change: -0.82% -0.77% Instructions are helped. total cycles in shared programs: 187657428 -> 187656228 (<.01%) cycles in affected programs: 95748 -> 94548 (-1.25%) helped: 12 HURT: 0 helped stats (abs) min: 80 max: 120 x̄: 100.00 x̃: 100 helped stats (rel) min: 1.00% max: 1.66% x̄: 1.27% x̃: 1.21% 95% mean confidence interval for cycles value: -113.27 -86.73 95% mean confidence interval for cycles %-change: -1.43% -1.11% Cycles are helped. GM45 total instructions in shared programs: 5037569 -> 5037557 (<.01%) instructions in affected programs: 1521 -> 1509 (-0.79%) helped: 6 HURT: 0 helped stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 helped stats (rel) min: 0.75% max: 0.83% x̄: 0.79% x̃: 0.79% 95% mean confidence interval for instructions value: -2.00 -2.00 95% mean confidence interval for instructions %-change: -0.83% -0.75% Instructions are helped. total cycles in shared programs: 128101478 -> 128100758 (<.01%) cycles in affected programs: 52746 -> 52026 (-1.37%) helped: 6 HURT: 0 helped stats (abs) min: 120 max: 120 x̄: 120.00 x̃: 120 helped stats (rel) min: 1.16% max: 1.66% x̄: 1.41% x̃: 1.41% 95% mean confidence interval for cycles value: -120.00 -120.00 95% mean confidence interval for cycles %-change: -1.70% -1.12% Cycles are helped. This change has almost no effect right now. However, removing this patch (but leaving the patch "nir/algebraic: Replace a bcsel of a b2f with a b2f(!(a || b))") after adding a patch that removes !(a < b) -> (a >= b) optimizations (like https://patchwork.freedesktop.org/patch/264787/) has the following results on Skylake: Skylake total instructions in shared programs: 15071022 -> 15089710 (0.12%) instructions in affected programs: 1022219 -> 1040907 (1.83%) helped: 1 HURT: 3937 helped stats (abs) min: 41 max: 41 x̄: 41.00 x̃: 41 helped stats (rel) min: 1.01% max: 1.01% x̄: 1.01% x̃: 1.01% HURT stats (abs) min: 1 max: 256 x̄: 4.76 x̃: 4 HURT stats (rel) min: 0.05% max: 11.18% x̄: 2.59% x̃: 2.60% 95% mean confidence interval for instructions value: 4.56 4.93 95% mean confidence interval for instructions %-change: 2.54% 2.64% Instructions are HURT. total cycles in shared programs: 369777134 -> 370092923 (0.09%) cycles in affected programs: 17516573 -> 17832362 (1.80%) helped: 115 HURT: 3624 helped stats (abs) min: 1 max: 1721 x̄: 81.18 x̃: 28 helped stats (rel) min: <.01% max: 10.74% x̄: 1.24% x̃: 0.65% HURT stats (abs) min: 1 max: 12640 x̄: 89.71 x̃: 54 HURT stats (rel) min: <.01% max: 28.24% x̄: 4.72% x̃: 4.52% 95% mean confidence interval for cycles value: 75.21 93.71 95% mean confidence interval for cycles %-change: 4.43% 4.64% Cycles are HURT. total spills in shared programs: 9450 -> 9442 (-0.08%) spills in affected programs: 166 -> 158 (-4.82%) helped: 2 HURT: 0 total fills in shared programs: 21115 -> 21094 (-0.10%) fills in affected programs: 438 -> 417 (-4.79%) helped: 2 HURT: 0 LOST: 1 GAINED: 0 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-12-03 12:06:50 -08:00
retype(cond_reg, BRW_REGISTER_TYPE_D));
inst->conditional_mod = BRW_CONDITIONAL_NZ;
intel/fs: Generate if instructions with inverted conditions Per-platform results were all over the place, so I have included all the results here. There is an important note at the bottom of the commit message. Skylake total instructions in shared programs: 15184683 -> 15184679 (<.01%) instructions in affected programs: 2786 -> 2782 (-0.14%) helped: 4 HURT: 0 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.05% max: 0.84% x̄: 0.44% x̃: 0.44% 95% mean confidence interval for instructions value: -1.00 -1.00 95% mean confidence interval for instructions %-change: -0.96% 0.07% Inconclusive result (%-change mean confidence interval includes 0). total cycles in shared programs: 370961367 -> 370961173 (<.01%) cycles in affected programs: 205867 -> 205673 (-0.09%) helped: 5 HURT: 1 helped stats (abs) min: 1 max: 149 x̄: 39.60 x̃: 16 helped stats (rel) min: 0.02% max: 1.05% x̄: 0.45% x̃: 0.55% HURT stats (abs) min: 4 max: 4 x̄: 4.00 x̃: 4 HURT stats (rel) min: 0.03% max: 0.03% x̄: 0.03% x̃: 0.03% 95% mean confidence interval for cycles value: -93.01 28.34 95% mean confidence interval for cycles %-change: -0.82% 0.08% Inconclusive result (value mean confidence interval includes 0). Broadwell total instructions in shared programs: 15465366 -> 15465362 (<.01%) instructions in affected programs: 2799 -> 2795 (-0.14%) helped: 4 HURT: 0 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.04% max: 0.84% x̄: 0.44% x̃: 0.44% 95% mean confidence interval for instructions value: -1.00 -1.00 95% mean confidence interval for instructions %-change: -0.96% 0.07% Inconclusive result (%-change mean confidence interval includes 0). total cycles in shared programs: 410938419 -> 410938531 (<.01%) cycles in affected programs: 566028 -> 566140 (0.02%) helped: 18 HURT: 17 helped stats (abs) min: 1 max: 16 x̄: 3.50 x̃: 1 helped stats (rel) min: <.01% max: 1.05% x̄: 0.13% x̃: <.01% HURT stats (abs) min: 1 max: 12 x̄: 10.29 x̃: 12 HURT stats (rel) min: <.01% max: 0.16% x̄: 0.08% x̃: 0.09% 95% mean confidence interval for cycles value: 0.31 6.09 95% mean confidence interval for cycles %-change: -0.10% 0.05% Inconclusive result (%-change mean confidence interval includes 0). Haswell total instructions in shared programs: 13749760 -> 13749759 (<.01%) instructions in affected programs: 2241 -> 2240 (-0.04%) helped: 1 HURT: 0 total cycles in shared programs: 385398913 -> 385398363 (<.01%) cycles in affected programs: 554914 -> 554364 (-0.10%) helped: 31 HURT: 1 helped stats (abs) min: 1 max: 453 x̄: 18.00 x̃: 6 helped stats (rel) min: <.01% max: 0.25% x̄: 0.03% x̃: 0.05% HURT stats (abs) min: 8 max: 8 x̄: 8.00 x̃: 8 HURT stats (rel) min: 0.06% max: 0.06% x̄: 0.06% x̃: 0.06% 95% mean confidence interval for cycles value: -45.88 11.51 95% mean confidence interval for cycles %-change: -0.05% -0.02% Inconclusive result (value mean confidence interval includes 0). Ivy Bridge total cycles in shared programs: 180663626 -> 180663881 (<.01%) cycles in affected programs: 472350 -> 472605 (0.05%) helped: 15 HURT: 30 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: <.01% max: <.01% x̄: <.01% x̃: <.01% HURT stats (abs) min: 8 max: 10 x̄: 9.00 x̃: 9 HURT stats (rel) min: 0.06% max: 0.14% x̄: 0.10% x̃: 0.10% 95% mean confidence interval for cycles value: 4.21 7.12 95% mean confidence interval for cycles %-change: 0.05% 0.08% Cycles are HURT. Sandy Bridge total cycles in shared programs: 154568664 -> 154569225 (<.01%) cycles in affected programs: 356486 -> 357047 (0.16%) helped: 1 HURT: 31 helped stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 helped stats (rel) min: 0.02% max: 0.02% x̄: 0.02% x̃: 0.02% HURT stats (abs) min: 4 max: 33 x̄: 18.16 x̃: 8 HURT stats (rel) min: 0.05% max: 0.23% x̄: 0.14% x̃: 0.10% 95% mean confidence interval for cycles value: 12.19 22.87 95% mean confidence interval for cycles %-change: 0.10% 0.16% Cycles are HURT. Iron Lake total instructions in shared programs: 8206589 -> 8206565 (<.01%) instructions in affected programs: 3024 -> 3000 (-0.79%) helped: 12 HURT: 0 helped stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 helped stats (rel) min: 0.75% max: 0.83% x̄: 0.80% x̃: 0.80% 95% mean confidence interval for instructions value: -2.00 -2.00 95% mean confidence interval for instructions %-change: -0.82% -0.77% Instructions are helped. total cycles in shared programs: 187657428 -> 187656228 (<.01%) cycles in affected programs: 95748 -> 94548 (-1.25%) helped: 12 HURT: 0 helped stats (abs) min: 80 max: 120 x̄: 100.00 x̃: 100 helped stats (rel) min: 1.00% max: 1.66% x̄: 1.27% x̃: 1.21% 95% mean confidence interval for cycles value: -113.27 -86.73 95% mean confidence interval for cycles %-change: -1.43% -1.11% Cycles are helped. GM45 total instructions in shared programs: 5037569 -> 5037557 (<.01%) instructions in affected programs: 1521 -> 1509 (-0.79%) helped: 6 HURT: 0 helped stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 helped stats (rel) min: 0.75% max: 0.83% x̄: 0.79% x̃: 0.79% 95% mean confidence interval for instructions value: -2.00 -2.00 95% mean confidence interval for instructions %-change: -0.83% -0.75% Instructions are helped. total cycles in shared programs: 128101478 -> 128100758 (<.01%) cycles in affected programs: 52746 -> 52026 (-1.37%) helped: 6 HURT: 0 helped stats (abs) min: 120 max: 120 x̄: 120.00 x̃: 120 helped stats (rel) min: 1.16% max: 1.66% x̄: 1.41% x̃: 1.41% 95% mean confidence interval for cycles value: -120.00 -120.00 95% mean confidence interval for cycles %-change: -1.70% -1.12% Cycles are helped. This change has almost no effect right now. However, removing this patch (but leaving the patch "nir/algebraic: Replace a bcsel of a b2f with a b2f(!(a || b))") after adding a patch that removes !(a < b) -> (a >= b) optimizations (like https://patchwork.freedesktop.org/patch/264787/) has the following results on Skylake: Skylake total instructions in shared programs: 15071022 -> 15089710 (0.12%) instructions in affected programs: 1022219 -> 1040907 (1.83%) helped: 1 HURT: 3937 helped stats (abs) min: 41 max: 41 x̄: 41.00 x̃: 41 helped stats (rel) min: 1.01% max: 1.01% x̄: 1.01% x̃: 1.01% HURT stats (abs) min: 1 max: 256 x̄: 4.76 x̃: 4 HURT stats (rel) min: 0.05% max: 11.18% x̄: 2.59% x̃: 2.60% 95% mean confidence interval for instructions value: 4.56 4.93 95% mean confidence interval for instructions %-change: 2.54% 2.64% Instructions are HURT. total cycles in shared programs: 369777134 -> 370092923 (0.09%) cycles in affected programs: 17516573 -> 17832362 (1.80%) helped: 115 HURT: 3624 helped stats (abs) min: 1 max: 1721 x̄: 81.18 x̃: 28 helped stats (rel) min: <.01% max: 10.74% x̄: 1.24% x̃: 0.65% HURT stats (abs) min: 1 max: 12640 x̄: 89.71 x̃: 54 HURT stats (rel) min: <.01% max: 28.24% x̄: 4.72% x̃: 4.52% 95% mean confidence interval for cycles value: 75.21 93.71 95% mean confidence interval for cycles %-change: 4.43% 4.64% Cycles are HURT. total spills in shared programs: 9450 -> 9442 (-0.08%) spills in affected programs: 166 -> 158 (-4.82%) helped: 2 HURT: 0 total fills in shared programs: 21115 -> 21094 (-0.10%) fills in affected programs: 438 -> 417 (-4.79%) helped: 2 HURT: 0 LOST: 1 GAINED: 0 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-12-03 12:06:50 -08:00
bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert;
nir_emit_cf_list(&if_stmt->then_list);
if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
bld.emit(BRW_OPCODE_ELSE);
nir_emit_cf_list(&if_stmt->else_list);
}
bld.emit(BRW_OPCODE_ENDIF);
if (devinfo->ver < 7)
limit_dispatch_width(16, "Non-uniform control flow unsupported "
"in SIMD32 mode.");
}
void
fs_visitor::nir_emit_loop(nir_loop *loop)
{
assert(!nir_loop_has_continue_construct(loop));
bld.emit(BRW_OPCODE_DO);
nir_emit_cf_list(&loop->body);
bld.emit(BRW_OPCODE_WHILE);
if (devinfo->ver < 7)
limit_dispatch_width(16, "Non-uniform control flow unsupported "
"in SIMD32 mode.");
}
void
fs_visitor::nir_emit_block(nir_block *block)
{
nir_foreach_instr(instr, block) {
nir_emit_instr(instr);
}
}
void
fs_visitor::nir_emit_instr(nir_instr *instr)
{
const fs_builder abld = bld.annotate(NULL, instr);
switch (instr->type) {
case nir_instr_type_alu:
nir_emit_alu(abld, nir_instr_as_alu(instr), true);
break;
case nir_instr_type_deref:
unreachable("All derefs should've been lowered");
break;
case nir_instr_type_intrinsic:
switch (stage) {
case MESA_SHADER_VERTEX:
nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
break;
case MESA_SHADER_TESS_CTRL:
nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
break;
case MESA_SHADER_TESS_EVAL:
nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
break;
case MESA_SHADER_GEOMETRY:
nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
break;
case MESA_SHADER_FRAGMENT:
nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
break;
case MESA_SHADER_COMPUTE:
case MESA_SHADER_KERNEL:
nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
break;
case MESA_SHADER_RAYGEN:
case MESA_SHADER_ANY_HIT:
case MESA_SHADER_CLOSEST_HIT:
case MESA_SHADER_MISS:
case MESA_SHADER_INTERSECTION:
case MESA_SHADER_CALLABLE:
nir_emit_bs_intrinsic(abld, nir_instr_as_intrinsic(instr));
break;
case MESA_SHADER_TASK:
nir_emit_task_intrinsic(abld, nir_instr_as_intrinsic(instr));
break;
case MESA_SHADER_MESH:
nir_emit_mesh_intrinsic(abld, nir_instr_as_intrinsic(instr));
break;
default:
unreachable("unsupported shader stage");
}
break;
case nir_instr_type_tex:
nir_emit_texture(abld, nir_instr_as_tex(instr));
break;
case nir_instr_type_load_const:
nir_emit_load_const(abld, nir_instr_as_load_const(instr));
break;
case nir_instr_type_undef:
/* We create a new VGRF for undefs on every use (by handling
* them in get_nir_src()), rather than for each definition.
* This helps register coalescing eliminate MOVs from undef.
*/
break;
case nir_instr_type_jump:
nir_emit_jump(abld, nir_instr_as_jump(instr));
break;
default:
unreachable("unknown instruction type");
}
}
/**
* Recognizes a parent instruction of nir_op_extract_* and changes the type to
* match instr.
*/
bool
fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
const fs_reg &result)
{
if (!instr->src[0].src.ssa->parent_instr)
return false;
if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
return false;
nir_alu_instr *src0 =
nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
return false;
unsigned element = nir_src_as_uint(src0->src[1].src);
/* Element type to extract.*/
const brw_reg_type type = brw_int_type(
src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
fs_reg op0 = get_nir_src(src0->src[0].src);
op0.type = brw_type_for_nir_type(devinfo,
(nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
nir_src_bit_size(src0->src[0].src)));
op0 = offset(op0, bld, src0->src[0].swizzle[0]);
intel/compiler: Drop nir_lower_to_source_mods() and related handling. I think we're unanimous in wanting to drop nir_lower_to_source_mods. It's a bit of complexity to handle in the backend, but perhaps more importantly, would be even more complexity to handle in nir_search. And, it turns out that since we made other compiler improvements in the last few years, they no longer appear to buy us anything of value. Summarizing the results from shader-db from this patch: - Icelake (scalar mode) Instruction counts: - 411 helped, 598 hurt (out of 139,470 shaders) - 99.2% of shaders remain unaffected. The average increase in instruction count in hurt programs is 1.78 instructions. - total instructions in shared programs: 17214951 -> 17215206 (<.01%) - instructions in affected programs: 1143879 -> 1144134 (0.02%) Cycles: - 1042 helped, 1357 hurt - total cycles in shared programs: 365613294 -> 365882263 (0.07%) - cycles in affected programs: 138155497 -> 138424466 (0.19%) - Haswell (both scalar and vector modes) Instruction counts: - 73 helped, 1680 hurt (out of 139,470 shaders) - 98.7% of shaders remain unaffected. The average increase in instruction count in hurt programs is 1.9 instructions. - total instructions in shared programs: 14199527 -> 14202262 (0.02%) - instructions in affected programs: 446499 -> 449234 (0.61%) Cycles: - 5253 helped, 5559 hurt - total cycles in shared programs: 359996545 -> 360038731 (0.01%) - cycles in affected programs: 155897127 -> 155939313 (0.03%) Given that ~99% of shader-db remains unaffected, and the affected programs are hurt by about 1-2 instructions - which are all cheap ALU instructions - this is unlikely to be measurable in terms of any real performance impact that would affect users. So, drop them and simplify the backend, and hopefully enable other future simplifications in NIR. Reviewed-by: Eric Anholt <eric@anholt.net> [v1] Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4616>
2020-04-18 01:20:42 -07:00
bld.MOV(result, subscript(op0, type, element));
return true;
}
bool
fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
const fs_reg &result)
{
nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
return false;
if (!nir_src_is_const(instr->src[1].src) ||
!nir_src_is_const(instr->src[2].src))
return false;
const float value1 = nir_src_as_float(instr->src[1].src);
const float value2 = nir_src_as_float(instr->src[2].src);
if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
return false;
/* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
assert(value1 == -value2);
fs_reg tmp = vgrf(glsl_type::int_type);
if (devinfo->ver >= 12) {
/* Bit 15 of g1.1 is 0 if the polygon is front facing. */
fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));
/* For (gl_FrontFacing ? 1.0 : -1.0), emit:
*
* or(8) tmp.1<2>W g1.1<0,1,0>W 0x00003f80W
* and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
*
* and negate g1.1<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
*/
if (value1 == -1.0f)
g1.negate = true;
bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
g1, brw_imm_uw(0x3f80));
} else if (devinfo->ver >= 6) {
/* Bit 15 of g0.0 is 0 if the polygon is front facing. */
fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
/* For (gl_FrontFacing ? 1.0 : -1.0), emit:
*
* or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W
* and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
*
* and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
*
* This negation looks like it's safe in practice, because bits 0:4 will
* surely be TRIANGLES
*/
if (value1 == -1.0f) {
g0.negate = true;
}
bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
g0, brw_imm_uw(0x3f80));
} else {
/* Bit 31 of g1.6 is 0 if the polygon is front facing. */
fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
/* For (gl_FrontFacing ? 1.0 : -1.0), emit:
*
* or(8) tmp<1>D g1.6<0,1,0>D 0x3f800000D
* and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
*
* and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
*
* This negation looks like it's safe in practice, because bits 0:4 will
* surely be TRIANGLES
*/
if (value1 == -1.0f) {
g1_6.negate = true;
}
bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
}
bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
return true;
}
static brw_rnd_mode
brw_rnd_mode_from_nir_op (const nir_op op) {
switch (op) {
case nir_op_f2f16_rtz:
return BRW_RND_MODE_RTZ;
case nir_op_f2f16_rtne:
return BRW_RND_MODE_RTNE;
default:
unreachable("Operation doesn't support rounding mode");
}
}
static brw_rnd_mode
brw_rnd_mode_from_execution_mode(unsigned execution_mode)
{
if (nir_has_any_rounding_mode_rtne(execution_mode))
return BRW_RND_MODE_RTNE;
if (nir_has_any_rounding_mode_rtz(execution_mode))
return BRW_RND_MODE_RTZ;
return BRW_RND_MODE_UNSPECIFIED;
}
fs_reg
fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld,
nir_alu_instr *instr,
fs_reg *op,
bool need_dest)
{
fs_reg result =
need_dest ? get_nir_def(instr->def) : bld.null_reg_ud();
result.type = brw_type_for_nir_type(devinfo,
(nir_alu_type)(nir_op_infos[instr->op].output_type |
instr->def.bit_size));
for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
op[i] = get_nir_src(instr->src[i].src);
op[i].type = brw_type_for_nir_type(devinfo,
(nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
nir_src_bit_size(instr->src[i].src)));
}
/* Move and vecN instrutions may still be vectored. Return the raw,
* vectored source and destination so that fs_visitor::nir_emit_alu can
* handle it. Other callers should not have to handle these kinds of
* instructions.
*/
switch (instr->op) {
case nir_op_mov:
case nir_op_vec2:
case nir_op_vec3:
case nir_op_vec4:
case nir_op_vec8:
case nir_op_vec16:
return result;
default:
break;
}
/* At this point, we have dealt with any instruction that operates on
* more than a single channel. Therefore, we can just adjust the source
* and destination registers for that channel and emit the instruction.
*/
unsigned channel = 0;
if (nir_op_infos[instr->op].output_size == 0) {
/* Since NIR is doing the scalarizing for us, we should only ever see
* vectorized operations with a single channel.
*/
nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
assert(util_bitcount(write_mask) == 1);
channel = ffs(write_mask) - 1;
result = offset(result, bld, channel);
}
for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
assert(nir_op_infos[instr->op].input_sizes[i] < 2);
op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
}
return result;
}
intel/fs: Emit logical-not of operands on Gen8+ On Gen8+ specifying negation of a logical operation such as AND actually performs a logical-not. Take advantage of this to generate fewer instructions. v2: Major rebase. Use nir_src_as_alu_instr. Fix swizzle handling. No changes on any pre-Gen8 platform. Skylake and Broadwell had similar results. (Broadwell shown) total instructions in shared programs: 15466902 -> 15466274 (<.01%) instructions in affected programs: 1262953 -> 1262325 (-0.05%) helped: 682 HURT: 4 helped stats (abs) min: 1 max: 5 x̄: 1.02 x̃: 1 helped stats (rel) min: 0.03% max: 2.40% x̄: 0.18% x̃: 0.04% HURT stats (abs) min: 1 max: 62 x̄: 17.50 x̃: 3 HURT stats (rel) min: 0.03% max: 1.89% x̄: 0.53% x̃: 0.10% 95% mean confidence interval for instructions value: -1.10 -0.73 95% mean confidence interval for instructions %-change: -0.19% -0.15% Instructions are helped. total cycles in shared programs: 410996093 -> 410950440 (-0.01%) cycles in affected programs: 144389048 -> 144343395 (-0.03%) helped: 519 HURT: 51 helped stats (abs) min: 1 max: 1060 x̄: 104.46 x̃: 140 helped stats (rel) min: 0.01% max: 10.98% x̄: 0.34% x̃: 0.03% HURT stats (abs) min: 1 max: 4060 x̄: 167.90 x̃: 22 HURT stats (rel) min: <.01% max: 8.20% x̄: 0.96% x̃: 0.25% 95% mean confidence interval for cycles value: -97.16 -63.02 95% mean confidence interval for cycles %-change: -0.32% -0.13% Cycles are helped. total spills in shared programs: 95311 -> 95329 (0.02%) spills in affected programs: 881 -> 899 (2.04%) helped: 0 HURT: 4 total fills in shared programs: 93629 -> 93634 (<.01%) fills in affected programs: 794 -> 799 (0.63%) helped: 1 HURT: 2 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2017-02-09 15:20:04 +00:00
void
fs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr,
fs_reg *op)
{
for (unsigned i = 0; i < 2; i++) {
nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
intel/fs: Emit logical-not of operands on Gen8+ On Gen8+ specifying negation of a logical operation such as AND actually performs a logical-not. Take advantage of this to generate fewer instructions. v2: Major rebase. Use nir_src_as_alu_instr. Fix swizzle handling. No changes on any pre-Gen8 platform. Skylake and Broadwell had similar results. (Broadwell shown) total instructions in shared programs: 15466902 -> 15466274 (<.01%) instructions in affected programs: 1262953 -> 1262325 (-0.05%) helped: 682 HURT: 4 helped stats (abs) min: 1 max: 5 x̄: 1.02 x̃: 1 helped stats (rel) min: 0.03% max: 2.40% x̄: 0.18% x̃: 0.04% HURT stats (abs) min: 1 max: 62 x̄: 17.50 x̃: 3 HURT stats (rel) min: 0.03% max: 1.89% x̄: 0.53% x̃: 0.10% 95% mean confidence interval for instructions value: -1.10 -0.73 95% mean confidence interval for instructions %-change: -0.19% -0.15% Instructions are helped. total cycles in shared programs: 410996093 -> 410950440 (-0.01%) cycles in affected programs: 144389048 -> 144343395 (-0.03%) helped: 519 HURT: 51 helped stats (abs) min: 1 max: 1060 x̄: 104.46 x̃: 140 helped stats (rel) min: 0.01% max: 10.98% x̄: 0.34% x̃: 0.03% HURT stats (abs) min: 1 max: 4060 x̄: 167.90 x̃: 22 HURT stats (rel) min: <.01% max: 8.20% x̄: 0.96% x̃: 0.25% 95% mean confidence interval for cycles value: -97.16 -63.02 95% mean confidence interval for cycles %-change: -0.32% -0.13% Cycles are helped. total spills in shared programs: 95311 -> 95329 (0.02%) spills in affected programs: 881 -> 899 (2.04%) helped: 0 HURT: 4 total fills in shared programs: 93629 -> 93634 (<.01%) fills in affected programs: 794 -> 799 (0.63%) helped: 1 HURT: 2 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2017-02-09 15:20:04 +00:00
intel/compiler: Drop nir_lower_to_source_mods() and related handling. I think we're unanimous in wanting to drop nir_lower_to_source_mods. It's a bit of complexity to handle in the backend, but perhaps more importantly, would be even more complexity to handle in nir_search. And, it turns out that since we made other compiler improvements in the last few years, they no longer appear to buy us anything of value. Summarizing the results from shader-db from this patch: - Icelake (scalar mode) Instruction counts: - 411 helped, 598 hurt (out of 139,470 shaders) - 99.2% of shaders remain unaffected. The average increase in instruction count in hurt programs is 1.78 instructions. - total instructions in shared programs: 17214951 -> 17215206 (<.01%) - instructions in affected programs: 1143879 -> 1144134 (0.02%) Cycles: - 1042 helped, 1357 hurt - total cycles in shared programs: 365613294 -> 365882263 (0.07%) - cycles in affected programs: 138155497 -> 138424466 (0.19%) - Haswell (both scalar and vector modes) Instruction counts: - 73 helped, 1680 hurt (out of 139,470 shaders) - 98.7% of shaders remain unaffected. The average increase in instruction count in hurt programs is 1.9 instructions. - total instructions in shared programs: 14199527 -> 14202262 (0.02%) - instructions in affected programs: 446499 -> 449234 (0.61%) Cycles: - 5253 helped, 5559 hurt - total cycles in shared programs: 359996545 -> 360038731 (0.01%) - cycles in affected programs: 155897127 -> 155939313 (0.03%) Given that ~99% of shader-db remains unaffected, and the affected programs are hurt by about 1-2 instructions - which are all cheap ALU instructions - this is unlikely to be measurable in terms of any real performance impact that would affect users. So, drop them and simplify the backend, and hopefully enable other future simplifications in NIR. Reviewed-by: Eric Anholt <eric@anholt.net> [v1] Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4616>
2020-04-18 01:20:42 -07:00
if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
intel/fs: Emit logical-not of operands on Gen8+ On Gen8+ specifying negation of a logical operation such as AND actually performs a logical-not. Take advantage of this to generate fewer instructions. v2: Major rebase. Use nir_src_as_alu_instr. Fix swizzle handling. No changes on any pre-Gen8 platform. Skylake and Broadwell had similar results. (Broadwell shown) total instructions in shared programs: 15466902 -> 15466274 (<.01%) instructions in affected programs: 1262953 -> 1262325 (-0.05%) helped: 682 HURT: 4 helped stats (abs) min: 1 max: 5 x̄: 1.02 x̃: 1 helped stats (rel) min: 0.03% max: 2.40% x̄: 0.18% x̃: 0.04% HURT stats (abs) min: 1 max: 62 x̄: 17.50 x̃: 3 HURT stats (rel) min: 0.03% max: 1.89% x̄: 0.53% x̃: 0.10% 95% mean confidence interval for instructions value: -1.10 -0.73 95% mean confidence interval for instructions %-change: -0.19% -0.15% Instructions are helped. total cycles in shared programs: 410996093 -> 410950440 (-0.01%) cycles in affected programs: 144389048 -> 144343395 (-0.03%) helped: 519 HURT: 51 helped stats (abs) min: 1 max: 1060 x̄: 104.46 x̃: 140 helped stats (rel) min: 0.01% max: 10.98% x̄: 0.34% x̃: 0.03% HURT stats (abs) min: 1 max: 4060 x̄: 167.90 x̃: 22 HURT stats (rel) min: <.01% max: 8.20% x̄: 0.96% x̃: 0.25% 95% mean confidence interval for cycles value: -97.16 -63.02 95% mean confidence interval for cycles %-change: -0.32% -0.13% Cycles are helped. total spills in shared programs: 95311 -> 95329 (0.02%) spills in affected programs: 881 -> 899 (2.04%) helped: 0 HURT: 4 total fills in shared programs: 93629 -> 93634 (<.01%) fills in affected programs: 794 -> 799 (0.63%) helped: 1 HURT: 2 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2017-02-09 15:20:04 +00:00
/* The source of the inot is now the source of instr. */
prepare_alu_destination_and_sources(bld, inot_instr, &op[i], false);
assert(!op[i].negate);
op[i].negate = true;
} else {
op[i] = resolve_source_modifiers(bld, op[i]);
intel/fs: Emit logical-not of operands on Gen8+ On Gen8+ specifying negation of a logical operation such as AND actually performs a logical-not. Take advantage of this to generate fewer instructions. v2: Major rebase. Use nir_src_as_alu_instr. Fix swizzle handling. No changes on any pre-Gen8 platform. Skylake and Broadwell had similar results. (Broadwell shown) total instructions in shared programs: 15466902 -> 15466274 (<.01%) instructions in affected programs: 1262953 -> 1262325 (-0.05%) helped: 682 HURT: 4 helped stats (abs) min: 1 max: 5 x̄: 1.02 x̃: 1 helped stats (rel) min: 0.03% max: 2.40% x̄: 0.18% x̃: 0.04% HURT stats (abs) min: 1 max: 62 x̄: 17.50 x̃: 3 HURT stats (rel) min: 0.03% max: 1.89% x̄: 0.53% x̃: 0.10% 95% mean confidence interval for instructions value: -1.10 -0.73 95% mean confidence interval for instructions %-change: -0.19% -0.15% Instructions are helped. total cycles in shared programs: 410996093 -> 410950440 (-0.01%) cycles in affected programs: 144389048 -> 144343395 (-0.03%) helped: 519 HURT: 51 helped stats (abs) min: 1 max: 1060 x̄: 104.46 x̃: 140 helped stats (rel) min: 0.01% max: 10.98% x̄: 0.34% x̃: 0.03% HURT stats (abs) min: 1 max: 4060 x̄: 167.90 x̃: 22 HURT stats (rel) min: <.01% max: 8.20% x̄: 0.96% x̃: 0.25% 95% mean confidence interval for cycles value: -97.16 -63.02 95% mean confidence interval for cycles %-change: -0.32% -0.13% Cycles are helped. total spills in shared programs: 95311 -> 95329 (0.02%) spills in affected programs: 881 -> 899 (2.04%) helped: 0 HURT: 4 total fills in shared programs: 93629 -> 93634 (<.01%) fills in affected programs: 794 -> 799 (0.63%) helped: 1 HURT: 2 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2017-02-09 15:20:04 +00:00
}
}
}
intel/fs: Emit better code for b2f(inot(a)) and b2i(inot(a)) Since Boolean values are either -1 (true) or 0 (false), b2f(inot(a)) maps -1 => 0.0 and 0 => 1.0. This is equivalent to 1.0 + float(boolBitsToInt(a)). On Intel GPUs, ADD is one of the few instructions that can type-convert during write to destination, so we can achieve this in a single instruction: add g47F, g26D, 1D v2: Fix swizzles. v3: Fix typos in comments. Noticed by Ken. All Gen6+ platforms had similar results. (Skylake shown) Skylake total instructions in shared programs: 15185583 -> 15184683 (<.01%) instructions in affected programs: 239389 -> 238489 (-0.38%) helped: 899 HURT: 1 helped stats (abs) min: 1 max: 2 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.15% max: 1.85% x̄: 0.49% x̃: 0.44% HURT stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 HURT stats (rel) min: 0.09% max: 0.09% x̄: 0.09% x̃: 0.09% 95% mean confidence interval for instructions value: -1.01 -0.99 95% mean confidence interval for instructions %-change: -0.51% -0.48% Instructions are helped. total cycles in shared programs: 370964249 -> 370961508 (<.01%) cycles in affected programs: 1487586 -> 1484845 (-0.18%) helped: 420 HURT: 268 helped stats (abs) min: 1 max: 232 x̄: 22.41 x̃: 6 helped stats (rel) min: 0.05% max: 22.60% x̄: 1.30% x̃: 0.41% HURT stats (abs) min: 1 max: 230 x̄: 24.90 x̃: 10 HURT stats (rel) min: <.01% max: 21.60% x̄: 1.45% x̃: 0.52% 95% mean confidence interval for cycles value: -7.61 -0.36 95% mean confidence interval for cycles %-change: -0.44% -0.02% Cycles are helped. No changes on Iron Lake or GM45. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-12-03 15:53:36 -08:00
bool
fs_visitor::try_emit_b2fi_of_inot(const fs_builder &bld,
fs_reg result,
nir_alu_instr *instr)
{
intel/fs: Enable b2f(inot(a)) and b2i(inot(a)) optimization for Gfx12+ The commit enables the optimization for Intel Gfx12+ graphics. Tigerlake ``` total instructions in shared programs: 1289326 -> 1289015 (-0.02%) instructions in affected programs: 37841 -> 37530 (-0.82%) helped: 78 HURT: 9 helped stats (abs) min: 1 max: 26 x̄: 4.69 x̃: 3 helped stats (rel) min: 0.10% max: 12.50% x̄: 2.07% x̃: 1.21% HURT stats (abs) min: 1 max: 18 x̄: 6.11 x̃: 4 HURT stats (rel) min: 0.16% max: 1.95% x̄: 0.94% x̃: 0.61% 95% mean confidence interval for instructions value: -4.95 -2.20 95% mean confidence interval for instructions %-change: -2.34% -1.18% Instructions are helped. total cycles in shared programs: 105606388 -> 105606442 (<.01%) cycles in affected programs: 620119 -> 620173 (<.01%) helped: 49 HURT: 28 helped stats (abs) min: 2 max: 3618 x̄: 228.63 x̃: 12 helped stats (rel) min: 0.02% max: 23.31% x̄: 4.60% x̃: 1.11% HURT stats (abs) min: 1 max: 2142 x̄: 402.04 x̃: 29 HURT stats (rel) min: 0.01% max: 36.42% x̄: 5.01% x̃: 0.46% 95% mean confidence interval for cycles value: -151.80 153.20 95% mean confidence interval for cycles %-change: -3.00% 0.79% Inconclusive result (value mean confidence interval includes 0). ``` Related-to: https://gitlab.freedesktop.org/mesa/mesa/-/commit/7725d609387a8165ccb71e2d9e0221d9248b1729 Signed-off-by: Mykhailo Skorokhodov <mykhailo.skorokhodov@globallogic.com> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14017>
2021-12-02 10:33:05 +02:00
if (devinfo->ver < 6 || devinfo->verx10 >= 125)
intel/fs: Emit better code for b2f(inot(a)) and b2i(inot(a)) Since Boolean values are either -1 (true) or 0 (false), b2f(inot(a)) maps -1 => 0.0 and 0 => 1.0. This is equivalent to 1.0 + float(boolBitsToInt(a)). On Intel GPUs, ADD is one of the few instructions that can type-convert during write to destination, so we can achieve this in a single instruction: add g47F, g26D, 1D v2: Fix swizzles. v3: Fix typos in comments. Noticed by Ken. All Gen6+ platforms had similar results. (Skylake shown) Skylake total instructions in shared programs: 15185583 -> 15184683 (<.01%) instructions in affected programs: 239389 -> 238489 (-0.38%) helped: 899 HURT: 1 helped stats (abs) min: 1 max: 2 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.15% max: 1.85% x̄: 0.49% x̃: 0.44% HURT stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 HURT stats (rel) min: 0.09% max: 0.09% x̄: 0.09% x̃: 0.09% 95% mean confidence interval for instructions value: -1.01 -0.99 95% mean confidence interval for instructions %-change: -0.51% -0.48% Instructions are helped. total cycles in shared programs: 370964249 -> 370961508 (<.01%) cycles in affected programs: 1487586 -> 1484845 (-0.18%) helped: 420 HURT: 268 helped stats (abs) min: 1 max: 232 x̄: 22.41 x̃: 6 helped stats (rel) min: 0.05% max: 22.60% x̄: 1.30% x̃: 0.41% HURT stats (abs) min: 1 max: 230 x̄: 24.90 x̃: 10 HURT stats (rel) min: <.01% max: 21.60% x̄: 1.45% x̃: 0.52% 95% mean confidence interval for cycles value: -7.61 -0.36 95% mean confidence interval for cycles %-change: -0.44% -0.02% Cycles are helped. No changes on Iron Lake or GM45. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-12-03 15:53:36 -08:00
return false;
nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
intel/fs: Emit better code for b2f(inot(a)) and b2i(inot(a)) Since Boolean values are either -1 (true) or 0 (false), b2f(inot(a)) maps -1 => 0.0 and 0 => 1.0. This is equivalent to 1.0 + float(boolBitsToInt(a)). On Intel GPUs, ADD is one of the few instructions that can type-convert during write to destination, so we can achieve this in a single instruction: add g47F, g26D, 1D v2: Fix swizzles. v3: Fix typos in comments. Noticed by Ken. All Gen6+ platforms had similar results. (Skylake shown) Skylake total instructions in shared programs: 15185583 -> 15184683 (<.01%) instructions in affected programs: 239389 -> 238489 (-0.38%) helped: 899 HURT: 1 helped stats (abs) min: 1 max: 2 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.15% max: 1.85% x̄: 0.49% x̃: 0.44% HURT stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 HURT stats (rel) min: 0.09% max: 0.09% x̄: 0.09% x̃: 0.09% 95% mean confidence interval for instructions value: -1.01 -0.99 95% mean confidence interval for instructions %-change: -0.51% -0.48% Instructions are helped. total cycles in shared programs: 370964249 -> 370961508 (<.01%) cycles in affected programs: 1487586 -> 1484845 (-0.18%) helped: 420 HURT: 268 helped stats (abs) min: 1 max: 232 x̄: 22.41 x̃: 6 helped stats (rel) min: 0.05% max: 22.60% x̄: 1.30% x̃: 0.41% HURT stats (abs) min: 1 max: 230 x̄: 24.90 x̃: 10 HURT stats (rel) min: <.01% max: 21.60% x̄: 1.45% x̃: 0.52% 95% mean confidence interval for cycles value: -7.61 -0.36 95% mean confidence interval for cycles %-change: -0.44% -0.02% Cycles are helped. No changes on Iron Lake or GM45. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-12-03 15:53:36 -08:00
if (inot_instr == NULL || inot_instr->op != nir_op_inot)
return false;
/* HF is also possible as a destination on BDW+. For nir_op_b2i, the set
* of valid size-changing combinations is a bit more complex.
*
* The source restriction is just because I was lazy about generating the
* constant below.
*/
if (instr->def.bit_size != 32 ||
intel/fs: Emit better code for b2f(inot(a)) and b2i(inot(a)) Since Boolean values are either -1 (true) or 0 (false), b2f(inot(a)) maps -1 => 0.0 and 0 => 1.0. This is equivalent to 1.0 + float(boolBitsToInt(a)). On Intel GPUs, ADD is one of the few instructions that can type-convert during write to destination, so we can achieve this in a single instruction: add g47F, g26D, 1D v2: Fix swizzles. v3: Fix typos in comments. Noticed by Ken. All Gen6+ platforms had similar results. (Skylake shown) Skylake total instructions in shared programs: 15185583 -> 15184683 (<.01%) instructions in affected programs: 239389 -> 238489 (-0.38%) helped: 899 HURT: 1 helped stats (abs) min: 1 max: 2 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.15% max: 1.85% x̄: 0.49% x̃: 0.44% HURT stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 HURT stats (rel) min: 0.09% max: 0.09% x̄: 0.09% x̃: 0.09% 95% mean confidence interval for instructions value: -1.01 -0.99 95% mean confidence interval for instructions %-change: -0.51% -0.48% Instructions are helped. total cycles in shared programs: 370964249 -> 370961508 (<.01%) cycles in affected programs: 1487586 -> 1484845 (-0.18%) helped: 420 HURT: 268 helped stats (abs) min: 1 max: 232 x̄: 22.41 x̃: 6 helped stats (rel) min: 0.05% max: 22.60% x̄: 1.30% x̃: 0.41% HURT stats (abs) min: 1 max: 230 x̄: 24.90 x̃: 10 HURT stats (rel) min: <.01% max: 21.60% x̄: 1.45% x̃: 0.52% 95% mean confidence interval for cycles value: -7.61 -0.36 95% mean confidence interval for cycles %-change: -0.44% -0.02% Cycles are helped. No changes on Iron Lake or GM45. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-12-03 15:53:36 -08:00
nir_src_bit_size(inot_instr->src[0].src) != 32)
return false;
/* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0. Since a can only be 0 or -1,
* this is float(1 + a).
*/
fs_reg op;
prepare_alu_destination_and_sources(bld, inot_instr, &op, false);
/* Ignore the saturate modifier, if there is one. The result of the
* arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
*/
intel/fs: Emit better code for b2f(inot(a)) and b2i(inot(a)) Since Boolean values are either -1 (true) or 0 (false), b2f(inot(a)) maps -1 => 0.0 and 0 => 1.0. This is equivalent to 1.0 + float(boolBitsToInt(a)). On Intel GPUs, ADD is one of the few instructions that can type-convert during write to destination, so we can achieve this in a single instruction: add g47F, g26D, 1D v2: Fix swizzles. v3: Fix typos in comments. Noticed by Ken. All Gen6+ platforms had similar results. (Skylake shown) Skylake total instructions in shared programs: 15185583 -> 15184683 (<.01%) instructions in affected programs: 239389 -> 238489 (-0.38%) helped: 899 HURT: 1 helped stats (abs) min: 1 max: 2 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.15% max: 1.85% x̄: 0.49% x̃: 0.44% HURT stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 HURT stats (rel) min: 0.09% max: 0.09% x̄: 0.09% x̃: 0.09% 95% mean confidence interval for instructions value: -1.01 -0.99 95% mean confidence interval for instructions %-change: -0.51% -0.48% Instructions are helped. total cycles in shared programs: 370964249 -> 370961508 (<.01%) cycles in affected programs: 1487586 -> 1484845 (-0.18%) helped: 420 HURT: 268 helped stats (abs) min: 1 max: 232 x̄: 22.41 x̃: 6 helped stats (rel) min: 0.05% max: 22.60% x̄: 1.30% x̃: 0.41% HURT stats (abs) min: 1 max: 230 x̄: 24.90 x̃: 10 HURT stats (rel) min: <.01% max: 21.60% x̄: 1.45% x̃: 0.52% 95% mean confidence interval for cycles value: -7.61 -0.36 95% mean confidence interval for cycles %-change: -0.44% -0.02% Cycles are helped. No changes on Iron Lake or GM45. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-12-03 15:53:36 -08:00
bld.ADD(result, op, brw_imm_d(1));
return true;
}
/**
* Emit code for nir_op_fsign possibly fused with a nir_op_fmul
*
* If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
* the source of \c instr that is a \c nir_op_fsign.
*/
void
fs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr,
fs_reg result, fs_reg *op, unsigned fsign_src)
{
fs_inst *inst;
assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
assert(fsign_src < nir_op_infos[instr->op].num_inputs);
if (instr->op != nir_op_fsign) {
const nir_alu_instr *const fsign_instr =
nir_src_as_alu_instr(instr->src[fsign_src].src);
/* op[fsign_src] has the nominal result of the fsign, and op[1 -
* fsign_src] has the other multiply source. This must be rearranged so
* that op[0] is the source of the fsign op[1] is the other multiply
* source.
*/
if (fsign_src != 0)
op[1] = op[0];
op[0] = get_nir_src(fsign_instr->src[0].src);
const nir_alu_type t =
(nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
nir_src_bit_size(fsign_instr->src[0].src));
op[0].type = brw_type_for_nir_type(devinfo, t);
unsigned channel = 0;
if (nir_op_infos[instr->op].output_size == 0) {
/* Since NIR is doing the scalarizing for us, we should only ever see
* vectorized operations with a single channel.
*/
nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
assert(util_bitcount(write_mask) == 1);
channel = ffs(write_mask) - 1;
}
op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
}
if (type_sz(op[0].type) == 2) {
/* AND(val, 0x8000) gives the sign bit.
*
* Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
*/
fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);
op[0].type = BRW_REGISTER_TYPE_UW;
result.type = BRW_REGISTER_TYPE_UW;
bld.AND(result, op[0], brw_imm_uw(0x8000u));
if (instr->op == nir_op_fsign)
inst = bld.OR(result, result, brw_imm_uw(0x3c00u));
else {
/* Use XOR here to get the result sign correct. */
inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UW));
}
inst->predicate = BRW_PREDICATE_NORMAL;
} else if (type_sz(op[0].type) == 4) {
/* AND(val, 0x80000000) gives the sign bit.
*
* Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
* zero.
*/
bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
op[0].type = BRW_REGISTER_TYPE_UD;
result.type = BRW_REGISTER_TYPE_UD;
bld.AND(result, op[0], brw_imm_ud(0x80000000u));
if (instr->op == nir_op_fsign)
inst = bld.OR(result, result, brw_imm_ud(0x3f800000u));
else {
/* Use XOR here to get the result sign correct. */
inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UD));
}
inst->predicate = BRW_PREDICATE_NORMAL;
} else {
/* For doubles we do the same but we need to consider:
*
* - 2-src instructions can't operate with 64-bit immediates
* - The sign is encoded in the high 32-bit of each DF
* - We need to produce a DF result.
*/
fs_reg zero = vgrf(glsl_type::double_type);
bld.MOV(zero, setup_imm_df(bld, 0.0));
bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
bld.MOV(result, zero);
fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
brw_imm_ud(0x80000000u));
if (instr->op == nir_op_fsign) {
set_predicate(BRW_PREDICATE_NORMAL,
bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
} else {
if (devinfo->has_64bit_int) {
/* This could be done better in some cases. If the scale is an
* immediate with the low 32-bits all 0, emitting a separate XOR and
* OR would allow an algebraic optimization to remove the OR. There
* are currently zero instances of fsign(double(x))*IMM in shader-db
* or any test suite, so it is hard to care at this time.
*/
fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
inst = bld.XOR(result_int64, result_int64,
retype(op[1], BRW_REGISTER_TYPE_UQ));
} else {
fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
bld.MOV(subscript(result_int64, BRW_REGISTER_TYPE_UD, 0),
subscript(op[1], BRW_REGISTER_TYPE_UD, 0));
bld.XOR(subscript(result_int64, BRW_REGISTER_TYPE_UD, 1),
subscript(result_int64, BRW_REGISTER_TYPE_UD, 1),
subscript(op[1], BRW_REGISTER_TYPE_UD, 1));
}
}
}
}
intel/fs: Generate better code for fsign multiplied by a value v2: Rebase on v2 changes in previous two commits. v3: Rebase on 85c35885b38 ("nir: Rework nir_src_as_alu_instr to not take a pointer"). shader-db results: Skylake and Broadwell had similar results. (Skylake shown) total instructions in shared programs: 15297100 -> 15282141 (-0.10%) instructions in affected programs: 956685 -> 941726 (-1.56%) helped: 4527 HURT: 0 helped stats (abs) min: 1 max: 221 x̄: 3.30 x̃: 2 helped stats (rel) min: 0.07% max: 10.53% x̄: 1.85% x̃: 1.37% 95% mean confidence interval for instructions value: -3.48 -3.12 95% mean confidence interval for instructions %-change: -1.88% -1.81% Instructions are helped. total cycles in shared programs: 372809551 -> 372597886 (-0.06%) cycles in affected programs: 13645512 -> 13433847 (-1.55%) helped: 4362 HURT: 125 helped stats (abs) min: 1 max: 2088 x̄: 50.73 x̃: 28 helped stats (rel) min: 0.01% max: 28.20% x̄: 2.77% x̃: 2.39% HURT stats (abs) min: 1 max: 1836 x̄: 76.90 x̃: 28 HURT stats (rel) min: <.01% max: 34.36% x̄: 3.03% x̃: 1.42% 95% mean confidence interval for cycles value: -50.98 -43.37 95% mean confidence interval for cycles %-change: -2.67% -2.55% Cycles are helped. total spills in shared programs: 23465 -> 23463 (<.01%) spills in affected programs: 42 -> 40 (-4.76%) helped: 1 HURT: 0 total fills in shared programs: 31766 -> 31763 (<.01%) fills in affected programs: 69 -> 66 (-4.35%) helped: 1 HURT: 0 Haswell total instructions in shared programs: 13839992 -> 13828311 (-0.08%) instructions in affected programs: 712503 -> 700822 (-1.64%) helped: 3477 HURT: 0 helped stats (abs) min: 1 max: 221 x̄: 3.36 x̃: 2 helped stats (rel) min: 0.07% max: 10.64% x̄: 1.96% x̃: 1.52% 95% mean confidence interval for instructions value: -3.58 -3.14 95% mean confidence interval for instructions %-change: -2.01% -1.92% Instructions are helped. total cycles in shared programs: 387026330 -> 386872483 (-0.04%) cycles in affected programs: 11329966 -> 11176119 (-1.36%) helped: 3307 HURT: 139 helped stats (abs) min: 2 max: 1776 x̄: 49.58 x̃: 18 helped stats (rel) min: 0.01% max: 20.38% x̄: 2.27% x̃: 1.79% HURT stats (abs) min: 1 max: 2314 x̄: 72.68 x̃: 20 HURT stats (rel) min: <.01% max: 33.99% x̄: 2.28% x̃: 0.96% 95% mean confidence interval for cycles value: -49.31 -39.98 95% mean confidence interval for cycles %-change: -2.15% -2.01% Cycles are helped. LOST: 1 GAINED: 0 Ivy Bridge total instructions in shared programs: 12045602 -> 12038463 (-0.06%) instructions in affected programs: 623837 -> 616698 (-1.14%) helped: 2498 HURT: 0 helped stats (abs) min: 1 max: 39 x̄: 2.86 x̃: 2 helped stats (rel) min: 0.05% max: 10.00% x̄: 1.30% x̃: 1.05% 95% mean confidence interval for instructions value: -2.96 -2.75 95% mean confidence interval for instructions %-change: -1.34% -1.26% Instructions are helped. total cycles in shared programs: 181025675 -> 180891323 (-0.07%) cycles in affected programs: 11329329 -> 11194977 (-1.19%) helped: 2439 HURT: 47 helped stats (abs) min: 1 max: 1565 x̄: 57.06 x̃: 26 helped stats (rel) min: 0.02% max: 24.56% x̄: 2.02% x̃: 1.64% HURT stats (abs) min: 1 max: 1269 x̄: 102.51 x̃: 43 HURT stats (rel) min: 0.11% max: 52.94% x̄: 4.15% x̃: 1.34% 95% mean confidence interval for cycles value: -59.91 -48.17 95% mean confidence interval for cycles %-change: -1.99% -1.82% Cycles are helped. Sandy Bridge, Iron Lake, and GM45 had similar results. (Sandy Bridge shown) total instructions in shared programs: 10896368 -> 10896339 (<.01%) instructions in affected programs: 3767 -> 3738 (-0.77%) helped: 17 HURT: 0 helped stats (abs) min: 1 max: 4 x̄: 1.71 x̃: 1 helped stats (rel) min: 0.13% max: 9.52% x̄: 3.58% x̃: 2.73% 95% mean confidence interval for instructions value: -2.27 -1.14 95% mean confidence interval for instructions %-change: -5.14% -2.03% Instructions are helped. total cycles in shared programs: 155091109 -> 155091021 (<.01%) cycles in affected programs: 47241 -> 47153 (-0.19%) helped: 15 HURT: 8 helped stats (abs) min: 2 max: 81 x̄: 15.73 x̃: 4 helped stats (rel) min: 0.03% max: 10.59% x̄: 1.55% x̃: 0.71% HURT stats (abs) min: 14 max: 32 x̄: 18.50 x̃: 17 HURT stats (rel) min: 0.32% max: 2.79% x̄: 2.43% x̃: 2.71% 95% mean confidence interval for cycles value: -14.59 6.93 95% mean confidence interval for cycles %-change: -1.41% 1.08% Inconclusive result (value mean confidence interval includes 0). Reviewed-by: Matt Turner <mattst88@gmail.com> [v2]
2018-06-25 19:55:31 -07:00
/**
* Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
intel/fs: Generate better code for fsign multiplied by a value v2: Rebase on v2 changes in previous two commits. v3: Rebase on 85c35885b38 ("nir: Rework nir_src_as_alu_instr to not take a pointer"). shader-db results: Skylake and Broadwell had similar results. (Skylake shown) total instructions in shared programs: 15297100 -> 15282141 (-0.10%) instructions in affected programs: 956685 -> 941726 (-1.56%) helped: 4527 HURT: 0 helped stats (abs) min: 1 max: 221 x̄: 3.30 x̃: 2 helped stats (rel) min: 0.07% max: 10.53% x̄: 1.85% x̃: 1.37% 95% mean confidence interval for instructions value: -3.48 -3.12 95% mean confidence interval for instructions %-change: -1.88% -1.81% Instructions are helped. total cycles in shared programs: 372809551 -> 372597886 (-0.06%) cycles in affected programs: 13645512 -> 13433847 (-1.55%) helped: 4362 HURT: 125 helped stats (abs) min: 1 max: 2088 x̄: 50.73 x̃: 28 helped stats (rel) min: 0.01% max: 28.20% x̄: 2.77% x̃: 2.39% HURT stats (abs) min: 1 max: 1836 x̄: 76.90 x̃: 28 HURT stats (rel) min: <.01% max: 34.36% x̄: 3.03% x̃: 1.42% 95% mean confidence interval for cycles value: -50.98 -43.37 95% mean confidence interval for cycles %-change: -2.67% -2.55% Cycles are helped. total spills in shared programs: 23465 -> 23463 (<.01%) spills in affected programs: 42 -> 40 (-4.76%) helped: 1 HURT: 0 total fills in shared programs: 31766 -> 31763 (<.01%) fills in affected programs: 69 -> 66 (-4.35%) helped: 1 HURT: 0 Haswell total instructions in shared programs: 13839992 -> 13828311 (-0.08%) instructions in affected programs: 712503 -> 700822 (-1.64%) helped: 3477 HURT: 0 helped stats (abs) min: 1 max: 221 x̄: 3.36 x̃: 2 helped stats (rel) min: 0.07% max: 10.64% x̄: 1.96% x̃: 1.52% 95% mean confidence interval for instructions value: -3.58 -3.14 95% mean confidence interval for instructions %-change: -2.01% -1.92% Instructions are helped. total cycles in shared programs: 387026330 -> 386872483 (-0.04%) cycles in affected programs: 11329966 -> 11176119 (-1.36%) helped: 3307 HURT: 139 helped stats (abs) min: 2 max: 1776 x̄: 49.58 x̃: 18 helped stats (rel) min: 0.01% max: 20.38% x̄: 2.27% x̃: 1.79% HURT stats (abs) min: 1 max: 2314 x̄: 72.68 x̃: 20 HURT stats (rel) min: <.01% max: 33.99% x̄: 2.28% x̃: 0.96% 95% mean confidence interval for cycles value: -49.31 -39.98 95% mean confidence interval for cycles %-change: -2.15% -2.01% Cycles are helped. LOST: 1 GAINED: 0 Ivy Bridge total instructions in shared programs: 12045602 -> 12038463 (-0.06%) instructions in affected programs: 623837 -> 616698 (-1.14%) helped: 2498 HURT: 0 helped stats (abs) min: 1 max: 39 x̄: 2.86 x̃: 2 helped stats (rel) min: 0.05% max: 10.00% x̄: 1.30% x̃: 1.05% 95% mean confidence interval for instructions value: -2.96 -2.75 95% mean confidence interval for instructions %-change: -1.34% -1.26% Instructions are helped. total cycles in shared programs: 181025675 -> 180891323 (-0.07%) cycles in affected programs: 11329329 -> 11194977 (-1.19%) helped: 2439 HURT: 47 helped stats (abs) min: 1 max: 1565 x̄: 57.06 x̃: 26 helped stats (rel) min: 0.02% max: 24.56% x̄: 2.02% x̃: 1.64% HURT stats (abs) min: 1 max: 1269 x̄: 102.51 x̃: 43 HURT stats (rel) min: 0.11% max: 52.94% x̄: 4.15% x̃: 1.34% 95% mean confidence interval for cycles value: -59.91 -48.17 95% mean confidence interval for cycles %-change: -1.99% -1.82% Cycles are helped. Sandy Bridge, Iron Lake, and GM45 had similar results. (Sandy Bridge shown) total instructions in shared programs: 10896368 -> 10896339 (<.01%) instructions in affected programs: 3767 -> 3738 (-0.77%) helped: 17 HURT: 0 helped stats (abs) min: 1 max: 4 x̄: 1.71 x̃: 1 helped stats (rel) min: 0.13% max: 9.52% x̄: 3.58% x̃: 2.73% 95% mean confidence interval for instructions value: -2.27 -1.14 95% mean confidence interval for instructions %-change: -5.14% -2.03% Instructions are helped. total cycles in shared programs: 155091109 -> 155091021 (<.01%) cycles in affected programs: 47241 -> 47153 (-0.19%) helped: 15 HURT: 8 helped stats (abs) min: 2 max: 81 x̄: 15.73 x̃: 4 helped stats (rel) min: 0.03% max: 10.59% x̄: 1.55% x̃: 0.71% HURT stats (abs) min: 14 max: 32 x̄: 18.50 x̃: 17 HURT stats (rel) min: 0.32% max: 2.79% x̄: 2.43% x̃: 2.71% 95% mean confidence interval for cycles value: -14.59 6.93 95% mean confidence interval for cycles %-change: -1.41% 1.08% Inconclusive result (value mean confidence interval includes 0). Reviewed-by: Matt Turner <mattst88@gmail.com> [v2]
2018-06-25 19:55:31 -07:00
*
* Checks the operands of a \c nir_op_fmul to determine whether or not
* \c emit_fsign could fuse the multiplication with the \c sign() calculation.
*
* \param instr The multiplication instruction
*
* \param fsign_src The source of \c instr that may or may not be a
* \c nir_op_fsign
*/
static bool
can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
{
assert(instr->op == nir_op_fmul);
nir_alu_instr *const fsign_instr =
nir_src_as_alu_instr(instr->src[fsign_src].src);
/* Rules:
*
* 1. instr->src[fsign_src] must be a nir_op_fsign.
* 2. The nir_op_fsign can only be used by this multiplication.
* 3. The source that is the nir_op_fsign does not have source modifiers.
* \c emit_fsign only examines the source modifiers of the source of the
* \c nir_op_fsign.
*
* The nir_op_fsign must also not have the saturate modifier, but steps
* have already been taken (in nir_opt_algebraic) to ensure that.
*/
return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
intel/compiler: Drop nir_lower_to_source_mods() and related handling. I think we're unanimous in wanting to drop nir_lower_to_source_mods. It's a bit of complexity to handle in the backend, but perhaps more importantly, would be even more complexity to handle in nir_search. And, it turns out that since we made other compiler improvements in the last few years, they no longer appear to buy us anything of value. Summarizing the results from shader-db from this patch: - Icelake (scalar mode) Instruction counts: - 411 helped, 598 hurt (out of 139,470 shaders) - 99.2% of shaders remain unaffected. The average increase in instruction count in hurt programs is 1.78 instructions. - total instructions in shared programs: 17214951 -> 17215206 (<.01%) - instructions in affected programs: 1143879 -> 1144134 (0.02%) Cycles: - 1042 helped, 1357 hurt - total cycles in shared programs: 365613294 -> 365882263 (0.07%) - cycles in affected programs: 138155497 -> 138424466 (0.19%) - Haswell (both scalar and vector modes) Instruction counts: - 73 helped, 1680 hurt (out of 139,470 shaders) - 98.7% of shaders remain unaffected. The average increase in instruction count in hurt programs is 1.9 instructions. - total instructions in shared programs: 14199527 -> 14202262 (0.02%) - instructions in affected programs: 446499 -> 449234 (0.61%) Cycles: - 5253 helped, 5559 hurt - total cycles in shared programs: 359996545 -> 360038731 (0.01%) - cycles in affected programs: 155897127 -> 155939313 (0.03%) Given that ~99% of shader-db remains unaffected, and the affected programs are hurt by about 1-2 instructions - which are all cheap ALU instructions - this is unlikely to be measurable in terms of any real performance impact that would affect users. So, drop them and simplify the backend, and hopefully enable other future simplifications in NIR. Reviewed-by: Eric Anholt <eric@anholt.net> [v1] Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4616>
2020-04-18 01:20:42 -07:00
is_used_once(fsign_instr);
intel/fs: Generate better code for fsign multiplied by a value v2: Rebase on v2 changes in previous two commits. v3: Rebase on 85c35885b38 ("nir: Rework nir_src_as_alu_instr to not take a pointer"). shader-db results: Skylake and Broadwell had similar results. (Skylake shown) total instructions in shared programs: 15297100 -> 15282141 (-0.10%) instructions in affected programs: 956685 -> 941726 (-1.56%) helped: 4527 HURT: 0 helped stats (abs) min: 1 max: 221 x̄: 3.30 x̃: 2 helped stats (rel) min: 0.07% max: 10.53% x̄: 1.85% x̃: 1.37% 95% mean confidence interval for instructions value: -3.48 -3.12 95% mean confidence interval for instructions %-change: -1.88% -1.81% Instructions are helped. total cycles in shared programs: 372809551 -> 372597886 (-0.06%) cycles in affected programs: 13645512 -> 13433847 (-1.55%) helped: 4362 HURT: 125 helped stats (abs) min: 1 max: 2088 x̄: 50.73 x̃: 28 helped stats (rel) min: 0.01% max: 28.20% x̄: 2.77% x̃: 2.39% HURT stats (abs) min: 1 max: 1836 x̄: 76.90 x̃: 28 HURT stats (rel) min: <.01% max: 34.36% x̄: 3.03% x̃: 1.42% 95% mean confidence interval for cycles value: -50.98 -43.37 95% mean confidence interval for cycles %-change: -2.67% -2.55% Cycles are helped. total spills in shared programs: 23465 -> 23463 (<.01%) spills in affected programs: 42 -> 40 (-4.76%) helped: 1 HURT: 0 total fills in shared programs: 31766 -> 31763 (<.01%) fills in affected programs: 69 -> 66 (-4.35%) helped: 1 HURT: 0 Haswell total instructions in shared programs: 13839992 -> 13828311 (-0.08%) instructions in affected programs: 712503 -> 700822 (-1.64%) helped: 3477 HURT: 0 helped stats (abs) min: 1 max: 221 x̄: 3.36 x̃: 2 helped stats (rel) min: 0.07% max: 10.64% x̄: 1.96% x̃: 1.52% 95% mean confidence interval for instructions value: -3.58 -3.14 95% mean confidence interval for instructions %-change: -2.01% -1.92% Instructions are helped. total cycles in shared programs: 387026330 -> 386872483 (-0.04%) cycles in affected programs: 11329966 -> 11176119 (-1.36%) helped: 3307 HURT: 139 helped stats (abs) min: 2 max: 1776 x̄: 49.58 x̃: 18 helped stats (rel) min: 0.01% max: 20.38% x̄: 2.27% x̃: 1.79% HURT stats (abs) min: 1 max: 2314 x̄: 72.68 x̃: 20 HURT stats (rel) min: <.01% max: 33.99% x̄: 2.28% x̃: 0.96% 95% mean confidence interval for cycles value: -49.31 -39.98 95% mean confidence interval for cycles %-change: -2.15% -2.01% Cycles are helped. LOST: 1 GAINED: 0 Ivy Bridge total instructions in shared programs: 12045602 -> 12038463 (-0.06%) instructions in affected programs: 623837 -> 616698 (-1.14%) helped: 2498 HURT: 0 helped stats (abs) min: 1 max: 39 x̄: 2.86 x̃: 2 helped stats (rel) min: 0.05% max: 10.00% x̄: 1.30% x̃: 1.05% 95% mean confidence interval for instructions value: -2.96 -2.75 95% mean confidence interval for instructions %-change: -1.34% -1.26% Instructions are helped. total cycles in shared programs: 181025675 -> 180891323 (-0.07%) cycles in affected programs: 11329329 -> 11194977 (-1.19%) helped: 2439 HURT: 47 helped stats (abs) min: 1 max: 1565 x̄: 57.06 x̃: 26 helped stats (rel) min: 0.02% max: 24.56% x̄: 2.02% x̃: 1.64% HURT stats (abs) min: 1 max: 1269 x̄: 102.51 x̃: 43 HURT stats (rel) min: 0.11% max: 52.94% x̄: 4.15% x̃: 1.34% 95% mean confidence interval for cycles value: -59.91 -48.17 95% mean confidence interval for cycles %-change: -1.99% -1.82% Cycles are helped. Sandy Bridge, Iron Lake, and GM45 had similar results. (Sandy Bridge shown) total instructions in shared programs: 10896368 -> 10896339 (<.01%) instructions in affected programs: 3767 -> 3738 (-0.77%) helped: 17 HURT: 0 helped stats (abs) min: 1 max: 4 x̄: 1.71 x̃: 1 helped stats (rel) min: 0.13% max: 9.52% x̄: 3.58% x̃: 2.73% 95% mean confidence interval for instructions value: -2.27 -1.14 95% mean confidence interval for instructions %-change: -5.14% -2.03% Instructions are helped. total cycles in shared programs: 155091109 -> 155091021 (<.01%) cycles in affected programs: 47241 -> 47153 (-0.19%) helped: 15 HURT: 8 helped stats (abs) min: 2 max: 81 x̄: 15.73 x̃: 4 helped stats (rel) min: 0.03% max: 10.59% x̄: 1.55% x̃: 0.71% HURT stats (abs) min: 14 max: 32 x̄: 18.50 x̃: 17 HURT stats (rel) min: 0.32% max: 2.79% x̄: 2.43% x̃: 2.71% 95% mean confidence interval for cycles value: -14.59 6.93 95% mean confidence interval for cycles %-change: -1.41% 1.08% Inconclusive result (value mean confidence interval includes 0). Reviewed-by: Matt Turner <mattst88@gmail.com> [v2]
2018-06-25 19:55:31 -07:00
}
intel/fs: Emit better code for bfi(..., 0) DG2, Tiger Lake, Ice Lake, and Skylake had similar results (Ice Lake shown) total instructions in shared programs: 20570141 -> 20570063 (<.01%) instructions in affected programs: 30679 -> 30601 (-0.25%) helped: 77 / HURT: 0 total cycles in shared programs: 902113977 -> 902118723 (<.01%) cycles in affected programs: 3255958 -> 3260704 (0.15%) helped: 60 / HURT: 19 Broadwell total instructions in shared programs: 18524633 -> 18524547 (<.01%) instructions in affected programs: 34095 -> 34009 (-0.25%) helped: 75 / HURT: 2 total cycles in shared programs: 949532394 -> 949543761 (<.01%) cycles in affected programs: 3419107 -> 3430474 (0.33%) helped: 57 / HURT: 24 total spills in shared programs: 22484 -> 22484 (0.00%) spills in affected programs: 516 -> 516 (0.00%) helped: 2 / HURT: 2 total fills in shared programs: 29346 -> 29338 (-0.03%) fills in affected programs: 572 -> 564 (-1.40%) helped: 4 / HURT: 0 Haswell total instructions in shared programs: 17331356 -> 17331523 (<.01%) instructions in affected programs: 27920 -> 28087 (0.60%) helped: 41 / HURT: 4 total cycles in shared programs: 936603192 -> 936574664 (<.01%) cycles in affected programs: 3417695 -> 3389167 (-0.83%) helped: 28 / HURT: 21 total spills in shared programs: 19718 -> 19756 (0.19%) spills in affected programs: 436 -> 474 (8.72%) helped: 0 / HURT: 4 total fills in shared programs: 22547 -> 22607 (0.27%) fills in affected programs: 444 -> 504 (13.51%) helped: 0 / HURT: 4 Ivy Bridge total cycles in shared programs: 463451277 -> 463451273 (<.01%) cycles in affected programs: 95870 -> 95866 (<.01%) helped: 3 / HURT: 2 DG2, Tiger Lake, Ice Lake, and Skylake had similar results (Ice Lake shown) Totals: Instrs: 152825278 -> 152819969 (-0.00%); split: -0.00%, +0.00% Cycles: 15014075626 -> 15014628652 (+0.00%); split: -0.01%, +0.01% Subgroup size: 8528536 -> 8528560 (+0.00%) Send messages: 7711431 -> 7711464 (+0.00%) Spill count: 99907 -> 99509 (-0.40%); split: -0.40%, +0.00% Fill count: 202459 -> 201598 (-0.43%); split: -0.43%, +0.00% Scratch Memory Size: 4376576 -> 4371456 (-0.12%) Totals from 2915 (0.44% of 662497) affected shaders: Instrs: 2288842 -> 2283533 (-0.23%); split: -0.24%, +0.01% Cycles: 471633295 -> 472186321 (+0.12%); split: -0.27%, +0.39% Subgroup size: 27488 -> 27512 (+0.09%) Send messages: 151344 -> 151377 (+0.02%) Spill count: 48091 -> 47693 (-0.83%); split: -0.83%, +0.00% Fill count: 59053 -> 58192 (-1.46%); split: -1.46%, +0.00% Scratch Memory Size: 1827840 -> 1822720 (-0.28%) Reviewed-by: Matt Turner <mattst88@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19968>
2022-11-16 13:12:50 -08:00
static bool
is_const_zero(const nir_src &src)
{
return nir_src_is_const(src) && nir_src_as_int(src) == 0;
}
void
fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
bool need_dest)
{
fs_inst *inst;
unsigned execution_mode =
bld.shader->nir->info.float_controls_execution_mode;
fs_reg op[NIR_MAX_VEC_COMPONENTS];
fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, need_dest);
#ifndef NDEBUG
/* Everything except raw moves, some type conversions, iabs, and ineg
* should have 8-bit sources lowered by nir_lower_bit_size in
* brw_preprocess_nir or by brw_nir_lower_conversions in
* brw_postprocess_nir.
*/
switch (instr->op) {
case nir_op_mov:
case nir_op_vec2:
case nir_op_vec3:
case nir_op_vec4:
case nir_op_vec8:
case nir_op_vec16:
case nir_op_i2f16:
case nir_op_i2f32:
case nir_op_i2i16:
case nir_op_i2i32:
case nir_op_u2f16:
case nir_op_u2f32:
case nir_op_u2u16:
case nir_op_u2u32:
case nir_op_iabs:
case nir_op_ineg:
case nir_op_pack_32_4x8_split:
break;
default:
for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
assert(type_sz(op[i].type) > 1);
}
}
#endif
switch (instr->op) {
case nir_op_mov:
case nir_op_vec2:
case nir_op_vec3:
case nir_op_vec4:
case nir_op_vec8:
case nir_op_vec16: {
fs_reg temp = result;
bool need_extra_copy = false;
nir_intrinsic_instr *store_reg =
nir_store_reg_for_def(&instr->def);
if (store_reg != NULL) {
nir_def *dest_reg = store_reg->src[1].ssa;
for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
nir_intrinsic_instr *load_reg =
nir_load_reg_for_def(instr->src[i].src.ssa);
if (load_reg == NULL)
continue;
if (load_reg->src[0].ssa == dest_reg) {
need_extra_copy = true;
temp = bld.vgrf(result.type, 4);
break;
}
}
}
nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
unsigned last_bit = util_last_bit(write_mask);
for (unsigned i = 0; i < last_bit; i++) {
if (!(write_mask & (1 << i)))
continue;
if (instr->op == nir_op_mov) {
bld.MOV(offset(temp, bld, i),
offset(op[0], bld, instr->src[0].swizzle[i]));
} else {
bld.MOV(offset(temp, bld, i),
offset(op[i], bld, instr->src[i].swizzle[0]));
}
}
/* In this case the source and destination registers were the same,
* so we need to insert an extra set of moves in order to deal with
* any swizzling.
*/
if (need_extra_copy) {
for (unsigned i = 0; i < last_bit; i++) {
if (!(write_mask & (1 << i)))
continue;
bld.MOV(offset(result, bld, i), offset(temp, bld, i));
}
}
return;
}
case nir_op_i2f32:
case nir_op_u2f32:
if (optimize_extract_to_float(instr, result))
return;
inst = bld.MOV(result, op[0]);
break;
case nir_op_f2f16_rtne:
case nir_op_f2f16_rtz:
case nir_op_f2f16: {
brw_rnd_mode rnd = BRW_RND_MODE_UNSPECIFIED;
if (nir_op_f2f16 == instr->op)
rnd = brw_rnd_mode_from_execution_mode(execution_mode);
else
rnd = brw_rnd_mode_from_nir_op(instr->op);
if (BRW_RND_MODE_UNSPECIFIED != rnd)
bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(rnd));
assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
inst = bld.F32TO16(result, op[0]);
break;
}
case nir_op_b2i8:
case nir_op_b2i16:
case nir_op_b2i32:
case nir_op_b2i64:
case nir_op_b2f16:
case nir_op_b2f32:
case nir_op_b2f64:
intel/fs: Emit better code for b2f(inot(a)) and b2i(inot(a)) Since Boolean values are either -1 (true) or 0 (false), b2f(inot(a)) maps -1 => 0.0 and 0 => 1.0. This is equivalent to 1.0 + float(boolBitsToInt(a)). On Intel GPUs, ADD is one of the few instructions that can type-convert during write to destination, so we can achieve this in a single instruction: add g47F, g26D, 1D v2: Fix swizzles. v3: Fix typos in comments. Noticed by Ken. All Gen6+ platforms had similar results. (Skylake shown) Skylake total instructions in shared programs: 15185583 -> 15184683 (<.01%) instructions in affected programs: 239389 -> 238489 (-0.38%) helped: 899 HURT: 1 helped stats (abs) min: 1 max: 2 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.15% max: 1.85% x̄: 0.49% x̃: 0.44% HURT stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 HURT stats (rel) min: 0.09% max: 0.09% x̄: 0.09% x̃: 0.09% 95% mean confidence interval for instructions value: -1.01 -0.99 95% mean confidence interval for instructions %-change: -0.51% -0.48% Instructions are helped. total cycles in shared programs: 370964249 -> 370961508 (<.01%) cycles in affected programs: 1487586 -> 1484845 (-0.18%) helped: 420 HURT: 268 helped stats (abs) min: 1 max: 232 x̄: 22.41 x̃: 6 helped stats (rel) min: 0.05% max: 22.60% x̄: 1.30% x̃: 0.41% HURT stats (abs) min: 1 max: 230 x̄: 24.90 x̃: 10 HURT stats (rel) min: <.01% max: 21.60% x̄: 1.45% x̃: 0.52% 95% mean confidence interval for cycles value: -7.61 -0.36 95% mean confidence interval for cycles %-change: -0.44% -0.02% Cycles are helped. No changes on Iron Lake or GM45. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-12-03 15:53:36 -08:00
if (try_emit_b2fi_of_inot(bld, result, instr))
break;
op[0].type = BRW_REGISTER_TYPE_D;
op[0].negate = !op[0].negate;
FALLTHROUGH;
case nir_op_i2f64:
case nir_op_i2i64:
case nir_op_u2f64:
case nir_op_u2u64:
case nir_op_f2f64:
case nir_op_f2i64:
case nir_op_f2u64:
case nir_op_i2i32:
case nir_op_u2u32:
case nir_op_f2i32:
case nir_op_f2u32:
case nir_op_i2f16:
case nir_op_u2f16:
case nir_op_f2i16:
case nir_op_f2u16:
case nir_op_f2i8:
case nir_op_f2u8:
if (result.type == BRW_REGISTER_TYPE_B ||
result.type == BRW_REGISTER_TYPE_UB ||
result.type == BRW_REGISTER_TYPE_HF)
assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
if (op[0].type == BRW_REGISTER_TYPE_B ||
op[0].type == BRW_REGISTER_TYPE_UB ||
op[0].type == BRW_REGISTER_TYPE_HF)
assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */
inst = bld.MOV(result, op[0]);
break;
case nir_op_i2i8:
case nir_op_u2u8:
assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
FALLTHROUGH;
case nir_op_i2i16:
case nir_op_u2u16: {
/* Emit better code for u2u8(extract_u8(a, b)) and similar patterns.
* Emitting the instructions one by one results in two MOV instructions
* that won't be propagated. By handling both instructions here, a
* single MOV is emitted.
*/
nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src);
if (extract_instr != NULL) {
if (extract_instr->op == nir_op_extract_u8 ||
extract_instr->op == nir_op_extract_i8) {
prepare_alu_destination_and_sources(bld, extract_instr, op, false);
const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
const brw_reg_type type =
brw_int_type(1, extract_instr->op == nir_op_extract_i8);
op[0] = subscript(op[0], type, byte);
} else if (extract_instr->op == nir_op_extract_u16 ||
extract_instr->op == nir_op_extract_i16) {
prepare_alu_destination_and_sources(bld, extract_instr, op, false);
const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
const brw_reg_type type =
brw_int_type(2, extract_instr->op == nir_op_extract_i16);
op[0] = subscript(op[0], type, word);
}
}
inst = bld.MOV(result, op[0]);
break;
}
case nir_op_fsat:
inst = bld.MOV(result, op[0]);
inst->saturate = true;
break;
case nir_op_fneg:
case nir_op_ineg:
op[0].negate = true;
inst = bld.MOV(result, op[0]);
break;
case nir_op_fabs:
case nir_op_iabs:
op[0].negate = false;
op[0].abs = true;
inst = bld.MOV(result, op[0]);
break;
case nir_op_f2f32:
if (nir_has_any_rounding_mode_enabled(execution_mode)) {
brw_rnd_mode rnd =
brw_rnd_mode_from_execution_mode(execution_mode);
bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
brw_imm_d(rnd));
}
if (op[0].type == BRW_REGISTER_TYPE_HF)
assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */
inst = bld.MOV(result, op[0]);
break;
case nir_op_fsign:
emit_fsign(bld, instr, result, op, 0);
break;
case nir_op_frcp:
inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
break;
case nir_op_fexp2:
inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
break;
case nir_op_flog2:
inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
break;
case nir_op_fsin:
inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
break;
case nir_op_fcos:
inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
break;
case nir_op_fddx_fine:
inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
break;
case nir_op_fddx:
case nir_op_fddx_coarse:
inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
break;
case nir_op_fddy_fine:
inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
break;
case nir_op_fddy:
case nir_op_fddy_coarse:
inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
break;
case nir_op_fadd:
if (nir_has_any_rounding_mode_enabled(execution_mode)) {
brw_rnd_mode rnd =
brw_rnd_mode_from_execution_mode(execution_mode);
bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
brw_imm_d(rnd));
}
FALLTHROUGH;
case nir_op_iadd:
inst = bld.ADD(result, op[0], op[1]);
break;
case nir_op_iadd3:
inst = bld.ADD3(result, op[0], op[1], op[2]);
break;
case nir_op_iadd_sat:
case nir_op_uadd_sat:
inst = bld.ADD(result, op[0], op[1]);
inst->saturate = true;
break;
case nir_op_isub_sat:
bld.emit(SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]);
break;
case nir_op_usub_sat:
bld.emit(SHADER_OPCODE_USUB_SAT, result, op[0], op[1]);
break;
case nir_op_irhadd:
case nir_op_urhadd:
assert(instr->def.bit_size < 64);
inst = bld.AVG(result, op[0], op[1]);
break;
case nir_op_ihadd:
case nir_op_uhadd: {
assert(instr->def.bit_size < 64);
fs_reg tmp = bld.vgrf(result.type);
if (devinfo->ver >= 8) {
op[0] = resolve_source_modifiers(bld, op[0]);
op[1] = resolve_source_modifiers(bld, op[1]);
}
/* AVG(x, y) - ((x ^ y) & 1) */
bld.XOR(tmp, op[0], op[1]);
bld.AND(tmp, tmp, retype(brw_imm_ud(1), result.type));
bld.AVG(result, op[0], op[1]);
inst = bld.ADD(result, result, tmp);
inst->src[1].negate = true;
break;
}
case nir_op_fmul:
intel/fs: Generate better code for fsign multiplied by a value v2: Rebase on v2 changes in previous two commits. v3: Rebase on 85c35885b38 ("nir: Rework nir_src_as_alu_instr to not take a pointer"). shader-db results: Skylake and Broadwell had similar results. (Skylake shown) total instructions in shared programs: 15297100 -> 15282141 (-0.10%) instructions in affected programs: 956685 -> 941726 (-1.56%) helped: 4527 HURT: 0 helped stats (abs) min: 1 max: 221 x̄: 3.30 x̃: 2 helped stats (rel) min: 0.07% max: 10.53% x̄: 1.85% x̃: 1.37% 95% mean confidence interval for instructions value: -3.48 -3.12 95% mean confidence interval for instructions %-change: -1.88% -1.81% Instructions are helped. total cycles in shared programs: 372809551 -> 372597886 (-0.06%) cycles in affected programs: 13645512 -> 13433847 (-1.55%) helped: 4362 HURT: 125 helped stats (abs) min: 1 max: 2088 x̄: 50.73 x̃: 28 helped stats (rel) min: 0.01% max: 28.20% x̄: 2.77% x̃: 2.39% HURT stats (abs) min: 1 max: 1836 x̄: 76.90 x̃: 28 HURT stats (rel) min: <.01% max: 34.36% x̄: 3.03% x̃: 1.42% 95% mean confidence interval for cycles value: -50.98 -43.37 95% mean confidence interval for cycles %-change: -2.67% -2.55% Cycles are helped. total spills in shared programs: 23465 -> 23463 (<.01%) spills in affected programs: 42 -> 40 (-4.76%) helped: 1 HURT: 0 total fills in shared programs: 31766 -> 31763 (<.01%) fills in affected programs: 69 -> 66 (-4.35%) helped: 1 HURT: 0 Haswell total instructions in shared programs: 13839992 -> 13828311 (-0.08%) instructions in affected programs: 712503 -> 700822 (-1.64%) helped: 3477 HURT: 0 helped stats (abs) min: 1 max: 221 x̄: 3.36 x̃: 2 helped stats (rel) min: 0.07% max: 10.64% x̄: 1.96% x̃: 1.52% 95% mean confidence interval for instructions value: -3.58 -3.14 95% mean confidence interval for instructions %-change: -2.01% -1.92% Instructions are helped. total cycles in shared programs: 387026330 -> 386872483 (-0.04%) cycles in affected programs: 11329966 -> 11176119 (-1.36%) helped: 3307 HURT: 139 helped stats (abs) min: 2 max: 1776 x̄: 49.58 x̃: 18 helped stats (rel) min: 0.01% max: 20.38% x̄: 2.27% x̃: 1.79% HURT stats (abs) min: 1 max: 2314 x̄: 72.68 x̃: 20 HURT stats (rel) min: <.01% max: 33.99% x̄: 2.28% x̃: 0.96% 95% mean confidence interval for cycles value: -49.31 -39.98 95% mean confidence interval for cycles %-change: -2.15% -2.01% Cycles are helped. LOST: 1 GAINED: 0 Ivy Bridge total instructions in shared programs: 12045602 -> 12038463 (-0.06%) instructions in affected programs: 623837 -> 616698 (-1.14%) helped: 2498 HURT: 0 helped stats (abs) min: 1 max: 39 x̄: 2.86 x̃: 2 helped stats (rel) min: 0.05% max: 10.00% x̄: 1.30% x̃: 1.05% 95% mean confidence interval for instructions value: -2.96 -2.75 95% mean confidence interval for instructions %-change: -1.34% -1.26% Instructions are helped. total cycles in shared programs: 181025675 -> 180891323 (-0.07%) cycles in affected programs: 11329329 -> 11194977 (-1.19%) helped: 2439 HURT: 47 helped stats (abs) min: 1 max: 1565 x̄: 57.06 x̃: 26 helped stats (rel) min: 0.02% max: 24.56% x̄: 2.02% x̃: 1.64% HURT stats (abs) min: 1 max: 1269 x̄: 102.51 x̃: 43 HURT stats (rel) min: 0.11% max: 52.94% x̄: 4.15% x̃: 1.34% 95% mean confidence interval for cycles value: -59.91 -48.17 95% mean confidence interval for cycles %-change: -1.99% -1.82% Cycles are helped. Sandy Bridge, Iron Lake, and GM45 had similar results. (Sandy Bridge shown) total instructions in shared programs: 10896368 -> 10896339 (<.01%) instructions in affected programs: 3767 -> 3738 (-0.77%) helped: 17 HURT: 0 helped stats (abs) min: 1 max: 4 x̄: 1.71 x̃: 1 helped stats (rel) min: 0.13% max: 9.52% x̄: 3.58% x̃: 2.73% 95% mean confidence interval for instructions value: -2.27 -1.14 95% mean confidence interval for instructions %-change: -5.14% -2.03% Instructions are helped. total cycles in shared programs: 155091109 -> 155091021 (<.01%) cycles in affected programs: 47241 -> 47153 (-0.19%) helped: 15 HURT: 8 helped stats (abs) min: 2 max: 81 x̄: 15.73 x̃: 4 helped stats (rel) min: 0.03% max: 10.59% x̄: 1.55% x̃: 0.71% HURT stats (abs) min: 14 max: 32 x̄: 18.50 x̃: 17 HURT stats (rel) min: 0.32% max: 2.79% x̄: 2.43% x̃: 2.71% 95% mean confidence interval for cycles value: -14.59 6.93 95% mean confidence interval for cycles %-change: -1.41% 1.08% Inconclusive result (value mean confidence interval includes 0). Reviewed-by: Matt Turner <mattst88@gmail.com> [v2]
2018-06-25 19:55:31 -07:00
for (unsigned i = 0; i < 2; i++) {
if (can_fuse_fmul_fsign(instr, i)) {
emit_fsign(bld, instr, result, op, i);
return;
}
}
/* We emit the rounding mode after the previous fsign optimization since
* it won't result in a MUL, but will try to negate the value by other
* means.
*/
if (nir_has_any_rounding_mode_enabled(execution_mode)) {
brw_rnd_mode rnd =
brw_rnd_mode_from_execution_mode(execution_mode);
bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
brw_imm_d(rnd));
}
inst = bld.MUL(result, op[0], op[1]);
break;
case nir_op_imul_2x32_64:
case nir_op_umul_2x32_64:
bld.MUL(result, op[0], op[1]);
break;
case nir_op_imul_32x16:
case nir_op_umul_32x16: {
const bool ud = instr->op == nir_op_umul_32x16;
const enum brw_reg_type word_type =
ud ? BRW_REGISTER_TYPE_UW : BRW_REGISTER_TYPE_W;
const enum brw_reg_type dword_type =
ud ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
assert(instr->def.bit_size == 32);
/* Before copy propagation there are no immediate values. */
assert(op[0].file != IMM && op[1].file != IMM);
op[1] = subscript(op[1], word_type, 0);
if (devinfo->ver >= 7)
bld.MUL(result, retype(op[0], dword_type), op[1]);
else
bld.MUL(result, op[1], retype(op[0], dword_type));
break;
}
case nir_op_imul:
assert(instr->def.bit_size < 64);
bld.MUL(result, op[0], op[1]);
break;
case nir_op_imul_high:
case nir_op_umul_high:
assert(instr->def.bit_size < 64);
if (instr->def.bit_size == 32) {
bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
} else {
fs_reg tmp = bld.vgrf(brw_reg_type_from_bit_size(32, op[0].type));
bld.MUL(tmp, op[0], op[1]);
bld.MOV(result, subscript(tmp, result.type, 1));
}
break;
case nir_op_idiv:
case nir_op_udiv:
assert(instr->def.bit_size < 64);
bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
break;
case nir_op_uadd_carry:
unreachable("Should have been lowered by carry_to_arith().");
case nir_op_usub_borrow:
unreachable("Should have been lowered by borrow_to_arith().");
case nir_op_umod:
case nir_op_irem:
/* According to the sign table for INT DIV in the Ivy Bridge PRM, it
* appears that our hardware just does the right thing for signed
* remainder.
*/
assert(instr->def.bit_size < 64);
bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
break;
case nir_op_imod: {
/* Get a regular C-style remainder. If a % b == 0, set the predicate. */
bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
/* Math instructions don't support conditional mod */
inst = bld.MOV(bld.null_reg_d(), result);
inst->conditional_mod = BRW_CONDITIONAL_NZ;
/* Now, we need to determine if signs of the sources are different.
* When we XOR the sources, the top bit is 0 if they are the same and 1
* if they are different. We can then use a conditional modifier to
* turn that into a predicate. This leads us to an XOR.l instruction.
*
* Technically, according to the PRM, you're not allowed to use .l on a
* XOR instruction. However, empirical experiments and Curro's reading
* of the simulator source both indicate that it's safe.
*/
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
inst = bld.XOR(tmp, op[0], op[1]);
inst->predicate = BRW_PREDICATE_NORMAL;
inst->conditional_mod = BRW_CONDITIONAL_L;
/* If the result of the initial remainder operation is non-zero and the
* two sources have different signs, add in a copy of op[1] to get the
* final integer modulus value.
*/
inst = bld.ADD(result, result, op[1]);
inst->predicate = BRW_PREDICATE_NORMAL;
break;
}
nir: Rename Boolean-related opcodes to include 32 in the name This is a squash of a bunch of individual changes: nir/builder: Generate 32-bit bool opcodes transparently nir/algebraic: Remap Boolean opcodes to the 32-bit variant Use 32-bit opcodes in the NIR producers and optimizations Generated with a little hand-editing and the following sed commands: sed -i 's/nir_op_ball_fequal/nir_op_b32all_fequal/g' **/*.c sed -i 's/nir_op_bany_fnequal/nir_op_b32any_fnequal/g' **/*.c sed -i 's/nir_op_ball_iequal/nir_op_b32all_iequal/g' **/*.c sed -i 's/nir_op_bany_inequal/nir_op_b32any_inequal/g' **/*.c sed -i 's/nir_op_\([fiu]lt\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]ge\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]ne\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]eq\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fi]\)ne32g/nir_op_\1neg/g' **/*.c sed -i 's/nir_op_bcsel/nir_op_b32csel/g' **/*.c Use 32-bit opcodes in the NIR back-ends Generated with a little hand-editing and the following sed commands: sed -i 's/nir_op_ball_fequal/nir_op_b32all_fequal/g' **/*.c sed -i 's/nir_op_bany_fnequal/nir_op_b32any_fnequal/g' **/*.c sed -i 's/nir_op_ball_iequal/nir_op_b32all_iequal/g' **/*.c sed -i 's/nir_op_bany_inequal/nir_op_b32any_inequal/g' **/*.c sed -i 's/nir_op_\([fiu]lt\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]ge\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]ne\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]eq\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fi]\)ne32g/nir_op_\1neg/g' **/*.c sed -i 's/nir_op_bcsel/nir_op_b32csel/g' **/*.c Reviewed-by: Eric Anholt <eric@anholt.net> Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> Tested-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
2018-10-18 11:44:38 -05:00
case nir_op_flt32:
case nir_op_fge32:
case nir_op_feq32:
case nir_op_fneu32: {
fs_reg dest = result;
const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
if (bit_size != 32) {
dest = bld.vgrf(op[0].type, 1);
bld.UNDEF(dest);
}
bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op));
if (bit_size > 32) {
bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
} else if(bit_size < 32) {
/* When we convert the result to 32-bit we need to be careful and do
* it as a signed conversion to get sign extension (for 32-bit true)
*/
const brw_reg_type src_type =
brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
}
break;
}
nir: Rename Boolean-related opcodes to include 32 in the name This is a squash of a bunch of individual changes: nir/builder: Generate 32-bit bool opcodes transparently nir/algebraic: Remap Boolean opcodes to the 32-bit variant Use 32-bit opcodes in the NIR producers and optimizations Generated with a little hand-editing and the following sed commands: sed -i 's/nir_op_ball_fequal/nir_op_b32all_fequal/g' **/*.c sed -i 's/nir_op_bany_fnequal/nir_op_b32any_fnequal/g' **/*.c sed -i 's/nir_op_ball_iequal/nir_op_b32all_iequal/g' **/*.c sed -i 's/nir_op_bany_inequal/nir_op_b32any_inequal/g' **/*.c sed -i 's/nir_op_\([fiu]lt\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]ge\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]ne\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]eq\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fi]\)ne32g/nir_op_\1neg/g' **/*.c sed -i 's/nir_op_bcsel/nir_op_b32csel/g' **/*.c Use 32-bit opcodes in the NIR back-ends Generated with a little hand-editing and the following sed commands: sed -i 's/nir_op_ball_fequal/nir_op_b32all_fequal/g' **/*.c sed -i 's/nir_op_bany_fnequal/nir_op_b32any_fnequal/g' **/*.c sed -i 's/nir_op_ball_iequal/nir_op_b32all_iequal/g' **/*.c sed -i 's/nir_op_bany_inequal/nir_op_b32any_inequal/g' **/*.c sed -i 's/nir_op_\([fiu]lt\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]ge\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]ne\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]eq\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fi]\)ne32g/nir_op_\1neg/g' **/*.c sed -i 's/nir_op_bcsel/nir_op_b32csel/g' **/*.c Reviewed-by: Eric Anholt <eric@anholt.net> Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> Tested-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
2018-10-18 11:44:38 -05:00
case nir_op_ilt32:
case nir_op_ult32:
case nir_op_ige32:
case nir_op_uge32:
case nir_op_ieq32:
case nir_op_ine32: {
fs_reg dest = result;
intel/nir: Lower 8-bit ops to 16-bit in NIR on Gen11+ Intel hardware supports 8-bit arithmetic but it's tricky and annoying: - Byte operations don't actually execute with a byte type. The execution type for byte operations is actually word. (I don't know if this has implications for the HW implementation. Probably?) - Destinations are required to be strided out to at least the execution type size. This means that B-type operations always have a stride of at least 2. This means wreaks havoc on the back-end in multiple ways. - Thanks to the strided destination, we don't actually save register space by storing things in bytes. We could, in theory, interleave two byte values into a single 2B-strided register but that's both a pain for RA and would lead to piles of false dependencies pre-Gen12 and on Gen12+, we'd need some significant improvements to the SWSB pass. - Also thanks to the strided destination, all byte writes are treated as partial writes by the back-end and we don't know how to copy-prop them. - On Gen11, they added a new hardware restriction that byte types aren't allowed in the 2nd and 3rd sources of instructions. This means that we have to emit B->W conversions all over to resolve things. If we emit said conversions in NIR, instead, there's a chance NIR can get rid of some of them for us. We can get rid of a lot of this pain by just asking NIR to get rid of 8-bit arithmetic for us. It may lead to a few more conversions in some cases but having back-end copy-prop actually work is probably a bigger bonus. There is still a bit we have to handle in the back-end. In particular, basic MOVs and conversions because 8-bit load/store ops still require 8-bit types. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7482>
2020-11-05 23:19:31 -06:00
const uint32_t bit_size = type_sz(op[0].type) * 8;
if (bit_size != 32) {
intel/nir: Lower 8-bit ops to 16-bit in NIR on Gen11+ Intel hardware supports 8-bit arithmetic but it's tricky and annoying: - Byte operations don't actually execute with a byte type. The execution type for byte operations is actually word. (I don't know if this has implications for the HW implementation. Probably?) - Destinations are required to be strided out to at least the execution type size. This means that B-type operations always have a stride of at least 2. This means wreaks havoc on the back-end in multiple ways. - Thanks to the strided destination, we don't actually save register space by storing things in bytes. We could, in theory, interleave two byte values into a single 2B-strided register but that's both a pain for RA and would lead to piles of false dependencies pre-Gen12 and on Gen12+, we'd need some significant improvements to the SWSB pass. - Also thanks to the strided destination, all byte writes are treated as partial writes by the back-end and we don't know how to copy-prop them. - On Gen11, they added a new hardware restriction that byte types aren't allowed in the 2nd and 3rd sources of instructions. This means that we have to emit B->W conversions all over to resolve things. If we emit said conversions in NIR, instead, there's a chance NIR can get rid of some of them for us. We can get rid of a lot of this pain by just asking NIR to get rid of 8-bit arithmetic for us. It may lead to a few more conversions in some cases but having back-end copy-prop actually work is probably a bigger bonus. There is still a bit we have to handle in the back-end. In particular, basic MOVs and conversions because 8-bit load/store ops still require 8-bit types. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7482>
2020-11-05 23:19:31 -06:00
dest = bld.vgrf(op[0].type, 1);
bld.UNDEF(dest);
}
intel/nir: Lower 8-bit ops to 16-bit in NIR on Gen11+ Intel hardware supports 8-bit arithmetic but it's tricky and annoying: - Byte operations don't actually execute with a byte type. The execution type for byte operations is actually word. (I don't know if this has implications for the HW implementation. Probably?) - Destinations are required to be strided out to at least the execution type size. This means that B-type operations always have a stride of at least 2. This means wreaks havoc on the back-end in multiple ways. - Thanks to the strided destination, we don't actually save register space by storing things in bytes. We could, in theory, interleave two byte values into a single 2B-strided register but that's both a pain for RA and would lead to piles of false dependencies pre-Gen12 and on Gen12+, we'd need some significant improvements to the SWSB pass. - Also thanks to the strided destination, all byte writes are treated as partial writes by the back-end and we don't know how to copy-prop them. - On Gen11, they added a new hardware restriction that byte types aren't allowed in the 2nd and 3rd sources of instructions. This means that we have to emit B->W conversions all over to resolve things. If we emit said conversions in NIR, instead, there's a chance NIR can get rid of some of them for us. We can get rid of a lot of this pain by just asking NIR to get rid of 8-bit arithmetic for us. It may lead to a few more conversions in some cases but having back-end copy-prop actually work is probably a bigger bonus. There is still a bit we have to handle in the back-end. In particular, basic MOVs and conversions because 8-bit load/store ops still require 8-bit types. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7482>
2020-11-05 23:19:31 -06:00
bld.CMP(dest, op[0], op[1],
brw_cmod_for_nir_comparison(instr->op));
if (bit_size > 32) {
bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
} else if (bit_size < 32) {
/* When we convert the result to 32-bit we need to be careful and do
* it as a signed conversion to get sign extension (for 32-bit true)
*/
const brw_reg_type src_type =
brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
}
break;
}
case nir_op_inot:
if (devinfo->ver >= 8) {
nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);
intel/fs: Use De Morgan's laws to avoid logical-not of a logic result on Gen8+ Instead of emitting ~(a & b), emit (~a | ~b) since logical-not of operands is free on Gen8+. v2: Fix swizzles. Fix types for cmod propagation. v3: Simplify logic for inverting source of inot(ixor(a, b)). Suggested by Ken. Skylake and Broadwell had similar results. (Skylake shown) Skylake total instructions in shared programs: 15185593 -> 15185583 (<.01%) instructions in affected programs: 5673 -> 5663 (-0.18%) helped: 12 HURT: 1 helped stats (abs) min: 1 max: 2 x̄: 1.17 x̃: 1 helped stats (rel) min: 0.30% max: 5.88% x̄: 1.50% x̃: 0.70% HURT stats (abs) min: 4 max: 4 x̄: 4.00 x̃: 4 HURT stats (rel) min: 0.12% max: 0.12% x̄: 0.12% x̃: 0.12% 95% mean confidence interval for instructions value: -1.66 0.13 95% mean confidence interval for instructions %-change: -2.60% -0.15% Inconclusive result (value mean confidence interval includes 0). total cycles in shared programs: 370977726 -> 370964249 (<.01%) cycles in affected programs: 869987 -> 856510 (-1.55%) helped: 15 HURT: 2 helped stats (abs) min: 2 max: 6640 x̄: 902.20 x̃: 16 helped stats (rel) min: <.01% max: 4.92% x̄: 1.71% x̃: 1.53% HURT stats (abs) min: 14 max: 42 x̄: 28.00 x̃: 28 HURT stats (rel) min: 1.08% max: 3.18% x̄: 2.13% x̃: 2.13% 95% mean confidence interval for cycles value: -1654.87 69.34 95% mean confidence interval for cycles %-change: -2.29% -0.23% Inconclusive result (value mean confidence interval includes 0). Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2017-02-09 15:21:47 +00:00
if (inot_src_instr != NULL &&
(inot_src_instr->op == nir_op_ior ||
inot_src_instr->op == nir_op_ixor ||
intel/compiler: Drop nir_lower_to_source_mods() and related handling. I think we're unanimous in wanting to drop nir_lower_to_source_mods. It's a bit of complexity to handle in the backend, but perhaps more importantly, would be even more complexity to handle in nir_search. And, it turns out that since we made other compiler improvements in the last few years, they no longer appear to buy us anything of value. Summarizing the results from shader-db from this patch: - Icelake (scalar mode) Instruction counts: - 411 helped, 598 hurt (out of 139,470 shaders) - 99.2% of shaders remain unaffected. The average increase in instruction count in hurt programs is 1.78 instructions. - total instructions in shared programs: 17214951 -> 17215206 (<.01%) - instructions in affected programs: 1143879 -> 1144134 (0.02%) Cycles: - 1042 helped, 1357 hurt - total cycles in shared programs: 365613294 -> 365882263 (0.07%) - cycles in affected programs: 138155497 -> 138424466 (0.19%) - Haswell (both scalar and vector modes) Instruction counts: - 73 helped, 1680 hurt (out of 139,470 shaders) - 98.7% of shaders remain unaffected. The average increase in instruction count in hurt programs is 1.9 instructions. - total instructions in shared programs: 14199527 -> 14202262 (0.02%) - instructions in affected programs: 446499 -> 449234 (0.61%) Cycles: - 5253 helped, 5559 hurt - total cycles in shared programs: 359996545 -> 360038731 (0.01%) - cycles in affected programs: 155897127 -> 155939313 (0.03%) Given that ~99% of shader-db remains unaffected, and the affected programs are hurt by about 1-2 instructions - which are all cheap ALU instructions - this is unlikely to be measurable in terms of any real performance impact that would affect users. So, drop them and simplify the backend, and hopefully enable other future simplifications in NIR. Reviewed-by: Eric Anholt <eric@anholt.net> [v1] Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4616>
2020-04-18 01:20:42 -07:00
inot_src_instr->op == nir_op_iand)) {
intel/fs: Use De Morgan's laws to avoid logical-not of a logic result on Gen8+ Instead of emitting ~(a & b), emit (~a | ~b) since logical-not of operands is free on Gen8+. v2: Fix swizzles. Fix types for cmod propagation. v3: Simplify logic for inverting source of inot(ixor(a, b)). Suggested by Ken. Skylake and Broadwell had similar results. (Skylake shown) Skylake total instructions in shared programs: 15185593 -> 15185583 (<.01%) instructions in affected programs: 5673 -> 5663 (-0.18%) helped: 12 HURT: 1 helped stats (abs) min: 1 max: 2 x̄: 1.17 x̃: 1 helped stats (rel) min: 0.30% max: 5.88% x̄: 1.50% x̃: 0.70% HURT stats (abs) min: 4 max: 4 x̄: 4.00 x̃: 4 HURT stats (rel) min: 0.12% max: 0.12% x̄: 0.12% x̃: 0.12% 95% mean confidence interval for instructions value: -1.66 0.13 95% mean confidence interval for instructions %-change: -2.60% -0.15% Inconclusive result (value mean confidence interval includes 0). total cycles in shared programs: 370977726 -> 370964249 (<.01%) cycles in affected programs: 869987 -> 856510 (-1.55%) helped: 15 HURT: 2 helped stats (abs) min: 2 max: 6640 x̄: 902.20 x̃: 16 helped stats (rel) min: <.01% max: 4.92% x̄: 1.71% x̃: 1.53% HURT stats (abs) min: 14 max: 42 x̄: 28.00 x̃: 28 HURT stats (rel) min: 1.08% max: 3.18% x̄: 2.13% x̃: 2.13% 95% mean confidence interval for cycles value: -1654.87 69.34 95% mean confidence interval for cycles %-change: -2.29% -0.23% Inconclusive result (value mean confidence interval includes 0). Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2017-02-09 15:21:47 +00:00
/* The sources of the source logical instruction are now the
* sources of the instruction that will be generated.
*/
prepare_alu_destination_and_sources(bld, inot_src_instr, op, false);
resolve_inot_sources(bld, inot_src_instr, op);
/* Smash all of the sources and destination to be signed. This
* doesn't matter for the operation of the instruction, but cmod
* propagation fails on unsigned sources with negation (due to
* fs_inst::can_do_cmod returning false).
*/
result.type =
brw_type_for_nir_type(devinfo,
(nir_alu_type)(nir_type_int |
instr->def.bit_size));
intel/fs: Use De Morgan's laws to avoid logical-not of a logic result on Gen8+ Instead of emitting ~(a & b), emit (~a | ~b) since logical-not of operands is free on Gen8+. v2: Fix swizzles. Fix types for cmod propagation. v3: Simplify logic for inverting source of inot(ixor(a, b)). Suggested by Ken. Skylake and Broadwell had similar results. (Skylake shown) Skylake total instructions in shared programs: 15185593 -> 15185583 (<.01%) instructions in affected programs: 5673 -> 5663 (-0.18%) helped: 12 HURT: 1 helped stats (abs) min: 1 max: 2 x̄: 1.17 x̃: 1 helped stats (rel) min: 0.30% max: 5.88% x̄: 1.50% x̃: 0.70% HURT stats (abs) min: 4 max: 4 x̄: 4.00 x̃: 4 HURT stats (rel) min: 0.12% max: 0.12% x̄: 0.12% x̃: 0.12% 95% mean confidence interval for instructions value: -1.66 0.13 95% mean confidence interval for instructions %-change: -2.60% -0.15% Inconclusive result (value mean confidence interval includes 0). total cycles in shared programs: 370977726 -> 370964249 (<.01%) cycles in affected programs: 869987 -> 856510 (-1.55%) helped: 15 HURT: 2 helped stats (abs) min: 2 max: 6640 x̄: 902.20 x̃: 16 helped stats (rel) min: <.01% max: 4.92% x̄: 1.71% x̃: 1.53% HURT stats (abs) min: 14 max: 42 x̄: 28.00 x̃: 28 HURT stats (rel) min: 1.08% max: 3.18% x̄: 2.13% x̃: 2.13% 95% mean confidence interval for cycles value: -1654.87 69.34 95% mean confidence interval for cycles %-change: -2.29% -0.23% Inconclusive result (value mean confidence interval includes 0). Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2017-02-09 15:21:47 +00:00
op[0].type =
brw_type_for_nir_type(devinfo,
(nir_alu_type)(nir_type_int |
nir_src_bit_size(inot_src_instr->src[0].src)));
op[1].type =
brw_type_for_nir_type(devinfo,
(nir_alu_type)(nir_type_int |
nir_src_bit_size(inot_src_instr->src[1].src)));
/* For XOR, only invert one of the sources. Arbitrarily choose
* the first source.
*/
op[0].negate = !op[0].negate;
if (inot_src_instr->op != nir_op_ixor)
op[1].negate = !op[1].negate;
switch (inot_src_instr->op) {
case nir_op_ior:
bld.AND(result, op[0], op[1]);
return;
case nir_op_iand:
bld.OR(result, op[0], op[1]);
return;
case nir_op_ixor:
bld.XOR(result, op[0], op[1]);
return;
default:
unreachable("impossible opcode");
}
}
op[0] = resolve_source_modifiers(bld, op[0]);
}
bld.NOT(result, op[0]);
break;
case nir_op_ixor:
if (devinfo->ver >= 8) {
intel/fs: Emit logical-not of operands on Gen8+ On Gen8+ specifying negation of a logical operation such as AND actually performs a logical-not. Take advantage of this to generate fewer instructions. v2: Major rebase. Use nir_src_as_alu_instr. Fix swizzle handling. No changes on any pre-Gen8 platform. Skylake and Broadwell had similar results. (Broadwell shown) total instructions in shared programs: 15466902 -> 15466274 (<.01%) instructions in affected programs: 1262953 -> 1262325 (-0.05%) helped: 682 HURT: 4 helped stats (abs) min: 1 max: 5 x̄: 1.02 x̃: 1 helped stats (rel) min: 0.03% max: 2.40% x̄: 0.18% x̃: 0.04% HURT stats (abs) min: 1 max: 62 x̄: 17.50 x̃: 3 HURT stats (rel) min: 0.03% max: 1.89% x̄: 0.53% x̃: 0.10% 95% mean confidence interval for instructions value: -1.10 -0.73 95% mean confidence interval for instructions %-change: -0.19% -0.15% Instructions are helped. total cycles in shared programs: 410996093 -> 410950440 (-0.01%) cycles in affected programs: 144389048 -> 144343395 (-0.03%) helped: 519 HURT: 51 helped stats (abs) min: 1 max: 1060 x̄: 104.46 x̃: 140 helped stats (rel) min: 0.01% max: 10.98% x̄: 0.34% x̃: 0.03% HURT stats (abs) min: 1 max: 4060 x̄: 167.90 x̃: 22 HURT stats (rel) min: <.01% max: 8.20% x̄: 0.96% x̃: 0.25% 95% mean confidence interval for cycles value: -97.16 -63.02 95% mean confidence interval for cycles %-change: -0.32% -0.13% Cycles are helped. total spills in shared programs: 95311 -> 95329 (0.02%) spills in affected programs: 881 -> 899 (2.04%) helped: 0 HURT: 4 total fills in shared programs: 93629 -> 93634 (<.01%) fills in affected programs: 794 -> 799 (0.63%) helped: 1 HURT: 2 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2017-02-09 15:20:04 +00:00
resolve_inot_sources(bld, instr, op);
}
bld.XOR(result, op[0], op[1]);
break;
case nir_op_ior:
if (devinfo->ver >= 8) {
intel/fs: Emit logical-not of operands on Gen8+ On Gen8+ specifying negation of a logical operation such as AND actually performs a logical-not. Take advantage of this to generate fewer instructions. v2: Major rebase. Use nir_src_as_alu_instr. Fix swizzle handling. No changes on any pre-Gen8 platform. Skylake and Broadwell had similar results. (Broadwell shown) total instructions in shared programs: 15466902 -> 15466274 (<.01%) instructions in affected programs: 1262953 -> 1262325 (-0.05%) helped: 682 HURT: 4 helped stats (abs) min: 1 max: 5 x̄: 1.02 x̃: 1 helped stats (rel) min: 0.03% max: 2.40% x̄: 0.18% x̃: 0.04% HURT stats (abs) min: 1 max: 62 x̄: 17.50 x̃: 3 HURT stats (rel) min: 0.03% max: 1.89% x̄: 0.53% x̃: 0.10% 95% mean confidence interval for instructions value: -1.10 -0.73 95% mean confidence interval for instructions %-change: -0.19% -0.15% Instructions are helped. total cycles in shared programs: 410996093 -> 410950440 (-0.01%) cycles in affected programs: 144389048 -> 144343395 (-0.03%) helped: 519 HURT: 51 helped stats (abs) min: 1 max: 1060 x̄: 104.46 x̃: 140 helped stats (rel) min: 0.01% max: 10.98% x̄: 0.34% x̃: 0.03% HURT stats (abs) min: 1 max: 4060 x̄: 167.90 x̃: 22 HURT stats (rel) min: <.01% max: 8.20% x̄: 0.96% x̃: 0.25% 95% mean confidence interval for cycles value: -97.16 -63.02 95% mean confidence interval for cycles %-change: -0.32% -0.13% Cycles are helped. total spills in shared programs: 95311 -> 95329 (0.02%) spills in affected programs: 881 -> 899 (2.04%) helped: 0 HURT: 4 total fills in shared programs: 93629 -> 93634 (<.01%) fills in affected programs: 794 -> 799 (0.63%) helped: 1 HURT: 2 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2017-02-09 15:20:04 +00:00
resolve_inot_sources(bld, instr, op);
}
bld.OR(result, op[0], op[1]);
break;
case nir_op_iand:
if (devinfo->ver >= 8) {
intel/fs: Emit logical-not of operands on Gen8+ On Gen8+ specifying negation of a logical operation such as AND actually performs a logical-not. Take advantage of this to generate fewer instructions. v2: Major rebase. Use nir_src_as_alu_instr. Fix swizzle handling. No changes on any pre-Gen8 platform. Skylake and Broadwell had similar results. (Broadwell shown) total instructions in shared programs: 15466902 -> 15466274 (<.01%) instructions in affected programs: 1262953 -> 1262325 (-0.05%) helped: 682 HURT: 4 helped stats (abs) min: 1 max: 5 x̄: 1.02 x̃: 1 helped stats (rel) min: 0.03% max: 2.40% x̄: 0.18% x̃: 0.04% HURT stats (abs) min: 1 max: 62 x̄: 17.50 x̃: 3 HURT stats (rel) min: 0.03% max: 1.89% x̄: 0.53% x̃: 0.10% 95% mean confidence interval for instructions value: -1.10 -0.73 95% mean confidence interval for instructions %-change: -0.19% -0.15% Instructions are helped. total cycles in shared programs: 410996093 -> 410950440 (-0.01%) cycles in affected programs: 144389048 -> 144343395 (-0.03%) helped: 519 HURT: 51 helped stats (abs) min: 1 max: 1060 x̄: 104.46 x̃: 140 helped stats (rel) min: 0.01% max: 10.98% x̄: 0.34% x̃: 0.03% HURT stats (abs) min: 1 max: 4060 x̄: 167.90 x̃: 22 HURT stats (rel) min: <.01% max: 8.20% x̄: 0.96% x̃: 0.25% 95% mean confidence interval for cycles value: -97.16 -63.02 95% mean confidence interval for cycles %-change: -0.32% -0.13% Cycles are helped. total spills in shared programs: 95311 -> 95329 (0.02%) spills in affected programs: 881 -> 899 (2.04%) helped: 0 HURT: 4 total fills in shared programs: 93629 -> 93634 (<.01%) fills in affected programs: 794 -> 799 (0.63%) helped: 1 HURT: 2 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2017-02-09 15:20:04 +00:00
resolve_inot_sources(bld, instr, op);
}
bld.AND(result, op[0], op[1]);
break;
case nir_op_fdot2:
case nir_op_fdot3:
case nir_op_fdot4:
nir: Rename Boolean-related opcodes to include 32 in the name This is a squash of a bunch of individual changes: nir/builder: Generate 32-bit bool opcodes transparently nir/algebraic: Remap Boolean opcodes to the 32-bit variant Use 32-bit opcodes in the NIR producers and optimizations Generated with a little hand-editing and the following sed commands: sed -i 's/nir_op_ball_fequal/nir_op_b32all_fequal/g' **/*.c sed -i 's/nir_op_bany_fnequal/nir_op_b32any_fnequal/g' **/*.c sed -i 's/nir_op_ball_iequal/nir_op_b32all_iequal/g' **/*.c sed -i 's/nir_op_bany_inequal/nir_op_b32any_inequal/g' **/*.c sed -i 's/nir_op_\([fiu]lt\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]ge\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]ne\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]eq\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fi]\)ne32g/nir_op_\1neg/g' **/*.c sed -i 's/nir_op_bcsel/nir_op_b32csel/g' **/*.c Use 32-bit opcodes in the NIR back-ends Generated with a little hand-editing and the following sed commands: sed -i 's/nir_op_ball_fequal/nir_op_b32all_fequal/g' **/*.c sed -i 's/nir_op_bany_fnequal/nir_op_b32any_fnequal/g' **/*.c sed -i 's/nir_op_ball_iequal/nir_op_b32all_iequal/g' **/*.c sed -i 's/nir_op_bany_inequal/nir_op_b32any_inequal/g' **/*.c sed -i 's/nir_op_\([fiu]lt\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]ge\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]ne\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]eq\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fi]\)ne32g/nir_op_\1neg/g' **/*.c sed -i 's/nir_op_bcsel/nir_op_b32csel/g' **/*.c Reviewed-by: Eric Anholt <eric@anholt.net> Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> Tested-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
2018-10-18 11:44:38 -05:00
case nir_op_b32all_fequal2:
case nir_op_b32all_iequal2:
case nir_op_b32all_fequal3:
case nir_op_b32all_iequal3:
case nir_op_b32all_fequal4:
case nir_op_b32all_iequal4:
case nir_op_b32any_fnequal2:
case nir_op_b32any_inequal2:
case nir_op_b32any_fnequal3:
case nir_op_b32any_inequal3:
case nir_op_b32any_fnequal4:
case nir_op_b32any_inequal4:
unreachable("Lowered by nir_lower_alu_reductions");
case nir_op_ldexp:
unreachable("not reached: should be handled by ldexp_to_arith()");
case nir_op_fsqrt:
inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
break;
case nir_op_frsq:
inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
break;
case nir_op_ftrunc:
inst = bld.RNDZ(result, op[0]);
if (devinfo->ver < 6) {
intel/compiler: Move Gen4/5 rounding to visitor Gen4/5's rounding instructions operate differently than later Gens'. They all return the floor of the input and the "Round-increment" conditional modifier answers whether the result should be incremented by 1.0 to get the appropriate result for the operation (and thus its behavior is determined by the round opcode; e.g., RNDZ vs RNDE). Since this requires a second instruciton (a predicated ADD) that consumes the result of the round instruction, the round instruction cannot write its result directly to the (write-only) message registers. By emitting the ADD in the generator, the backend thinks it's safe to store the round's result directly to the message register file. To avoid this, we move the emission of the ADD instruction to the NIR translator so that the backend has the information it needs. I suspect this also fixes code generated for RNDZ.SAT but since Gen4/5 don't support GLSL 1.30 which adds the trunc() function, I couldn't write a piglit test to confirm. My thinking is that if x=-0.5: sat(trunc(-0.5)) = 0.0 But on Gen4/5 where sat(trunc(x)) is implemented as rndz.r.f0 result, x // result = floor(x) // set f0 if increment needed (+f0) add result, result, 1.0 // fixup so result = trunc(x) then putting saturate on both instructions will give the wrong result. floor(-0.5) = -1.0 sat(floor(-0.5)) = 0.0 // +1 increment would be needed since floor(-0.5) != trunc(-0.5) sat(sat(floor(-0.5)) + 1.0) = 1.0 Fixes: 6f394343b1f ("nir/algebraic: i2f(f2i()) -> trunc()") Closes: https://gitlab.freedesktop.org/mesa/mesa/issues/2355 Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Tested-by: Marge Bot <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3459> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3459>
2020-01-16 11:17:14 -08:00
set_condmod(BRW_CONDITIONAL_R, inst);
set_predicate(BRW_PREDICATE_NORMAL,
bld.ADD(result, result, brw_imm_f(1.0f)));
inst = bld.MOV(result, result); /* for potential saturation */
}
break;
case nir_op_fceil: {
op[0].negate = !op[0].negate;
fs_reg temp = vgrf(glsl_type::float_type);
bld.RNDD(temp, op[0]);
temp.negate = true;
inst = bld.MOV(result, temp);
break;
}
case nir_op_ffloor:
inst = bld.RNDD(result, op[0]);
break;
case nir_op_ffract:
inst = bld.FRC(result, op[0]);
break;
case nir_op_fround_even:
inst = bld.RNDE(result, op[0]);
if (devinfo->ver < 6) {
intel/compiler: Move Gen4/5 rounding to visitor Gen4/5's rounding instructions operate differently than later Gens'. They all return the floor of the input and the "Round-increment" conditional modifier answers whether the result should be incremented by 1.0 to get the appropriate result for the operation (and thus its behavior is determined by the round opcode; e.g., RNDZ vs RNDE). Since this requires a second instruciton (a predicated ADD) that consumes the result of the round instruction, the round instruction cannot write its result directly to the (write-only) message registers. By emitting the ADD in the generator, the backend thinks it's safe to store the round's result directly to the message register file. To avoid this, we move the emission of the ADD instruction to the NIR translator so that the backend has the information it needs. I suspect this also fixes code generated for RNDZ.SAT but since Gen4/5 don't support GLSL 1.30 which adds the trunc() function, I couldn't write a piglit test to confirm. My thinking is that if x=-0.5: sat(trunc(-0.5)) = 0.0 But on Gen4/5 where sat(trunc(x)) is implemented as rndz.r.f0 result, x // result = floor(x) // set f0 if increment needed (+f0) add result, result, 1.0 // fixup so result = trunc(x) then putting saturate on both instructions will give the wrong result. floor(-0.5) = -1.0 sat(floor(-0.5)) = 0.0 // +1 increment would be needed since floor(-0.5) != trunc(-0.5) sat(sat(floor(-0.5)) + 1.0) = 1.0 Fixes: 6f394343b1f ("nir/algebraic: i2f(f2i()) -> trunc()") Closes: https://gitlab.freedesktop.org/mesa/mesa/issues/2355 Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Tested-by: Marge Bot <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3459> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3459>
2020-01-16 11:17:14 -08:00
set_condmod(BRW_CONDITIONAL_R, inst);
set_predicate(BRW_PREDICATE_NORMAL,
bld.ADD(result, result, brw_imm_f(1.0f)));
inst = bld.MOV(result, result); /* for potential saturation */
}
break;
case nir_op_fquantize2f16: {
fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
/* The destination stride must be at least as big as the source stride. */
tmp16 = subscript(tmp16, BRW_REGISTER_TYPE_HF, 0);
/* Check for denormal */
fs_reg abs_src0 = op[0];
abs_src0.abs = true;
bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
BRW_CONDITIONAL_L);
/* Get the appropriately signed zero */
bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
retype(op[0], BRW_REGISTER_TYPE_UD),
brw_imm_ud(0x80000000));
/* Do the actual F32 -> F16 -> F32 conversion */
bld.F32TO16(tmp16, op[0]);
bld.F16TO32(tmp32, tmp16);
/* Select that or zero based on normal status */
inst = bld.SEL(result, zero, tmp32);
inst->predicate = BRW_PREDICATE_NORMAL;
break;
}
case nir_op_imin:
case nir_op_umin:
case nir_op_fmin:
inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
break;
case nir_op_imax:
case nir_op_umax:
case nir_op_fmax:
inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
break;
case nir_op_pack_snorm_2x16:
case nir_op_pack_snorm_4x8:
case nir_op_pack_unorm_2x16:
case nir_op_pack_unorm_4x8:
case nir_op_unpack_snorm_2x16:
case nir_op_unpack_snorm_4x8:
case nir_op_unpack_unorm_2x16:
case nir_op_unpack_unorm_4x8:
case nir_op_unpack_half_2x16:
case nir_op_pack_half_2x16:
unreachable("not reached: should be handled by lower_packing_builtins");
case nir_op_unpack_half_2x16_split_x_flush_to_zero:
assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
FALLTHROUGH;
case nir_op_unpack_half_2x16_split_x:
inst = bld.F16TO32(result, subscript(op[0], BRW_REGISTER_TYPE_HF, 0));
break;
case nir_op_unpack_half_2x16_split_y_flush_to_zero:
assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
FALLTHROUGH;
case nir_op_unpack_half_2x16_split_y:
inst = bld.F16TO32(result, subscript(op[0], BRW_REGISTER_TYPE_HF, 1));
break;
case nir_op_pack_64_2x32_split:
case nir_op_pack_32_2x16_split:
bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
break;
case nir_op_pack_32_4x8_split:
bld.emit(FS_OPCODE_PACK, result, op, 4);
break;
case nir_op_unpack_64_2x32_split_x:
case nir_op_unpack_64_2x32_split_y: {
if (instr->op == nir_op_unpack_64_2x32_split_x)
bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
else
bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
break;
}
case nir_op_unpack_32_2x16_split_x:
case nir_op_unpack_32_2x16_split_y: {
if (instr->op == nir_op_unpack_32_2x16_split_x)
bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
else
bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
break;
}
case nir_op_fpow:
inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
break;
case nir_op_bitfield_reverse:
assert(instr->def.bit_size == 32);
assert(nir_src_bit_size(instr->src[0].src) == 32);
bld.BFREV(result, op[0]);
break;
case nir_op_bit_count:
assert(instr->def.bit_size == 32);
assert(nir_src_bit_size(instr->src[0].src) < 64);
bld.CBIT(result, op[0]);
break;
case nir_op_uclz:
assert(instr->def.bit_size == 32);
assert(nir_src_bit_size(instr->src[0].src) == 32);
bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
break;
case nir_op_ifind_msb: {
assert(instr->def.bit_size == 32);
assert(nir_src_bit_size(instr->src[0].src) == 32);
assert(devinfo->ver >= 7);
bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
/* FBH counts from the MSB side, while GLSL's findMSB() wants the count
* from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
* subtract the result from 31 to convert the MSB count into an LSB
* count.
*/
bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
inst = bld.ADD(result, result, brw_imm_d(31));
inst->predicate = BRW_PREDICATE_NORMAL;
inst->src[0].negate = true;
break;
}
case nir_op_find_lsb:
assert(instr->def.bit_size == 32);
assert(nir_src_bit_size(instr->src[0].src) == 32);
assert(devinfo->ver >= 7);
bld.FBL(result, op[0]);
break;
case nir_op_ubitfield_extract:
case nir_op_ibitfield_extract:
unreachable("should have been lowered");
case nir_op_ubfe:
case nir_op_ibfe:
assert(instr->def.bit_size < 64);
bld.BFE(result, op[2], op[1], op[0]);
break;
case nir_op_bfm:
assert(instr->def.bit_size < 64);
bld.BFI1(result, op[0], op[1]);
break;
case nir_op_bfi:
assert(instr->def.bit_size < 64);
intel/fs: Emit better code for bfi(..., 0) DG2, Tiger Lake, Ice Lake, and Skylake had similar results (Ice Lake shown) total instructions in shared programs: 20570141 -> 20570063 (<.01%) instructions in affected programs: 30679 -> 30601 (-0.25%) helped: 77 / HURT: 0 total cycles in shared programs: 902113977 -> 902118723 (<.01%) cycles in affected programs: 3255958 -> 3260704 (0.15%) helped: 60 / HURT: 19 Broadwell total instructions in shared programs: 18524633 -> 18524547 (<.01%) instructions in affected programs: 34095 -> 34009 (-0.25%) helped: 75 / HURT: 2 total cycles in shared programs: 949532394 -> 949543761 (<.01%) cycles in affected programs: 3419107 -> 3430474 (0.33%) helped: 57 / HURT: 24 total spills in shared programs: 22484 -> 22484 (0.00%) spills in affected programs: 516 -> 516 (0.00%) helped: 2 / HURT: 2 total fills in shared programs: 29346 -> 29338 (-0.03%) fills in affected programs: 572 -> 564 (-1.40%) helped: 4 / HURT: 0 Haswell total instructions in shared programs: 17331356 -> 17331523 (<.01%) instructions in affected programs: 27920 -> 28087 (0.60%) helped: 41 / HURT: 4 total cycles in shared programs: 936603192 -> 936574664 (<.01%) cycles in affected programs: 3417695 -> 3389167 (-0.83%) helped: 28 / HURT: 21 total spills in shared programs: 19718 -> 19756 (0.19%) spills in affected programs: 436 -> 474 (8.72%) helped: 0 / HURT: 4 total fills in shared programs: 22547 -> 22607 (0.27%) fills in affected programs: 444 -> 504 (13.51%) helped: 0 / HURT: 4 Ivy Bridge total cycles in shared programs: 463451277 -> 463451273 (<.01%) cycles in affected programs: 95870 -> 95866 (<.01%) helped: 3 / HURT: 2 DG2, Tiger Lake, Ice Lake, and Skylake had similar results (Ice Lake shown) Totals: Instrs: 152825278 -> 152819969 (-0.00%); split: -0.00%, +0.00% Cycles: 15014075626 -> 15014628652 (+0.00%); split: -0.01%, +0.01% Subgroup size: 8528536 -> 8528560 (+0.00%) Send messages: 7711431 -> 7711464 (+0.00%) Spill count: 99907 -> 99509 (-0.40%); split: -0.40%, +0.00% Fill count: 202459 -> 201598 (-0.43%); split: -0.43%, +0.00% Scratch Memory Size: 4376576 -> 4371456 (-0.12%) Totals from 2915 (0.44% of 662497) affected shaders: Instrs: 2288842 -> 2283533 (-0.23%); split: -0.24%, +0.01% Cycles: 471633295 -> 472186321 (+0.12%); split: -0.27%, +0.39% Subgroup size: 27488 -> 27512 (+0.09%) Send messages: 151344 -> 151377 (+0.02%) Spill count: 48091 -> 47693 (-0.83%); split: -0.83%, +0.00% Fill count: 59053 -> 58192 (-1.46%); split: -1.46%, +0.00% Scratch Memory Size: 1827840 -> 1822720 (-0.28%) Reviewed-by: Matt Turner <mattst88@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19968>
2022-11-16 13:12:50 -08:00
/* bfi is ((...) | (~src0 & src2)). The second part is zero when src2 is
* either 0 or src0. Replacing the 0 with another value can eliminate a
* temporary register.
*/
if (is_const_zero(instr->src[2].src))
bld.BFI2(result, op[0], op[1], op[0]);
else
bld.BFI2(result, op[0], op[1], op[2]);
break;
case nir_op_bitfield_insert:
unreachable("not reached: should have been lowered");
/* For all shift operations:
*
* Gen4 - Gen7: After application of source modifiers, the low 5-bits of
* src1 are used an unsigned value for the shift count.
*
* Gen8: As with earlier platforms, but for Q and UQ types on src0, the low
* 6-bit of src1 are used.
*
* Gen9+: The low bits of src1 matching the size of src0 (e.g., 4-bits for
* W or UW src0).
*
* The implication is that the following instruction will produce a
* different result on Gen9+ than on previous platforms:
*
* shr(8) g4<1>UW g12<8,8,1>UW 0x0010UW
*
* where Gen9+ will shift by zero, and earlier platforms will shift by 16.
*
* This does not seem to be the case. Experimentally, it has been
* determined that shifts of 16-bit values on Gen8 behave properly. Shifts
* of 8-bit values on both Gen8 and Gen9 do not. Gen11+ lowers 8-bit
* values, so those platforms were not tested. No features expose access
* to 8- or 16-bit types on Gen7 or earlier, so those platforms were not
* tested either. See
* https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/76.
*
* This is part of the reason 8-bit values are lowered to 16-bit on all
* platforms.
*/
case nir_op_ishl:
bld.SHL(result, op[0], op[1]);
break;
case nir_op_ishr:
bld.ASR(result, op[0], op[1]);
break;
case nir_op_ushr:
bld.SHR(result, op[0], op[1]);
break;
case nir_op_urol:
bld.ROL(result, op[0], op[1]);
break;
case nir_op_uror:
bld.ROR(result, op[0], op[1]);
break;
case nir_op_pack_half_2x16_split:
bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
break;
case nir_op_sdot_4x8_iadd:
case nir_op_sdot_4x8_iadd_sat:
intel/fs: Force destination types on DP4A instructions Most of the time, this doesn't matter. On the versions with _sat, if the destination type is incorrect, the clamping will not happen correctly. Fixes the following CTS tests: dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_packed_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_packed_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_packed_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_packed_uu_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_uu_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_packed_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_packed_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_packed_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_packed_uu_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_uu_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_packed_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_packed_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_packed_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_packed_uu_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_uu_v4i8_out32 v2: Update anv-tgl-fails.txt. Reviewed-by: Ivan Briano <ivan.briano@intel.com> Fixes: 0f809dbf404 ("intel/compiler: Basic support for DP4A instruction") Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15417>
2022-03-16 12:56:54 -07:00
inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_D),
retype(op[2], BRW_REGISTER_TYPE_D),
retype(op[0], BRW_REGISTER_TYPE_D),
retype(op[1], BRW_REGISTER_TYPE_D));
if (instr->op == nir_op_sdot_4x8_iadd_sat)
inst->saturate = true;
break;
case nir_op_udot_4x8_uadd:
case nir_op_udot_4x8_uadd_sat:
intel/fs: Force destination types on DP4A instructions Most of the time, this doesn't matter. On the versions with _sat, if the destination type is incorrect, the clamping will not happen correctly. Fixes the following CTS tests: dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_packed_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_packed_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_packed_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_packed_uu_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_uu_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_packed_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_packed_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_packed_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_packed_uu_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_uu_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_packed_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_packed_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_packed_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_packed_uu_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_uu_v4i8_out32 v2: Update anv-tgl-fails.txt. Reviewed-by: Ivan Briano <ivan.briano@intel.com> Fixes: 0f809dbf404 ("intel/compiler: Basic support for DP4A instruction") Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15417>
2022-03-16 12:56:54 -07:00
inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_UD),
retype(op[2], BRW_REGISTER_TYPE_UD),
retype(op[0], BRW_REGISTER_TYPE_UD),
retype(op[1], BRW_REGISTER_TYPE_UD));
if (instr->op == nir_op_udot_4x8_uadd_sat)
inst->saturate = true;
break;
case nir_op_sudot_4x8_iadd:
case nir_op_sudot_4x8_iadd_sat:
intel/fs: Force destination types on DP4A instructions Most of the time, this doesn't matter. On the versions with _sat, if the destination type is incorrect, the clamping will not happen correctly. Fixes the following CTS tests: dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_packed_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_packed_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_packed_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_packed_uu_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.all_uu_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_packed_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_packed_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_packed_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_packed_uu_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.limits_uu_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_packed_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_packed_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_packed_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_packed_uu_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_ss_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_su_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_us_v4i8_out32 dEQP-VK.spirv_assembly.instruction.compute.opudotaccsatkhr.small_uu_v4i8_out32 v2: Update anv-tgl-fails.txt. Reviewed-by: Ivan Briano <ivan.briano@intel.com> Fixes: 0f809dbf404 ("intel/compiler: Basic support for DP4A instruction") Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15417>
2022-03-16 12:56:54 -07:00
inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_D),
retype(op[2], BRW_REGISTER_TYPE_D),
retype(op[0], BRW_REGISTER_TYPE_D),
retype(op[1], BRW_REGISTER_TYPE_UD));
if (instr->op == nir_op_sudot_4x8_iadd_sat)
inst->saturate = true;
break;
case nir_op_ffma:
if (nir_has_any_rounding_mode_enabled(execution_mode)) {
brw_rnd_mode rnd =
brw_rnd_mode_from_execution_mode(execution_mode);
bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
brw_imm_d(rnd));
}
inst = bld.MAD(result, op[2], op[1], op[0]);
break;
case nir_op_flrp:
if (nir_has_any_rounding_mode_enabled(execution_mode)) {
brw_rnd_mode rnd =
brw_rnd_mode_from_execution_mode(execution_mode);
bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
brw_imm_d(rnd));
}
inst = bld.LRP(result, op[0], op[1], op[2]);
break;
nir: Rename Boolean-related opcodes to include 32 in the name This is a squash of a bunch of individual changes: nir/builder: Generate 32-bit bool opcodes transparently nir/algebraic: Remap Boolean opcodes to the 32-bit variant Use 32-bit opcodes in the NIR producers and optimizations Generated with a little hand-editing and the following sed commands: sed -i 's/nir_op_ball_fequal/nir_op_b32all_fequal/g' **/*.c sed -i 's/nir_op_bany_fnequal/nir_op_b32any_fnequal/g' **/*.c sed -i 's/nir_op_ball_iequal/nir_op_b32all_iequal/g' **/*.c sed -i 's/nir_op_bany_inequal/nir_op_b32any_inequal/g' **/*.c sed -i 's/nir_op_\([fiu]lt\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]ge\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]ne\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]eq\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fi]\)ne32g/nir_op_\1neg/g' **/*.c sed -i 's/nir_op_bcsel/nir_op_b32csel/g' **/*.c Use 32-bit opcodes in the NIR back-ends Generated with a little hand-editing and the following sed commands: sed -i 's/nir_op_ball_fequal/nir_op_b32all_fequal/g' **/*.c sed -i 's/nir_op_bany_fnequal/nir_op_b32any_fnequal/g' **/*.c sed -i 's/nir_op_ball_iequal/nir_op_b32all_iequal/g' **/*.c sed -i 's/nir_op_bany_inequal/nir_op_b32any_inequal/g' **/*.c sed -i 's/nir_op_\([fiu]lt\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]ge\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]ne\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fiu]eq\)/nir_op_\132/g' **/*.c sed -i 's/nir_op_\([fi]\)ne32g/nir_op_\1neg/g' **/*.c sed -i 's/nir_op_bcsel/nir_op_b32csel/g' **/*.c Reviewed-by: Eric Anholt <eric@anholt.net> Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> Tested-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
2018-10-18 11:44:38 -05:00
case nir_op_b32csel:
if (optimize_frontfacing_ternary(instr, result))
return;
bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
inst = bld.SEL(result, op[1], op[2]);
inst->predicate = BRW_PREDICATE_NORMAL;
break;
case nir_op_extract_u8:
case nir_op_extract_i8: {
unsigned byte = nir_src_as_uint(instr->src[1].src);
/* The PRMs say:
*
* BDW+
* There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
* Use two instructions and a word or DWord intermediate integer type.
*/
if (instr->def.bit_size == 64) {
const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
if (instr->op == nir_op_extract_i8) {
/* If we need to sign extend, extract to a word first */
fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W);
bld.MOV(w_temp, subscript(op[0], type, byte));
bld.MOV(result, w_temp);
} else if (byte & 1) {
/* Extract the high byte from the word containing the desired byte
* offset.
*/
bld.SHR(result,
subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
brw_imm_uw(8));
} else {
/* Otherwise use an AND with 0xff and a word type */
bld.AND(result,
subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
brw_imm_uw(0xff));
}
} else {
const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
bld.MOV(result, subscript(op[0], type, byte));
}
break;
}
case nir_op_extract_u16:
case nir_op_extract_i16: {
const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
unsigned word = nir_src_as_uint(instr->src[1].src);
bld.MOV(result, subscript(op[0], type, word));
break;
}
default:
unreachable("unhandled instruction");
}
/* If we need to do a boolean resolve, replace the result with -(x & 1)
* to sign extend the low bit to 0/~0
*/
if (devinfo->ver <= 5 &&
!result.is_null() &&
(instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
fs_reg masked = vgrf(glsl_type::int_type);
bld.AND(masked, result, brw_imm_d(1));
masked.negate = true;
bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
}
}
void
fs_visitor::nir_emit_load_const(const fs_builder &bld,
nir_load_const_instr *instr)
{
const brw_reg_type reg_type =
brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D);
fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
switch (instr->def.bit_size) {
case 8:
for (unsigned i = 0; i < instr->def.num_components; i++)
bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value[i].i8));
break;
case 16:
for (unsigned i = 0; i < instr->def.num_components; i++)
bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value[i].i16));
break;
case 32:
for (unsigned i = 0; i < instr->def.num_components; i++)
bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value[i].i32));
break;
case 64:
assert(devinfo->ver >= 7);
if (!devinfo->has_64bit_int) {
for (unsigned i = 0; i < instr->def.num_components; i++) {
bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF),
setup_imm_df(bld, instr->value[i].f64));
}
} else {
for (unsigned i = 0; i < instr->def.num_components; i++)
bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value[i].i64));
}
break;
default:
unreachable("Invalid bit size");
}
nir_ssa_values[instr->def.index] = reg;
}
bool
fs_visitor::get_nir_src_bindless(const nir_src &src)
{
return nir_ssa_bind_infos[src.ssa->index].bindless;
}
unsigned
fs_visitor::get_nir_src_block(const nir_src &src)
{
return nir_ssa_bind_infos[src.ssa->index].valid ?
nir_ssa_bind_infos[src.ssa->index].block :
UINT32_MAX;
}
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
static bool
is_resource_src(nir_src src)
{
return src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
nir_instr_as_intrinsic(src.ssa->parent_instr)->intrinsic == nir_intrinsic_resource_intel;
}
fs_reg
fs_visitor::get_resource_nir_src(const nir_src &src)
{
if (!is_resource_src(src))
return fs_reg();
return nir_resource_values[src.ssa->index];
}
fs_reg
fs_visitor::get_nir_src(const nir_src &src)
{
nir_intrinsic_instr *load_reg = nir_load_reg_for_def(src.ssa);
fs_reg reg;
if (!load_reg) {
if (nir_src_is_undef(src)) {
const brw_reg_type reg_type =
brw_reg_type_from_bit_size(src.ssa->bit_size,
BRW_REGISTER_TYPE_D);
reg = bld.vgrf(reg_type, src.ssa->num_components);
} else {
reg = nir_ssa_values[src.ssa->index];
}
} else {
nir_intrinsic_instr *decl_reg = nir_reg_get_decl(load_reg->src[0].ssa);
/* We don't handle indirects on locals */
assert(nir_intrinsic_base(load_reg) == 0);
assert(load_reg->intrinsic != nir_intrinsic_load_reg_indirect);
reg = nir_ssa_values[decl_reg->def.index];
}
if (nir_src_bit_size(src) == 64 && devinfo->ver == 7) {
/* The only 64-bit type available on gfx7 is DF, so use that. */
reg.type = BRW_REGISTER_TYPE_DF;
} else {
/* To avoid floating-point denorm flushing problems, set the type by
* default to an integer type - instructions that need floating point
* semantics will set this to F if they need to
*/
reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src),
BRW_REGISTER_TYPE_D);
}
return reg;
}
/**
* Return an IMM for constants; otherwise call get_nir_src() as normal.
*
* This function should not be called on any value which may be 64 bits.
* We could theoretically support 64-bit on gfx8+ but we choose not to
* because it wouldn't work in general (no gfx7 support) and there are
* enough restrictions in 64-bit immediates that you can't take the return
* value and treat it the same as the result of get_nir_src().
*/
fs_reg
fs_visitor::get_nir_src_imm(const nir_src &src)
{
assert(nir_src_bit_size(src) == 32);
return nir_src_is_const(src) ?
fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(src);
}
fs_reg
fs_visitor::get_nir_def(const nir_def &def)
{
nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
if (!store_reg) {
const brw_reg_type reg_type =
brw_reg_type_from_bit_size(def.bit_size,
def.bit_size == 8 ?
BRW_REGISTER_TYPE_D :
BRW_REGISTER_TYPE_F);
nir_ssa_values[def.index] =
bld.vgrf(reg_type, def.num_components);
bld.UNDEF(nir_ssa_values[def.index]);
return nir_ssa_values[def.index];
} else {
nir_intrinsic_instr *decl_reg =
nir_reg_get_decl(store_reg->src[1].ssa);
/* We don't handle indirects on locals */
assert(nir_intrinsic_base(store_reg) == 0);
assert(store_reg->intrinsic != nir_intrinsic_store_reg_indirect);
return nir_ssa_values[decl_reg->def.index];
}
}
nir_component_mask_t
fs_visitor::get_nir_write_mask(const nir_def &def)
{
nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
if (!store_reg) {
return nir_component_mask(def.num_components);
} else {
return nir_intrinsic_write_mask(store_reg);
}
}
2015-07-17 14:40:03 +01:00
static fs_inst *
emit_pixel_interpolater_send(const fs_builder &bld,
enum opcode opcode,
const fs_reg &dst,
const fs_reg &src,
const fs_reg &desc,
glsl_interp_mode interpolation)
2015-07-17 14:40:03 +01:00
{
struct brw_wm_prog_data *wm_prog_data =
brw_wm_prog_data(bld.shader->stage_prog_data);
2015-07-17 14:40:03 +01:00
fs_inst *inst = bld.emit(opcode, dst, src, desc);
2015-07-17 14:40:03 +01:00
/* 2 floats per slot returned */
inst->size_written = 2 * dst.component_size(inst->exec_size);
if (interpolation == INTERP_MODE_NOPERSPECTIVE) {
inst->pi_noperspective = true;
/* TGL BSpec says:
* This field cannot be set to "Linear Interpolation"
* unless Non-Perspective Barycentric Enable in 3DSTATE_CLIP is enabled"
*/
wm_prog_data->uses_nonperspective_interp_modes = true;
}
2015-07-17 14:40:03 +01:00
wm_prog_data->pulls_bary = true;
2015-07-17 14:40:03 +01:00
return inst;
}
/**
* Computes 1 << x, given a D/UD register containing some value x.
*/
static fs_reg
intexp2(const fs_builder &bld, const fs_reg &x)
{
assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
fs_reg result = bld.vgrf(x.type, 1);
fs_reg one = bld.vgrf(x.type, 1);
bld.MOV(one, retype(brw_imm_d(1), one.type));
bld.SHL(result, one, x);
return result;
}
void
fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
{
assert(stage == MESA_SHADER_GEOMETRY);
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
if (gs_compile->control_data_header_size_bits == 0)
return;
/* We can only do EndPrimitive() functionality when the control data
* consists of cut bits. Fortunately, the only time it isn't is when the
* output type is points, in which case EndPrimitive() is a no-op.
*/
if (gs_prog_data->control_data_format !=
GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
return;
}
/* Cut bits use one bit per vertex. */
assert(gs_compile->control_data_bits_per_vertex == 1);
fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
vertex_count.type = BRW_REGISTER_TYPE_UD;
/* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
* vertex n, 0 otherwise. So all we need to do here is mark bit
* (vertex_count - 1) % 32 in the cut_bits register to indicate that
* EndPrimitive() was called after emitting vertex (vertex_count - 1);
* vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
*
* Note that if EndPrimitive() is called before emitting any vertices, this
* will cause us to set bit 31 of the control_data_bits register to 1.
* That's fine because:
*
* - If max_vertices < 32, then vertex number 31 (zero-based) will never be
* output, so the hardware will ignore cut bit 31.
*
* - If max_vertices == 32, then vertex number 31 is guaranteed to be the
* last vertex, so setting cut bit 31 has no effect (since the primitive
* is automatically ended when the GS terminates).
*
* - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
* control_data_bits register to 0 when the first vertex is emitted.
*/
const fs_builder abld = bld.annotate("end primitive");
/* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
fs_reg mask = intexp2(abld, prev_count);
/* Note: we're relying on the fact that the GEN SHL instruction only pays
* attention to the lower 5 bits of its second source argument, so on this
* architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
* ((vertex_count - 1) % 32).
*/
abld.OR(this->control_data_bits, this->control_data_bits, mask);
}
void
fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
{
assert(stage == MESA_SHADER_GEOMETRY);
assert(gs_compile->control_data_bits_per_vertex != 0);
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
const fs_builder abld = bld.annotate("emit control data bits");
const fs_builder fwa_bld = bld.exec_all();
/* We use a single UD register to accumulate control data bits (32 bits
* for each of the SIMD8 channels). So we need to write a DWord (32 bits)
* at a time.
*
* Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
* We have select a 128-bit group via the Global and Per-Slot Offsets, then
* use the Channel Mask phase to enable/disable which DWord within that
* group to write. (Remember, different SIMD8 channels may have emitted
* different numbers of vertices, so we may need per-slot offsets.)
*
* Channel masking presents an annoying problem: we may have to replicate
* the data up to 4 times:
*
* Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
*
* To avoid penalizing shaders that emit a small number of vertices, we
* can avoid these sometimes: if the size of the control data header is
* <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land
* land in the same 128-bit group, so we can skip per-slot offsets.
*
* Similarly, if the control data header is <= 32 bits, there is only one
* DWord, so we can skip channel masks.
*/
fs_reg channel_mask, per_slot_offset;
if (gs_compile->control_data_header_size_bits > 32)
channel_mask = vgrf(glsl_type::uint_type);
if (gs_compile->control_data_header_size_bits > 128)
per_slot_offset = vgrf(glsl_type::uint_type);
/* Figure out which DWord we're trying to write to using the formula:
*
* dword_index = (vertex_count - 1) * bits_per_vertex / 32
*
* Since bits_per_vertex is a power of two, and is known at compile
* time, this can be optimized to:
*
* dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
*/
if (channel_mask.file != BAD_FILE || per_slot_offset.file != BAD_FILE) {
fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
unsigned log2_bits_per_vertex =
util_last_bit(gs_compile->control_data_bits_per_vertex);
abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
if (per_slot_offset.file != BAD_FILE) {
/* Set the per-slot offset to dword_index / 4, so that we'll write to
* the appropriate OWord within the control data header.
*/
abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
}
/* Set the channel masks to 1 << (dword_index % 4), so that we'll
* write to the appropriate DWORD within the OWORD.
*/
fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
channel_mask = intexp2(fwa_bld, channel);
/* Then the channel masks need to be in bits 23:16. */
fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
}
/* If there are channel masks, add 3 extra copies of the data. */
const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE);
fs_reg sources[4];
for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
sources[i] = this->control_data_bits;
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask;
srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_F, length);
srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
fs_inst *inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
srcs, ARRAY_SIZE(srcs));
/* We need to increment Global Offset by 256-bits to make room for
* Broadwell's extra "Vertex Count" payload at the beginning of the
* URB entry. Since this is an OWord message, Global Offset is counted
* in 128-bit units, so we must set it to 2.
*/
if (gs_prog_data->static_vertex_count == -1)
inst->offset = 2;
}
void
fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
unsigned stream_id)
{
/* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
/* Note: we are calling this *before* increasing vertex_count, so
* this->vertex_count == vertex_count - 1 in the formula above.
*/
/* Stream mode uses 2 bits per vertex */
assert(gs_compile->control_data_bits_per_vertex == 2);
/* Must be a valid stream */
assert(stream_id < 4); /* MAX_VERTEX_STREAMS */
/* Control data bits are initialized to 0 so we don't have to set any
* bits when sending vertices to stream 0.
*/
if (stream_id == 0)
return;
const fs_builder abld = bld.annotate("set stream control data bits", NULL);
/* reg::sid = stream_id */
fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
abld.MOV(sid, brw_imm_ud(stream_id));
/* reg:shift_count = 2 * (vertex_count - 1) */
fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
/* Note: we're relying on the fact that the GEN SHL instruction only pays
* attention to the lower 5 bits of its second source argument, so on this
* architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
* stream_id << ((2 * (vertex_count - 1)) % 32).
*/
fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
abld.SHL(mask, sid, shift_count);
abld.OR(this->control_data_bits, this->control_data_bits, mask);
}
void
fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
unsigned stream_id)
{
assert(stage == MESA_SHADER_GEOMETRY);
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
vertex_count.type = BRW_REGISTER_TYPE_UD;
/* Haswell and later hardware ignores the "Render Stream Select" bits
* from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
* and instead sends all primitives down the pipeline for rasterization.
* If the SOL stage is enabled, "Render Stream Select" is honored and
* primitives bound to non-zero streams are discarded after stream output.
*
* Since the only purpose of primives sent to non-zero streams is to
* be recorded by transform feedback, we can simply discard all geometry
* bound to these streams when transform feedback is disabled.
*/
if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
return;
/* If we're outputting 32 control data bits or less, then we can wait
* until the shader is over to output them all. Otherwise we need to
* output them as we go. Now is the time to do it, since we're about to
* output the vertex_count'th vertex, so it's guaranteed that the
* control data bits associated with the (vertex_count - 1)th vertex are
* correct.
*/
if (gs_compile->control_data_header_size_bits > 32) {
const fs_builder abld =
bld.annotate("emit vertex: emit control data bits");
/* Only emit control data bits if we've finished accumulating a batch
* of 32 bits. This is the case when:
*
* (vertex_count * bits_per_vertex) % 32 == 0
*
* (in other words, when the last 5 bits of vertex_count *
* bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
* integer n (which is always the case, since bits_per_vertex is
* always 1 or 2), this is equivalent to requiring that the last 5-n
* bits of vertex_count are 0:
*
* vertex_count & (2^(5-n) - 1) == 0
*
* 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
* equivalent to:
*
* vertex_count & (32 / bits_per_vertex - 1) == 0
*
* TODO: If vertex_count is an immediate, we could do some of this math
* at compile time...
*/
fs_inst *inst =
abld.AND(bld.null_reg_d(), vertex_count,
brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
inst->conditional_mod = BRW_CONDITIONAL_Z;
abld.IF(BRW_PREDICATE_NORMAL);
/* If vertex_count is 0, then no control data bits have been
* accumulated yet, so we can skip emitting them.
*/
abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
BRW_CONDITIONAL_NEQ);
abld.IF(BRW_PREDICATE_NORMAL);
emit_gs_control_data_bits(vertex_count);
abld.emit(BRW_OPCODE_ENDIF);
/* Reset control_data_bits to 0 so we can start accumulating a new
* batch.
*
* Note: in the case where vertex_count == 0, this neutralizes the
* effect of any call to EndPrimitive() that the shader may have
* made before outputting its first vertex.
*/
inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
inst->force_writemask_all = true;
abld.emit(BRW_OPCODE_ENDIF);
}
emit_urb_writes(vertex_count);
/* In stream mode we have to set control data bits for all vertices
* unless we have disabled control data bits completely (which we do
* do for GL_POINTS outputs that don't use streams).
*/
if (gs_compile->control_data_header_size_bits > 0 &&
gs_prog_data->control_data_format ==
GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
set_gs_stream_control_data_bits(vertex_count, stream_id);
}
}
void
fs_visitor::emit_gs_input_load(const fs_reg &dst,
const nir_src &vertex_src,
nir: Get rid of *_indirect variants of input/output load/store intrinsics There is some special-casing needed in a competent back-end. However, they can do their special-casing easily enough based on whether or not the offset is a constant. In the mean time, having the *_indirect variants adds special cases a number of places where they don't need to be and, in general, only complicates things. To complicate matters, NIR had no way to convdert an indirect load/store to a direct one in the case that the indirect was a constant so we would still not really get what the back-ends wanted. The best solution seems to be to get rid of the *_indirect variants entirely. This commit is a bunch of different changes squashed together: - nir: Get rid of *_indirect variants of input/output load/store intrinsics - nir/glsl: Stop handling UBO/SSBO load/stores differently depending on indirect - nir/lower_io: Get rid of load/store_foo_indirect - i965/fs: Get rid of load/store_foo_indirect - i965/vec4: Get rid of load/store_foo_indirect - tgsi_to_nir: Get rid of load/store_foo_indirect - ir3/nir: Use the new unified io intrinsics - vc4: Do all uniform loads with byte offsets - vc4/nir: Use the new unified io intrinsics - vc4: Fix load_user_clip_plane crash - vc4: add missing src for store outputs - vc4: Fix state uniforms - nir/lower_clip: Update to the new load/store intrinsics - nir/lower_two_sided_color: Update to the new load intrinsic NIR and i965 changes are Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> NIR indirect declarations and vc4 changes are Reviewed-by: Eric Anholt <eric@anholt.net> ir3 changes are Reviewed-by: Rob Clark <robdclark@gmail.com> NIR changes are Acked-by: Rob Clark <robdclark@gmail.com>
2015-11-25 14:14:05 -08:00
unsigned base_offset,
const nir_src &offset_src,
unsigned num_components,
unsigned first_component)
{
assert(type_sz(dst.type) == 4);
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
nir: Get rid of *_indirect variants of input/output load/store intrinsics There is some special-casing needed in a competent back-end. However, they can do their special-casing easily enough based on whether or not the offset is a constant. In the mean time, having the *_indirect variants adds special cases a number of places where they don't need to be and, in general, only complicates things. To complicate matters, NIR had no way to convdert an indirect load/store to a direct one in the case that the indirect was a constant so we would still not really get what the back-ends wanted. The best solution seems to be to get rid of the *_indirect variants entirely. This commit is a bunch of different changes squashed together: - nir: Get rid of *_indirect variants of input/output load/store intrinsics - nir/glsl: Stop handling UBO/SSBO load/stores differently depending on indirect - nir/lower_io: Get rid of load/store_foo_indirect - i965/fs: Get rid of load/store_foo_indirect - i965/vec4: Get rid of load/store_foo_indirect - tgsi_to_nir: Get rid of load/store_foo_indirect - ir3/nir: Use the new unified io intrinsics - vc4: Do all uniform loads with byte offsets - vc4/nir: Use the new unified io intrinsics - vc4: Fix load_user_clip_plane crash - vc4: add missing src for store outputs - vc4: Fix state uniforms - nir/lower_clip: Update to the new load/store intrinsics - nir/lower_two_sided_color: Update to the new load intrinsic NIR and i965 changes are Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> NIR indirect declarations and vc4 changes are Reviewed-by: Eric Anholt <eric@anholt.net> ir3 changes are Reviewed-by: Rob Clark <robdclark@gmail.com> NIR changes are Acked-by: Rob Clark <robdclark@gmail.com>
2015-11-25 14:14:05 -08:00
const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
/* TODO: figure out push input layout for invocations == 1 */
if (gs_prog_data->invocations == 1 &&
nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
nir_src_as_uint(vertex_src) * push_reg_count;
for (unsigned i = 0; i < num_components; i++) {
bld.MOV(offset(dst, bld, i),
fs_reg(ATTR, imm_offset + i + first_component, dst.type));
}
return;
}
/* Resort to the pull model. Ensure the VUE handles are provided. */
i965/fs: force pull model for 64-bit GS inputs Triggering the push model when 64-bit inputs are involved is not easy due to the constrains on the maximum number of registers that we allow for this mode, however, for GS with 'points' primitive type and just a couple of double varyings we can trigger this and it just doesn't work because the implementation is not 64-bit aware at all. For now, let's make sure that we don't attempt this model whith 64-bit inputs and we always fall back to pull model for them. Also, don't enable the VUE handles in the thread payload on the fly when we find an input for which we need the pull model, this is not safe: if we need to resort to the pull model we need to account for that when we setup the thread payload so we compute the first non-payload register properly. If we didn't do that correctly and we enable it on-the-fly here then we will end up VUE handles on the first non-payload register which will probably lead to GPU hangs. Instead, always enable the VUE handles for the pull model so we can safely use them when needed. The GS is going to resort to pull model almost in every situation anyway, so this shouldn't make a significant difference and it makes things easier and safer. v2: Always enable the VUE handles for pull model, this is easier and safer and the GS is going to fallback to pull model almost always anyway (Ken) v3: Only clamp the URB read length if we are over the maximum reserved for push inputs as we were doing in the original code (Ken). v4: No need to clamp the urb read length if invocations > 1 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2017-09-27 11:36:31 +02:00
assert(gs_prog_data->base.include_vue_handles);
fs_reg start = gs_payload().icp_handle_start;
fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
if (gs_prog_data->invocations == 1) {
if (nir_src_is_const(vertex_src)) {
/* The vertex index is constant; just select the proper URB handle. */
icp_handle = offset(start, bld, nir_src_as_uint(vertex_src));
} else {
/* The vertex index is non-constant. We need to use indirect
* addressing to fetch the proper URB handle.
*
* First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
* indicating that channel <n> should read the handle from
* DWord <n>. We convert that to bytes by multiplying by 4.
*
* Next, we convert the vertex index to bytes by multiplying
* by 32 (shifting by 5), and add the two together. This is
* the final indirect byte offset.
*/
intel/compiler: Use subgroup invocation for ICP handle loads When loading a TCS or GS input, we generate some code to read the URB handle for a particular input control point (ICP handle), which often involves indirect addressing due to a non-constant vertex. For example: mov(8) vgrf148+0.0:UW, 76543210V shl(8) vgrf149:UD, vgrf148+0.0:UW, 2u shl(8) vgrf150:UD, vgrf145:UD, 5u add(8) vgrf151:UD, vgrf150:UD, vgrf149:UD mov_indirect(8) vgrf147:UD, g2:UD, vgrf151:UD, 96u Unfortunately, the first load with 76543210V is considered a partial write because the 8 channels of 16-bit UW data doesn't fill an entire register, and we can't allocate VGRFs at sub-register granularity. This causes none of the above math to be CSE'd, even though the first two instructions are common to *all* input loads, and the rest may be reused sometimes as well. To work around this, we stop emitting 76543210V to a temporary, and instead use nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], which already contains this value, and is unconditionally set up for us. With all input loads using the same register for the sequence, our CSE pass is able to eliminate the rest of the common math. shader-db results on Tigerlake: total instructions in shared programs: 20748243 -> 20744844 (-0.02%) instructions in affected programs: 73410 -> 70011 (-4.63%) helped: 242 / HURT: 21 helped stats (abs) min: 1 max: 37 x̄: 14.17 x̃: 15 helped stats (rel) min: 0.17% max: 19.58% x̄: 6.13% x̃: 6.32% HURT stats (abs) min: 1 max: 4 x̄: 1.38 x̃: 1 HURT stats (rel) min: 0.18% max: 1.31% x̄: 0.58% x̃: 0.58% 95% mean confidence interval for instructions value: -13.73 -12.12 95% mean confidence interval for instructions %-change: -6.00% -5.19% Instructions are helped. total cycles in shared programs: 785828951 -> 785788480 (<.01%) cycles in affected programs: 597593 -> 557122 (-6.77%) helped: 227 / HURT: 13 helped stats (abs) min: 6 max: 624 x̄: 182.19 x̃: 185 helped stats (rel) min: 0.24% max: 18.22% x̄: 7.85% x̃: 7.80% HURT stats (abs) min: 2 max: 153 x̄: 68.08 x̃: 36 HURT stats (rel) min: 0.03% max: 7.79% x̄: 2.97% x̃: 1.25% 95% mean confidence interval for cycles value: -182.55 -154.71 95% mean confidence interval for cycles %-change: -7.84% -6.69% Cycles are helped. Reviewed-by: Caio Oliveira <caio.oliveira@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18455>
2022-09-06 17:18:16 -07:00
fs_reg sequence =
nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
/* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
/* Convert vertex_index to bytes (multiply by 32) */
bld.SHL(vertex_offset_bytes,
retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
brw_imm_ud(5u));
bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
/* Use first_icp_handle as the base offset. There is one register
* of URB handles per vertex, so inform the register allocator that
* we might read up to nir->info.gs.vertices_in registers.
*/
bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
fs_reg(icp_offset_bytes),
brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE));
}
} else {
assert(gs_prog_data->invocations > 1);
if (nir_src_is_const(vertex_src)) {
unsigned vertex = nir_src_as_uint(vertex_src);
assert(devinfo->ver >= 9 || vertex <= 5);
bld.MOV(icp_handle, component(start, vertex));
} else {
/* The vertex index is non-constant. We need to use indirect
* addressing to fetch the proper URB handle.
*
*/
fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
/* Convert vertex_index to bytes (multiply by 4) */
bld.SHL(icp_offset_bytes,
retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
brw_imm_ud(2u));
/* Use first_icp_handle as the base offset. There is one DWord
* of URB handles per vertex, so inform the register allocator that
* we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
*/
bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
fs_reg(icp_offset_bytes),
brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) *
REG_SIZE));
}
}
fs_inst *inst;
fs_reg indirect_offset = get_nir_src(offset_src);
if (nir_src_is_const(offset_src)) {
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
/* Constant indexing - use global offset. */
if (first_component != 0) {
unsigned read_components = num_components + first_component;
fs_reg tmp = bld.vgrf(dst.type, read_components);
inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
ARRAY_SIZE(srcs));
inst->size_written = read_components *
tmp.component_size(inst->exec_size);
for (unsigned i = 0; i < num_components; i++) {
bld.MOV(offset(dst, bld, i),
offset(tmp, bld, i + first_component));
}
} else {
inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
ARRAY_SIZE(srcs));
inst->size_written = num_components *
dst.component_size(inst->exec_size);
}
inst->offset = base_offset + nir_src_as_uint(offset_src);
} else {
/* Indirect indexing - use per-slot offsets as well. */
unsigned read_components = num_components + first_component;
fs_reg tmp = bld.vgrf(dst.type, read_components);
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
if (first_component != 0) {
inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
srcs, ARRAY_SIZE(srcs));
inst->size_written = read_components *
tmp.component_size(inst->exec_size);
for (unsigned i = 0; i < num_components; i++) {
bld.MOV(offset(dst, bld, i),
offset(tmp, bld, i + first_component));
}
} else {
inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
srcs, ARRAY_SIZE(srcs));
inst->size_written = num_components *
dst.component_size(inst->exec_size);
}
inst->offset = base_offset;
}
}
fs_reg
fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
{
nir_src *offset_src = nir_get_io_offset_src(instr);
if (nir_src_is_const(*offset_src)) {
/* The only constant offset we should find is 0. brw_nir.c's
* add_const_offset_to_base() will fold other constant offsets
* into the "base" index.
*/
assert(nir_src_as_uint(*offset_src) == 0);
return fs_reg();
}
return get_nir_src(*offset_src);
}
void
fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
nir_intrinsic_instr *instr)
{
assert(stage == MESA_SHADER_VERTEX);
fs_reg dest;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
dest = get_nir_def(instr->def);
switch (instr->intrinsic) {
case nir_intrinsic_load_vertex_id:
case nir_intrinsic_load_base_vertex:
unreachable("should be lowered by nir_lower_system_values()");
case nir_intrinsic_load_input: {
assert(instr->def.bit_size == 32);
fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type);
src = offset(src, bld, nir_intrinsic_component(instr));
src = offset(src, bld, nir_src_as_uint(instr->src[0]));
for (unsigned i = 0; i < instr->num_components; i++)
bld.MOV(offset(dest, bld, i), offset(src, bld, i));
break;
}
case nir_intrinsic_load_vertex_id_zero_base:
case nir_intrinsic_load_instance_id:
case nir_intrinsic_load_base_instance:
case nir_intrinsic_load_draw_id:
case nir_intrinsic_load_first_vertex:
case nir_intrinsic_load_is_indexed_draw:
unreachable("lowered by brw_nir_lower_vs_inputs");
default:
nir_emit_intrinsic(bld, instr);
break;
}
}
fs_reg
fs_visitor::get_tcs_single_patch_icp_handle(const fs_builder &bld,
nir_intrinsic_instr *instr)
{
struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
const nir_src &vertex_src = instr->src[0];
nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src);
const fs_reg start = tcs_payload().icp_handle_start;
fs_reg icp_handle;
if (nir_src_is_const(vertex_src)) {
/* Emit a MOV to resolve <0,1,0> regioning. */
icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
unsigned vertex = nir_src_as_uint(vertex_src);
bld.MOV(icp_handle, component(start, vertex));
} else if (tcs_prog_data->instances == 1 && vertex_intrin &&
vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) {
/* For the common case of only 1 instance, an array index of
* gl_InvocationID means reading the handles from the start. Skip all
* the indirect work.
*/
icp_handle = start;
} else {
/* The vertex index is non-constant. We need to use indirect
* addressing to fetch the proper URB handle.
*/
icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
/* Each ICP handle is a single DWord (4 bytes) */
fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
bld.SHL(vertex_offset_bytes,
retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
brw_imm_ud(2u));
/* We might read up to 4 registers. */
bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
start, vertex_offset_bytes,
brw_imm_ud(4 * REG_SIZE));
}
return icp_handle;
}
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8 Our tessellation control shaders can be dispatched in several modes. - SINGLE_PATCH (Gen7+) processes a single patch per thread, with each channel corresponding to a different patch vertex. PATCHLIST_N will launch (N / 8) threads. If N is less than 8, some channels will be disabled, leaving some untapped hardware capabilities. Conditionals based on gl_InvocationID are non-uniform, which means that they'll often have to execute both paths. However, if there are fewer than 8 vertices, all invocations will happen within a single thread, so barriers can become no-ops, which is nice. We also burn a maximum of 4 registers for ICP handles, so we can compile without regard for the value of N. It also works in all cases. - DUAL_PATCH mode processes up to two patches at a time, where the first four channels come from patch 1, and the second group of four come from patch 2. This tries to provide better EU utilization for small patches (N <= 4). It cannot be used in all cases. - 8_PATCH mode processes 8 patches at a time, with a thread launched per vertex in the patch. Each channel corresponds to the same vertex, but in each of the 8 patches. This utilizes all channels even for small patches. It also makes conditions on gl_InvocationID uniform, leading to proper jumps. Barriers, unfortunately, become real. Worse, for PATCHLIST_N, the thread payload burns N registers for ICP handles. This can burn up to 32 registers, or 1/4 of our register file, for URB handles. For Vulkan (and DX), we know the number of vertices at compile time, so we can limit the amount of waste. In GL, the patch dimension is dynamic state, so we either would have to waste all 32 (not reasonable) or guess (badly) and recompile. This is unfortunate. Because we can only spawn 16 thread instances, we can only use this mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH. This patch implements the new 8_PATCH TCS mode, but leaves us using SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to using 8_PATCH mode for testing and benchmarking purposes. We may want to consider using 8_PATCH mode in Vulkan in some cases. The data I've seen shows that 8_PATCH mode can be more efficient in some cases, but SINGLE_PATCH mode (the one we use today) is faster in other cases. Ultimately, the TES matters much more than the TCS for performance, so the decision may not matter much. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
fs_reg
fs_visitor::get_tcs_multi_patch_icp_handle(const fs_builder &bld,
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8 Our tessellation control shaders can be dispatched in several modes. - SINGLE_PATCH (Gen7+) processes a single patch per thread, with each channel corresponding to a different patch vertex. PATCHLIST_N will launch (N / 8) threads. If N is less than 8, some channels will be disabled, leaving some untapped hardware capabilities. Conditionals based on gl_InvocationID are non-uniform, which means that they'll often have to execute both paths. However, if there are fewer than 8 vertices, all invocations will happen within a single thread, so barriers can become no-ops, which is nice. We also burn a maximum of 4 registers for ICP handles, so we can compile without regard for the value of N. It also works in all cases. - DUAL_PATCH mode processes up to two patches at a time, where the first four channels come from patch 1, and the second group of four come from patch 2. This tries to provide better EU utilization for small patches (N <= 4). It cannot be used in all cases. - 8_PATCH mode processes 8 patches at a time, with a thread launched per vertex in the patch. Each channel corresponds to the same vertex, but in each of the 8 patches. This utilizes all channels even for small patches. It also makes conditions on gl_InvocationID uniform, leading to proper jumps. Barriers, unfortunately, become real. Worse, for PATCHLIST_N, the thread payload burns N registers for ICP handles. This can burn up to 32 registers, or 1/4 of our register file, for URB handles. For Vulkan (and DX), we know the number of vertices at compile time, so we can limit the amount of waste. In GL, the patch dimension is dynamic state, so we either would have to waste all 32 (not reasonable) or guess (badly) and recompile. This is unfortunate. Because we can only spawn 16 thread instances, we can only use this mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH. This patch implements the new 8_PATCH TCS mode, but leaves us using SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to using 8_PATCH mode for testing and benchmarking purposes. We may want to consider using 8_PATCH mode in Vulkan in some cases. The data I've seen shows that 8_PATCH mode can be more efficient in some cases, but SINGLE_PATCH mode (the one we use today) is faster in other cases. Ultimately, the TES matters much more than the TCS for performance, so the decision may not matter much. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
nir_intrinsic_instr *instr)
{
struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
const nir_src &vertex_src = instr->src[0];
const unsigned grf_size_bytes = REG_SIZE * reg_unit(devinfo);
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8 Our tessellation control shaders can be dispatched in several modes. - SINGLE_PATCH (Gen7+) processes a single patch per thread, with each channel corresponding to a different patch vertex. PATCHLIST_N will launch (N / 8) threads. If N is less than 8, some channels will be disabled, leaving some untapped hardware capabilities. Conditionals based on gl_InvocationID are non-uniform, which means that they'll often have to execute both paths. However, if there are fewer than 8 vertices, all invocations will happen within a single thread, so barriers can become no-ops, which is nice. We also burn a maximum of 4 registers for ICP handles, so we can compile without regard for the value of N. It also works in all cases. - DUAL_PATCH mode processes up to two patches at a time, where the first four channels come from patch 1, and the second group of four come from patch 2. This tries to provide better EU utilization for small patches (N <= 4). It cannot be used in all cases. - 8_PATCH mode processes 8 patches at a time, with a thread launched per vertex in the patch. Each channel corresponds to the same vertex, but in each of the 8 patches. This utilizes all channels even for small patches. It also makes conditions on gl_InvocationID uniform, leading to proper jumps. Barriers, unfortunately, become real. Worse, for PATCHLIST_N, the thread payload burns N registers for ICP handles. This can burn up to 32 registers, or 1/4 of our register file, for URB handles. For Vulkan (and DX), we know the number of vertices at compile time, so we can limit the amount of waste. In GL, the patch dimension is dynamic state, so we either would have to waste all 32 (not reasonable) or guess (badly) and recompile. This is unfortunate. Because we can only spawn 16 thread instances, we can only use this mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH. This patch implements the new 8_PATCH TCS mode, but leaves us using SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to using 8_PATCH mode for testing and benchmarking purposes. We may want to consider using 8_PATCH mode in Vulkan in some cases. The data I've seen shows that 8_PATCH mode can be more efficient in some cases, but SINGLE_PATCH mode (the one we use today) is faster in other cases. Ultimately, the TES matters much more than the TCS for performance, so the decision may not matter much. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
const fs_reg start = tcs_payload().icp_handle_start;
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8 Our tessellation control shaders can be dispatched in several modes. - SINGLE_PATCH (Gen7+) processes a single patch per thread, with each channel corresponding to a different patch vertex. PATCHLIST_N will launch (N / 8) threads. If N is less than 8, some channels will be disabled, leaving some untapped hardware capabilities. Conditionals based on gl_InvocationID are non-uniform, which means that they'll often have to execute both paths. However, if there are fewer than 8 vertices, all invocations will happen within a single thread, so barriers can become no-ops, which is nice. We also burn a maximum of 4 registers for ICP handles, so we can compile without regard for the value of N. It also works in all cases. - DUAL_PATCH mode processes up to two patches at a time, where the first four channels come from patch 1, and the second group of four come from patch 2. This tries to provide better EU utilization for small patches (N <= 4). It cannot be used in all cases. - 8_PATCH mode processes 8 patches at a time, with a thread launched per vertex in the patch. Each channel corresponds to the same vertex, but in each of the 8 patches. This utilizes all channels even for small patches. It also makes conditions on gl_InvocationID uniform, leading to proper jumps. Barriers, unfortunately, become real. Worse, for PATCHLIST_N, the thread payload burns N registers for ICP handles. This can burn up to 32 registers, or 1/4 of our register file, for URB handles. For Vulkan (and DX), we know the number of vertices at compile time, so we can limit the amount of waste. In GL, the patch dimension is dynamic state, so we either would have to waste all 32 (not reasonable) or guess (badly) and recompile. This is unfortunate. Because we can only spawn 16 thread instances, we can only use this mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH. This patch implements the new 8_PATCH TCS mode, but leaves us using SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to using 8_PATCH mode for testing and benchmarking purposes. We may want to consider using 8_PATCH mode in Vulkan in some cases. The data I've seen shows that 8_PATCH mode can be more efficient in some cases, but SINGLE_PATCH mode (the one we use today) is faster in other cases. Ultimately, the TES matters much more than the TCS for performance, so the decision may not matter much. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
if (nir_src_is_const(vertex_src))
return byte_offset(start, nir_src_as_uint(vertex_src) * grf_size_bytes);
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8 Our tessellation control shaders can be dispatched in several modes. - SINGLE_PATCH (Gen7+) processes a single patch per thread, with each channel corresponding to a different patch vertex. PATCHLIST_N will launch (N / 8) threads. If N is less than 8, some channels will be disabled, leaving some untapped hardware capabilities. Conditionals based on gl_InvocationID are non-uniform, which means that they'll often have to execute both paths. However, if there are fewer than 8 vertices, all invocations will happen within a single thread, so barriers can become no-ops, which is nice. We also burn a maximum of 4 registers for ICP handles, so we can compile without regard for the value of N. It also works in all cases. - DUAL_PATCH mode processes up to two patches at a time, where the first four channels come from patch 1, and the second group of four come from patch 2. This tries to provide better EU utilization for small patches (N <= 4). It cannot be used in all cases. - 8_PATCH mode processes 8 patches at a time, with a thread launched per vertex in the patch. Each channel corresponds to the same vertex, but in each of the 8 patches. This utilizes all channels even for small patches. It also makes conditions on gl_InvocationID uniform, leading to proper jumps. Barriers, unfortunately, become real. Worse, for PATCHLIST_N, the thread payload burns N registers for ICP handles. This can burn up to 32 registers, or 1/4 of our register file, for URB handles. For Vulkan (and DX), we know the number of vertices at compile time, so we can limit the amount of waste. In GL, the patch dimension is dynamic state, so we either would have to waste all 32 (not reasonable) or guess (badly) and recompile. This is unfortunate. Because we can only spawn 16 thread instances, we can only use this mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH. This patch implements the new 8_PATCH TCS mode, but leaves us using SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to using 8_PATCH mode for testing and benchmarking purposes. We may want to consider using 8_PATCH mode in Vulkan in some cases. The data I've seen shows that 8_PATCH mode can be more efficient in some cases, but SINGLE_PATCH mode (the one we use today) is faster in other cases. Ultimately, the TES matters much more than the TCS for performance, so the decision may not matter much. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
/* The vertex index is non-constant. We need to use indirect
* addressing to fetch the proper URB handle.
*
* First, we start with the sequence indicating that channel <n>
* should read the handle from DWord <n>. We convert that to bytes
* by multiplying by 4.
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8 Our tessellation control shaders can be dispatched in several modes. - SINGLE_PATCH (Gen7+) processes a single patch per thread, with each channel corresponding to a different patch vertex. PATCHLIST_N will launch (N / 8) threads. If N is less than 8, some channels will be disabled, leaving some untapped hardware capabilities. Conditionals based on gl_InvocationID are non-uniform, which means that they'll often have to execute both paths. However, if there are fewer than 8 vertices, all invocations will happen within a single thread, so barriers can become no-ops, which is nice. We also burn a maximum of 4 registers for ICP handles, so we can compile without regard for the value of N. It also works in all cases. - DUAL_PATCH mode processes up to two patches at a time, where the first four channels come from patch 1, and the second group of four come from patch 2. This tries to provide better EU utilization for small patches (N <= 4). It cannot be used in all cases. - 8_PATCH mode processes 8 patches at a time, with a thread launched per vertex in the patch. Each channel corresponds to the same vertex, but in each of the 8 patches. This utilizes all channels even for small patches. It also makes conditions on gl_InvocationID uniform, leading to proper jumps. Barriers, unfortunately, become real. Worse, for PATCHLIST_N, the thread payload burns N registers for ICP handles. This can burn up to 32 registers, or 1/4 of our register file, for URB handles. For Vulkan (and DX), we know the number of vertices at compile time, so we can limit the amount of waste. In GL, the patch dimension is dynamic state, so we either would have to waste all 32 (not reasonable) or guess (badly) and recompile. This is unfortunate. Because we can only spawn 16 thread instances, we can only use this mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH. This patch implements the new 8_PATCH TCS mode, but leaves us using SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to using 8_PATCH mode for testing and benchmarking purposes. We may want to consider using 8_PATCH mode in Vulkan in some cases. The data I've seen shows that 8_PATCH mode can be more efficient in some cases, but SINGLE_PATCH mode (the one we use today) is faster in other cases. Ultimately, the TES matters much more than the TCS for performance, so the decision may not matter much. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
*
* Next, we convert the vertex index to bytes by multiplying
* by the GRF size (by shifting), and add the two together. This is
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8 Our tessellation control shaders can be dispatched in several modes. - SINGLE_PATCH (Gen7+) processes a single patch per thread, with each channel corresponding to a different patch vertex. PATCHLIST_N will launch (N / 8) threads. If N is less than 8, some channels will be disabled, leaving some untapped hardware capabilities. Conditionals based on gl_InvocationID are non-uniform, which means that they'll often have to execute both paths. However, if there are fewer than 8 vertices, all invocations will happen within a single thread, so barriers can become no-ops, which is nice. We also burn a maximum of 4 registers for ICP handles, so we can compile without regard for the value of N. It also works in all cases. - DUAL_PATCH mode processes up to two patches at a time, where the first four channels come from patch 1, and the second group of four come from patch 2. This tries to provide better EU utilization for small patches (N <= 4). It cannot be used in all cases. - 8_PATCH mode processes 8 patches at a time, with a thread launched per vertex in the patch. Each channel corresponds to the same vertex, but in each of the 8 patches. This utilizes all channels even for small patches. It also makes conditions on gl_InvocationID uniform, leading to proper jumps. Barriers, unfortunately, become real. Worse, for PATCHLIST_N, the thread payload burns N registers for ICP handles. This can burn up to 32 registers, or 1/4 of our register file, for URB handles. For Vulkan (and DX), we know the number of vertices at compile time, so we can limit the amount of waste. In GL, the patch dimension is dynamic state, so we either would have to waste all 32 (not reasonable) or guess (badly) and recompile. This is unfortunate. Because we can only spawn 16 thread instances, we can only use this mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH. This patch implements the new 8_PATCH TCS mode, but leaves us using SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to using 8_PATCH mode for testing and benchmarking purposes. We may want to consider using 8_PATCH mode in Vulkan in some cases. The data I've seen shows that 8_PATCH mode can be more efficient in some cases, but SINGLE_PATCH mode (the one we use today) is faster in other cases. Ultimately, the TES matters much more than the TCS for performance, so the decision may not matter much. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
* the final indirect byte offset.
*/
fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
intel/compiler: Use subgroup invocation for ICP handle loads When loading a TCS or GS input, we generate some code to read the URB handle for a particular input control point (ICP handle), which often involves indirect addressing due to a non-constant vertex. For example: mov(8) vgrf148+0.0:UW, 76543210V shl(8) vgrf149:UD, vgrf148+0.0:UW, 2u shl(8) vgrf150:UD, vgrf145:UD, 5u add(8) vgrf151:UD, vgrf150:UD, vgrf149:UD mov_indirect(8) vgrf147:UD, g2:UD, vgrf151:UD, 96u Unfortunately, the first load with 76543210V is considered a partial write because the 8 channels of 16-bit UW data doesn't fill an entire register, and we can't allocate VGRFs at sub-register granularity. This causes none of the above math to be CSE'd, even though the first two instructions are common to *all* input loads, and the rest may be reused sometimes as well. To work around this, we stop emitting 76543210V to a temporary, and instead use nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], which already contains this value, and is unconditionally set up for us. With all input loads using the same register for the sequence, our CSE pass is able to eliminate the rest of the common math. shader-db results on Tigerlake: total instructions in shared programs: 20748243 -> 20744844 (-0.02%) instructions in affected programs: 73410 -> 70011 (-4.63%) helped: 242 / HURT: 21 helped stats (abs) min: 1 max: 37 x̄: 14.17 x̃: 15 helped stats (rel) min: 0.17% max: 19.58% x̄: 6.13% x̃: 6.32% HURT stats (abs) min: 1 max: 4 x̄: 1.38 x̃: 1 HURT stats (rel) min: 0.18% max: 1.31% x̄: 0.58% x̃: 0.58% 95% mean confidence interval for instructions value: -13.73 -12.12 95% mean confidence interval for instructions %-change: -6.00% -5.19% Instructions are helped. total cycles in shared programs: 785828951 -> 785788480 (<.01%) cycles in affected programs: 597593 -> 557122 (-6.77%) helped: 227 / HURT: 13 helped stats (abs) min: 6 max: 624 x̄: 182.19 x̃: 185 helped stats (rel) min: 0.24% max: 18.22% x̄: 7.85% x̃: 7.80% HURT stats (abs) min: 2 max: 153 x̄: 68.08 x̃: 36 HURT stats (rel) min: 0.03% max: 7.79% x̄: 2.97% x̃: 1.25% 95% mean confidence interval for cycles value: -182.55 -154.71 95% mean confidence interval for cycles %-change: -7.84% -6.69% Cycles are helped. Reviewed-by: Caio Oliveira <caio.oliveira@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18455>
2022-09-06 17:18:16 -07:00
fs_reg sequence = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8 Our tessellation control shaders can be dispatched in several modes. - SINGLE_PATCH (Gen7+) processes a single patch per thread, with each channel corresponding to a different patch vertex. PATCHLIST_N will launch (N / 8) threads. If N is less than 8, some channels will be disabled, leaving some untapped hardware capabilities. Conditionals based on gl_InvocationID are non-uniform, which means that they'll often have to execute both paths. However, if there are fewer than 8 vertices, all invocations will happen within a single thread, so barriers can become no-ops, which is nice. We also burn a maximum of 4 registers for ICP handles, so we can compile without regard for the value of N. It also works in all cases. - DUAL_PATCH mode processes up to two patches at a time, where the first four channels come from patch 1, and the second group of four come from patch 2. This tries to provide better EU utilization for small patches (N <= 4). It cannot be used in all cases. - 8_PATCH mode processes 8 patches at a time, with a thread launched per vertex in the patch. Each channel corresponds to the same vertex, but in each of the 8 patches. This utilizes all channels even for small patches. It also makes conditions on gl_InvocationID uniform, leading to proper jumps. Barriers, unfortunately, become real. Worse, for PATCHLIST_N, the thread payload burns N registers for ICP handles. This can burn up to 32 registers, or 1/4 of our register file, for URB handles. For Vulkan (and DX), we know the number of vertices at compile time, so we can limit the amount of waste. In GL, the patch dimension is dynamic state, so we either would have to waste all 32 (not reasonable) or guess (badly) and recompile. This is unfortunate. Because we can only spawn 16 thread instances, we can only use this mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH. This patch implements the new 8_PATCH TCS mode, but leaves us using SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to using 8_PATCH mode for testing and benchmarking purposes. We may want to consider using 8_PATCH mode in Vulkan in some cases. The data I've seen shows that 8_PATCH mode can be more efficient in some cases, but SINGLE_PATCH mode (the one we use today) is faster in other cases. Ultimately, the TES matters much more than the TCS for performance, so the decision may not matter much. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
/* Offsets will be 0, 4, 8, ... */
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8 Our tessellation control shaders can be dispatched in several modes. - SINGLE_PATCH (Gen7+) processes a single patch per thread, with each channel corresponding to a different patch vertex. PATCHLIST_N will launch (N / 8) threads. If N is less than 8, some channels will be disabled, leaving some untapped hardware capabilities. Conditionals based on gl_InvocationID are non-uniform, which means that they'll often have to execute both paths. However, if there are fewer than 8 vertices, all invocations will happen within a single thread, so barriers can become no-ops, which is nice. We also burn a maximum of 4 registers for ICP handles, so we can compile without regard for the value of N. It also works in all cases. - DUAL_PATCH mode processes up to two patches at a time, where the first four channels come from patch 1, and the second group of four come from patch 2. This tries to provide better EU utilization for small patches (N <= 4). It cannot be used in all cases. - 8_PATCH mode processes 8 patches at a time, with a thread launched per vertex in the patch. Each channel corresponds to the same vertex, but in each of the 8 patches. This utilizes all channels even for small patches. It also makes conditions on gl_InvocationID uniform, leading to proper jumps. Barriers, unfortunately, become real. Worse, for PATCHLIST_N, the thread payload burns N registers for ICP handles. This can burn up to 32 registers, or 1/4 of our register file, for URB handles. For Vulkan (and DX), we know the number of vertices at compile time, so we can limit the amount of waste. In GL, the patch dimension is dynamic state, so we either would have to waste all 32 (not reasonable) or guess (badly) and recompile. This is unfortunate. Because we can only spawn 16 thread instances, we can only use this mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH. This patch implements the new 8_PATCH TCS mode, but leaves us using SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to using 8_PATCH mode for testing and benchmarking purposes. We may want to consider using 8_PATCH mode in Vulkan in some cases. The data I've seen shows that 8_PATCH mode can be more efficient in some cases, but SINGLE_PATCH mode (the one we use today) is faster in other cases. Ultimately, the TES matters much more than the TCS for performance, so the decision may not matter much. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
/* Convert vertex_index to bytes (multiply by 32) */
assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* for ffs() */
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8 Our tessellation control shaders can be dispatched in several modes. - SINGLE_PATCH (Gen7+) processes a single patch per thread, with each channel corresponding to a different patch vertex. PATCHLIST_N will launch (N / 8) threads. If N is less than 8, some channels will be disabled, leaving some untapped hardware capabilities. Conditionals based on gl_InvocationID are non-uniform, which means that they'll often have to execute both paths. However, if there are fewer than 8 vertices, all invocations will happen within a single thread, so barriers can become no-ops, which is nice. We also burn a maximum of 4 registers for ICP handles, so we can compile without regard for the value of N. It also works in all cases. - DUAL_PATCH mode processes up to two patches at a time, where the first four channels come from patch 1, and the second group of four come from patch 2. This tries to provide better EU utilization for small patches (N <= 4). It cannot be used in all cases. - 8_PATCH mode processes 8 patches at a time, with a thread launched per vertex in the patch. Each channel corresponds to the same vertex, but in each of the 8 patches. This utilizes all channels even for small patches. It also makes conditions on gl_InvocationID uniform, leading to proper jumps. Barriers, unfortunately, become real. Worse, for PATCHLIST_N, the thread payload burns N registers for ICP handles. This can burn up to 32 registers, or 1/4 of our register file, for URB handles. For Vulkan (and DX), we know the number of vertices at compile time, so we can limit the amount of waste. In GL, the patch dimension is dynamic state, so we either would have to waste all 32 (not reasonable) or guess (badly) and recompile. This is unfortunate. Because we can only spawn 16 thread instances, we can only use this mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH. This patch implements the new 8_PATCH TCS mode, but leaves us using SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to using 8_PATCH mode for testing and benchmarking purposes. We may want to consider using 8_PATCH mode in Vulkan in some cases. The data I've seen shows that 8_PATCH mode can be more efficient in some cases, but SINGLE_PATCH mode (the one we use today) is faster in other cases. Ultimately, the TES matters much more than the TCS for performance, so the decision may not matter much. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
bld.SHL(vertex_offset_bytes,
retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
brw_imm_ud(ffs(grf_size_bytes) - 1));
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8 Our tessellation control shaders can be dispatched in several modes. - SINGLE_PATCH (Gen7+) processes a single patch per thread, with each channel corresponding to a different patch vertex. PATCHLIST_N will launch (N / 8) threads. If N is less than 8, some channels will be disabled, leaving some untapped hardware capabilities. Conditionals based on gl_InvocationID are non-uniform, which means that they'll often have to execute both paths. However, if there are fewer than 8 vertices, all invocations will happen within a single thread, so barriers can become no-ops, which is nice. We also burn a maximum of 4 registers for ICP handles, so we can compile without regard for the value of N. It also works in all cases. - DUAL_PATCH mode processes up to two patches at a time, where the first four channels come from patch 1, and the second group of four come from patch 2. This tries to provide better EU utilization for small patches (N <= 4). It cannot be used in all cases. - 8_PATCH mode processes 8 patches at a time, with a thread launched per vertex in the patch. Each channel corresponds to the same vertex, but in each of the 8 patches. This utilizes all channels even for small patches. It also makes conditions on gl_InvocationID uniform, leading to proper jumps. Barriers, unfortunately, become real. Worse, for PATCHLIST_N, the thread payload burns N registers for ICP handles. This can burn up to 32 registers, or 1/4 of our register file, for URB handles. For Vulkan (and DX), we know the number of vertices at compile time, so we can limit the amount of waste. In GL, the patch dimension is dynamic state, so we either would have to waste all 32 (not reasonable) or guess (badly) and recompile. This is unfortunate. Because we can only spawn 16 thread instances, we can only use this mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH. This patch implements the new 8_PATCH TCS mode, but leaves us using SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to using 8_PATCH mode for testing and benchmarking purposes. We may want to consider using 8_PATCH mode in Vulkan in some cases. The data I've seen shows that 8_PATCH mode can be more efficient in some cases, but SINGLE_PATCH mode (the one we use today) is faster in other cases. Ultimately, the TES matters much more than the TCS for performance, so the decision may not matter much. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
/* Use start of ICP handles as the base offset. There is one register
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8 Our tessellation control shaders can be dispatched in several modes. - SINGLE_PATCH (Gen7+) processes a single patch per thread, with each channel corresponding to a different patch vertex. PATCHLIST_N will launch (N / 8) threads. If N is less than 8, some channels will be disabled, leaving some untapped hardware capabilities. Conditionals based on gl_InvocationID are non-uniform, which means that they'll often have to execute both paths. However, if there are fewer than 8 vertices, all invocations will happen within a single thread, so barriers can become no-ops, which is nice. We also burn a maximum of 4 registers for ICP handles, so we can compile without regard for the value of N. It also works in all cases. - DUAL_PATCH mode processes up to two patches at a time, where the first four channels come from patch 1, and the second group of four come from patch 2. This tries to provide better EU utilization for small patches (N <= 4). It cannot be used in all cases. - 8_PATCH mode processes 8 patches at a time, with a thread launched per vertex in the patch. Each channel corresponds to the same vertex, but in each of the 8 patches. This utilizes all channels even for small patches. It also makes conditions on gl_InvocationID uniform, leading to proper jumps. Barriers, unfortunately, become real. Worse, for PATCHLIST_N, the thread payload burns N registers for ICP handles. This can burn up to 32 registers, or 1/4 of our register file, for URB handles. For Vulkan (and DX), we know the number of vertices at compile time, so we can limit the amount of waste. In GL, the patch dimension is dynamic state, so we either would have to waste all 32 (not reasonable) or guess (badly) and recompile. This is unfortunate. Because we can only spawn 16 thread instances, we can only use this mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH. This patch implements the new 8_PATCH TCS mode, but leaves us using SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to using 8_PATCH mode for testing and benchmarking purposes. We may want to consider using 8_PATCH mode in Vulkan in some cases. The data I've seen shows that 8_PATCH mode can be more efficient in some cases, but SINGLE_PATCH mode (the one we use today) is faster in other cases. Ultimately, the TES matters much more than the TCS for performance, so the decision may not matter much. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
* of URB handles per vertex, so inform the register allocator that
* we might read up to nir->info.gs.vertices_in registers.
*/
bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
icp_offset_bytes,
brw_imm_ud(brw_tcs_prog_key_input_vertices(tcs_key) *
grf_size_bytes));
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8 Our tessellation control shaders can be dispatched in several modes. - SINGLE_PATCH (Gen7+) processes a single patch per thread, with each channel corresponding to a different patch vertex. PATCHLIST_N will launch (N / 8) threads. If N is less than 8, some channels will be disabled, leaving some untapped hardware capabilities. Conditionals based on gl_InvocationID are non-uniform, which means that they'll often have to execute both paths. However, if there are fewer than 8 vertices, all invocations will happen within a single thread, so barriers can become no-ops, which is nice. We also burn a maximum of 4 registers for ICP handles, so we can compile without regard for the value of N. It also works in all cases. - DUAL_PATCH mode processes up to two patches at a time, where the first four channels come from patch 1, and the second group of four come from patch 2. This tries to provide better EU utilization for small patches (N <= 4). It cannot be used in all cases. - 8_PATCH mode processes 8 patches at a time, with a thread launched per vertex in the patch. Each channel corresponds to the same vertex, but in each of the 8 patches. This utilizes all channels even for small patches. It also makes conditions on gl_InvocationID uniform, leading to proper jumps. Barriers, unfortunately, become real. Worse, for PATCHLIST_N, the thread payload burns N registers for ICP handles. This can burn up to 32 registers, or 1/4 of our register file, for URB handles. For Vulkan (and DX), we know the number of vertices at compile time, so we can limit the amount of waste. In GL, the patch dimension is dynamic state, so we either would have to waste all 32 (not reasonable) or guess (badly) and recompile. This is unfortunate. Because we can only spawn 16 thread instances, we can only use this mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH. This patch implements the new 8_PATCH TCS mode, but leaves us using SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to using 8_PATCH mode for testing and benchmarking purposes. We may want to consider using 8_PATCH mode in Vulkan in some cases. The data I've seen shows that 8_PATCH mode can be more efficient in some cases, but SINGLE_PATCH mode (the one we use today) is faster in other cases. Ultimately, the TES matters much more than the TCS for performance, so the decision may not matter much. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
return icp_handle;
}
void
fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
nir_intrinsic_instr *instr)
{
assert(stage == MESA_SHADER_TESS_CTRL);
struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8 Our tessellation control shaders can be dispatched in several modes. - SINGLE_PATCH (Gen7+) processes a single patch per thread, with each channel corresponding to a different patch vertex. PATCHLIST_N will launch (N / 8) threads. If N is less than 8, some channels will be disabled, leaving some untapped hardware capabilities. Conditionals based on gl_InvocationID are non-uniform, which means that they'll often have to execute both paths. However, if there are fewer than 8 vertices, all invocations will happen within a single thread, so barriers can become no-ops, which is nice. We also burn a maximum of 4 registers for ICP handles, so we can compile without regard for the value of N. It also works in all cases. - DUAL_PATCH mode processes up to two patches at a time, where the first four channels come from patch 1, and the second group of four come from patch 2. This tries to provide better EU utilization for small patches (N <= 4). It cannot be used in all cases. - 8_PATCH mode processes 8 patches at a time, with a thread launched per vertex in the patch. Each channel corresponds to the same vertex, but in each of the 8 patches. This utilizes all channels even for small patches. It also makes conditions on gl_InvocationID uniform, leading to proper jumps. Barriers, unfortunately, become real. Worse, for PATCHLIST_N, the thread payload burns N registers for ICP handles. This can burn up to 32 registers, or 1/4 of our register file, for URB handles. For Vulkan (and DX), we know the number of vertices at compile time, so we can limit the amount of waste. In GL, the patch dimension is dynamic state, so we either would have to waste all 32 (not reasonable) or guess (badly) and recompile. This is unfortunate. Because we can only spawn 16 thread instances, we can only use this mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH. This patch implements the new 8_PATCH TCS mode, but leaves us using SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to using 8_PATCH mode for testing and benchmarking purposes. We may want to consider using 8_PATCH mode in Vulkan in some cases. The data I've seen shows that 8_PATCH mode can be more efficient in some cases, but SINGLE_PATCH mode (the one we use today) is faster in other cases. Ultimately, the TES matters much more than the TCS for performance, so the decision may not matter much. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
fs_reg dst;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
dst = get_nir_def(instr->def);
switch (instr->intrinsic) {
case nir_intrinsic_load_primitive_id:
bld.MOV(dst, tcs_payload().primitive_id);
break;
case nir_intrinsic_load_invocation_id:
bld.MOV(retype(dst, invocation_id.type), invocation_id);
break;
case nir_intrinsic_barrier:
if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
nir_emit_intrinsic(bld, instr);
if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
if (tcs_prog_data->instances != 1)
emit_tcs_barrier();
}
break;
case nir_intrinsic_load_input:
unreachable("nir_lower_io should never give us these.");
break;
case nir_intrinsic_load_per_vertex_input: {
assert(instr->def.bit_size == 32);
fs_reg indirect_offset = get_indirect_offset(instr);
unsigned imm_offset = nir_intrinsic_base(instr);
fs_inst *inst;
const bool multi_patch =
vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_MULTI_PATCH;
fs_reg icp_handle = multi_patch ?
get_tcs_multi_patch_icp_handle(bld, instr) :
get_tcs_single_patch_icp_handle(bld, instr);
/* We can only read two double components with each URB read, so
* we send two read messages in that case, each one loading up to
* two double components.
*/
unsigned num_components = instr->num_components;
unsigned first_component = nir_intrinsic_component(instr);
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
if (indirect_offset.file == BAD_FILE) {
/* Constant indexing - use global offset. */
if (first_component != 0) {
unsigned read_components = num_components + first_component;
fs_reg tmp = bld.vgrf(dst.type, read_components);
inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
ARRAY_SIZE(srcs));
for (unsigned i = 0; i < num_components; i++) {
bld.MOV(offset(dst, bld, i),
offset(tmp, bld, i + first_component));
}
} else {
inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
ARRAY_SIZE(srcs));
}
inst->offset = imm_offset;
} else {
/* Indirect indexing - use per-slot offsets as well. */
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
if (first_component != 0) {
unsigned read_components = num_components + first_component;
fs_reg tmp = bld.vgrf(dst.type, read_components);
inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
srcs, ARRAY_SIZE(srcs));
for (unsigned i = 0; i < num_components; i++) {
bld.MOV(offset(dst, bld, i),
offset(tmp, bld, i + first_component));
}
} else {
inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
srcs, ARRAY_SIZE(srcs));
}
inst->offset = imm_offset;
}
inst->size_written = (num_components + first_component) *
inst->dst.component_size(inst->exec_size);
/* Copy the temporary to the destination to deal with writemasking.
*
* Also attempt to deal with gl_PointSize being in the .w component.
*/
if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
assert(type_sz(dst.type) == 4);
inst->dst = bld.vgrf(dst.type, 4);
inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
bld.MOV(dst, offset(inst->dst, bld, 3));
}
break;
}
case nir_intrinsic_load_output:
case nir_intrinsic_load_per_vertex_output: {
assert(instr->def.bit_size == 32);
fs_reg indirect_offset = get_indirect_offset(instr);
unsigned imm_offset = nir_intrinsic_base(instr);
unsigned first_component = nir_intrinsic_component(instr);
fs_inst *inst;
if (indirect_offset.file == BAD_FILE) {
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8 Our tessellation control shaders can be dispatched in several modes. - SINGLE_PATCH (Gen7+) processes a single patch per thread, with each channel corresponding to a different patch vertex. PATCHLIST_N will launch (N / 8) threads. If N is less than 8, some channels will be disabled, leaving some untapped hardware capabilities. Conditionals based on gl_InvocationID are non-uniform, which means that they'll often have to execute both paths. However, if there are fewer than 8 vertices, all invocations will happen within a single thread, so barriers can become no-ops, which is nice. We also burn a maximum of 4 registers for ICP handles, so we can compile without regard for the value of N. It also works in all cases. - DUAL_PATCH mode processes up to two patches at a time, where the first four channels come from patch 1, and the second group of four come from patch 2. This tries to provide better EU utilization for small patches (N <= 4). It cannot be used in all cases. - 8_PATCH mode processes 8 patches at a time, with a thread launched per vertex in the patch. Each channel corresponds to the same vertex, but in each of the 8 patches. This utilizes all channels even for small patches. It also makes conditions on gl_InvocationID uniform, leading to proper jumps. Barriers, unfortunately, become real. Worse, for PATCHLIST_N, the thread payload burns N registers for ICP handles. This can burn up to 32 registers, or 1/4 of our register file, for URB handles. For Vulkan (and DX), we know the number of vertices at compile time, so we can limit the amount of waste. In GL, the patch dimension is dynamic state, so we either would have to waste all 32 (not reasonable) or guess (badly) and recompile. This is unfortunate. Because we can only spawn 16 thread instances, we can only use this mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH. This patch implements the new 8_PATCH TCS mode, but leaves us using SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to using 8_PATCH mode for testing and benchmarking purposes. We may want to consider using 8_PATCH mode in Vulkan in some cases. The data I've seen shows that 8_PATCH mode can be more efficient in some cases, but SINGLE_PATCH mode (the one we use today) is faster in other cases. Ultimately, the TES matters much more than the TCS for performance, so the decision may not matter much. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
/* This MOV replicates the output handle to all enabled channels
* is SINGLE_PATCH mode.
*/
fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
bld.MOV(patch_handle, tcs_payload().patch_urb_output);
{
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = patch_handle;
if (first_component != 0) {
unsigned read_components =
instr->num_components + first_component;
fs_reg tmp = bld.vgrf(dst.type, read_components);
inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
srcs, ARRAY_SIZE(srcs));
inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
for (unsigned i = 0; i < instr->num_components; i++) {
bld.MOV(offset(dst, bld, i),
offset(tmp, bld, i + first_component));
}
} else {
inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
srcs, ARRAY_SIZE(srcs));
inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
}
inst->offset = imm_offset;
}
} else {
/* Indirect indexing - use per-slot offsets as well. */
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output;
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
if (first_component != 0) {
unsigned read_components =
instr->num_components + first_component;
fs_reg tmp = bld.vgrf(dst.type, read_components);
inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
srcs, ARRAY_SIZE(srcs));
inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
for (unsigned i = 0; i < instr->num_components; i++) {
bld.MOV(offset(dst, bld, i),
offset(tmp, bld, i + first_component));
}
} else {
inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
srcs, ARRAY_SIZE(srcs));
inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
}
inst->offset = imm_offset;
}
break;
}
case nir_intrinsic_store_output:
case nir_intrinsic_store_per_vertex_output: {
assert(nir_src_bit_size(instr->src[0]) == 32);
fs_reg value = get_nir_src(instr->src[0]);
fs_reg indirect_offset = get_indirect_offset(instr);
unsigned imm_offset = nir_intrinsic_base(instr);
unsigned mask = nir_intrinsic_write_mask(instr);
if (mask == 0)
break;
unsigned num_components = util_last_bit(mask);
unsigned first_component = nir_intrinsic_component(instr);
assert((first_component + num_components) <= 4);
mask = mask << first_component;
const bool has_urb_lsc = devinfo->ver >= 20;
fs_reg mask_reg;
if (mask != WRITEMASK_XYZW)
mask_reg = brw_imm_ud(mask << 16);
fs_reg sources[4];
unsigned m = has_urb_lsc ? 0 : first_component;
for (unsigned i = 0; i < num_components; i++) {
int c = i + first_component;
if (mask & (1 << c)) {
sources[m++] = offset(value, bld, i);
} else if (devinfo->ver < 20) {
m++;
}
}
assert(has_urb_lsc || m == (first_component + num_components));
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output;
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg;
srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_F, m);
srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(m);
bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, m, 0);
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
srcs, ARRAY_SIZE(srcs));
inst->offset = imm_offset;
break;
}
default:
nir_emit_intrinsic(bld, instr);
break;
}
}
void
fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
nir_intrinsic_instr *instr)
{
assert(stage == MESA_SHADER_TESS_EVAL);
struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
fs_reg dest;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
dest = get_nir_def(instr->def);
switch (instr->intrinsic) {
case nir_intrinsic_load_primitive_id:
bld.MOV(dest, tes_payload().primitive_id);
break;
case nir_intrinsic_load_tess_coord:
for (unsigned i = 0; i < 3; i++)
bld.MOV(offset(dest, bld, i), tes_payload().coords[i]);
break;
case nir_intrinsic_load_input:
case nir_intrinsic_load_per_vertex_input: {
assert(instr->def.bit_size == 32);
fs_reg indirect_offset = get_indirect_offset(instr);
unsigned imm_offset = nir_intrinsic_base(instr);
unsigned first_component = nir_intrinsic_component(instr);
fs_inst *inst;
if (indirect_offset.file == BAD_FILE) {
i965: Push most TES inputs in SIMD8 mode. Using the push model for inputs is much more efficient than pulling inputs - the hardware can simply copy a large chunk into URB registers at thread creation time, rather than having the thread send messages to request data from the L3 cache. Unfortunately, it's possible to have more TES inputs than fit in registers, so we have to fall back to the pull model in some cases. However, it turns out that most tessellation evaluation shaders are fairly simple, and don't use many inputs. An arbitrary cut-off of 32 vec4 slots (16 registers) is more than sufficient to ensure that 100% of TES inputs are pushed for Shadow of Mordor, Unigine Heaven, GPUTest/TessMark, and SynMark. Note that unlike most SIMD8 stages, this actually reads packed vec4 data, since that is what our vec4 TCS programs write. Improves performance in GPUTest's tessmark_x64 microbenchmark by 93.4426% +/- 5.35541% (n = 25) on my Lenovo X250 at 1024x768. Improves performance in Synmark's Gl40TerrainFlyTess microbenchmark by 22.74% +/- 0.309394% (n = 5). Improves performance in Shadow of Mordor at low settings with tessellation enabled at 1280x720 by 2.12197% +/- 0.478553% (n = 4). shader-db statistics for files containing tessellation shaders: total instructions in shared programs: 184358 -> 181181 (-1.72%) instructions in affected programs: 27971 -> 24794 (-11.36%) helped: 226 Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Matt Turner <mattst88@gmail.com>
2015-12-27 17:26:30 -08:00
/* Arbitrarily only push up to 32 vec4 slots worth of data,
* which is 16 registers (since each holds 2 vec4 slots).
*/
const unsigned max_push_slots = 32;
if (imm_offset < max_push_slots) {
i965: Push most TES inputs in SIMD8 mode. Using the push model for inputs is much more efficient than pulling inputs - the hardware can simply copy a large chunk into URB registers at thread creation time, rather than having the thread send messages to request data from the L3 cache. Unfortunately, it's possible to have more TES inputs than fit in registers, so we have to fall back to the pull model in some cases. However, it turns out that most tessellation evaluation shaders are fairly simple, and don't use many inputs. An arbitrary cut-off of 32 vec4 slots (16 registers) is more than sufficient to ensure that 100% of TES inputs are pushed for Shadow of Mordor, Unigine Heaven, GPUTest/TessMark, and SynMark. Note that unlike most SIMD8 stages, this actually reads packed vec4 data, since that is what our vec4 TCS programs write. Improves performance in GPUTest's tessmark_x64 microbenchmark by 93.4426% +/- 5.35541% (n = 25) on my Lenovo X250 at 1024x768. Improves performance in Synmark's Gl40TerrainFlyTess microbenchmark by 22.74% +/- 0.309394% (n = 5). Improves performance in Shadow of Mordor at low settings with tessellation enabled at 1280x720 by 2.12197% +/- 0.478553% (n = 4). shader-db statistics for files containing tessellation shaders: total instructions in shared programs: 184358 -> 181181 (-1.72%) instructions in affected programs: 27971 -> 24794 (-11.36%) helped: 226 Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Matt Turner <mattst88@gmail.com>
2015-12-27 17:26:30 -08:00
fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
for (int i = 0; i < instr->num_components; i++) {
unsigned comp = 4 * (imm_offset % 2) + i + first_component;
bld.MOV(offset(dest, bld, i), component(src, comp));
i965: Push most TES inputs in SIMD8 mode. Using the push model for inputs is much more efficient than pulling inputs - the hardware can simply copy a large chunk into URB registers at thread creation time, rather than having the thread send messages to request data from the L3 cache. Unfortunately, it's possible to have more TES inputs than fit in registers, so we have to fall back to the pull model in some cases. However, it turns out that most tessellation evaluation shaders are fairly simple, and don't use many inputs. An arbitrary cut-off of 32 vec4 slots (16 registers) is more than sufficient to ensure that 100% of TES inputs are pushed for Shadow of Mordor, Unigine Heaven, GPUTest/TessMark, and SynMark. Note that unlike most SIMD8 stages, this actually reads packed vec4 data, since that is what our vec4 TCS programs write. Improves performance in GPUTest's tessmark_x64 microbenchmark by 93.4426% +/- 5.35541% (n = 25) on my Lenovo X250 at 1024x768. Improves performance in Synmark's Gl40TerrainFlyTess microbenchmark by 22.74% +/- 0.309394% (n = 5). Improves performance in Shadow of Mordor at low settings with tessellation enabled at 1280x720 by 2.12197% +/- 0.478553% (n = 4). shader-db statistics for files containing tessellation shaders: total instructions in shared programs: 184358 -> 181181 (-1.72%) instructions in affected programs: 27971 -> 24794 (-11.36%) helped: 226 Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Matt Turner <mattst88@gmail.com>
2015-12-27 17:26:30 -08:00
}
i965: Push most TES inputs in SIMD8 mode. Using the push model for inputs is much more efficient than pulling inputs - the hardware can simply copy a large chunk into URB registers at thread creation time, rather than having the thread send messages to request data from the L3 cache. Unfortunately, it's possible to have more TES inputs than fit in registers, so we have to fall back to the pull model in some cases. However, it turns out that most tessellation evaluation shaders are fairly simple, and don't use many inputs. An arbitrary cut-off of 32 vec4 slots (16 registers) is more than sufficient to ensure that 100% of TES inputs are pushed for Shadow of Mordor, Unigine Heaven, GPUTest/TessMark, and SynMark. Note that unlike most SIMD8 stages, this actually reads packed vec4 data, since that is what our vec4 TCS programs write. Improves performance in GPUTest's tessmark_x64 microbenchmark by 93.4426% +/- 5.35541% (n = 25) on my Lenovo X250 at 1024x768. Improves performance in Synmark's Gl40TerrainFlyTess microbenchmark by 22.74% +/- 0.309394% (n = 5). Improves performance in Shadow of Mordor at low settings with tessellation enabled at 1280x720 by 2.12197% +/- 0.478553% (n = 4). shader-db statistics for files containing tessellation shaders: total instructions in shared programs: 184358 -> 181181 (-1.72%) instructions in affected programs: 27971 -> 24794 (-11.36%) helped: 226 Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Matt Turner <mattst88@gmail.com>
2015-12-27 17:26:30 -08:00
tes_prog_data->base.urb_read_length =
MAX2(tes_prog_data->base.urb_read_length,
(imm_offset / 2) + 1);
i965: Push most TES inputs in SIMD8 mode. Using the push model for inputs is much more efficient than pulling inputs - the hardware can simply copy a large chunk into URB registers at thread creation time, rather than having the thread send messages to request data from the L3 cache. Unfortunately, it's possible to have more TES inputs than fit in registers, so we have to fall back to the pull model in some cases. However, it turns out that most tessellation evaluation shaders are fairly simple, and don't use many inputs. An arbitrary cut-off of 32 vec4 slots (16 registers) is more than sufficient to ensure that 100% of TES inputs are pushed for Shadow of Mordor, Unigine Heaven, GPUTest/TessMark, and SynMark. Note that unlike most SIMD8 stages, this actually reads packed vec4 data, since that is what our vec4 TCS programs write. Improves performance in GPUTest's tessmark_x64 microbenchmark by 93.4426% +/- 5.35541% (n = 25) on my Lenovo X250 at 1024x768. Improves performance in Synmark's Gl40TerrainFlyTess microbenchmark by 22.74% +/- 0.309394% (n = 5). Improves performance in Shadow of Mordor at low settings with tessellation enabled at 1280x720 by 2.12197% +/- 0.478553% (n = 4). shader-db statistics for files containing tessellation shaders: total instructions in shared programs: 184358 -> 181181 (-1.72%) instructions in affected programs: 27971 -> 24794 (-11.36%) helped: 226 Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Matt Turner <mattst88@gmail.com>
2015-12-27 17:26:30 -08:00
} else {
/* Replicate the patch handle to all enabled channels */
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = tes_payload().patch_urb_input;
i965: Push most TES inputs in SIMD8 mode. Using the push model for inputs is much more efficient than pulling inputs - the hardware can simply copy a large chunk into URB registers at thread creation time, rather than having the thread send messages to request data from the L3 cache. Unfortunately, it's possible to have more TES inputs than fit in registers, so we have to fall back to the pull model in some cases. However, it turns out that most tessellation evaluation shaders are fairly simple, and don't use many inputs. An arbitrary cut-off of 32 vec4 slots (16 registers) is more than sufficient to ensure that 100% of TES inputs are pushed for Shadow of Mordor, Unigine Heaven, GPUTest/TessMark, and SynMark. Note that unlike most SIMD8 stages, this actually reads packed vec4 data, since that is what our vec4 TCS programs write. Improves performance in GPUTest's tessmark_x64 microbenchmark by 93.4426% +/- 5.35541% (n = 25) on my Lenovo X250 at 1024x768. Improves performance in Synmark's Gl40TerrainFlyTess microbenchmark by 22.74% +/- 0.309394% (n = 5). Improves performance in Shadow of Mordor at low settings with tessellation enabled at 1280x720 by 2.12197% +/- 0.478553% (n = 4). shader-db statistics for files containing tessellation shaders: total instructions in shared programs: 184358 -> 181181 (-1.72%) instructions in affected programs: 27971 -> 24794 (-11.36%) helped: 226 Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Matt Turner <mattst88@gmail.com>
2015-12-27 17:26:30 -08:00
if (first_component != 0) {
unsigned read_components =
instr->num_components + first_component;
fs_reg tmp = bld.vgrf(dest.type, read_components);
inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
srcs, ARRAY_SIZE(srcs));
inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
for (unsigned i = 0; i < instr->num_components; i++) {
bld.MOV(offset(dest, bld, i),
offset(tmp, bld, i + first_component));
}
} else {
inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest,
srcs, ARRAY_SIZE(srcs));
inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
}
i965: Push most TES inputs in SIMD8 mode. Using the push model for inputs is much more efficient than pulling inputs - the hardware can simply copy a large chunk into URB registers at thread creation time, rather than having the thread send messages to request data from the L3 cache. Unfortunately, it's possible to have more TES inputs than fit in registers, so we have to fall back to the pull model in some cases. However, it turns out that most tessellation evaluation shaders are fairly simple, and don't use many inputs. An arbitrary cut-off of 32 vec4 slots (16 registers) is more than sufficient to ensure that 100% of TES inputs are pushed for Shadow of Mordor, Unigine Heaven, GPUTest/TessMark, and SynMark. Note that unlike most SIMD8 stages, this actually reads packed vec4 data, since that is what our vec4 TCS programs write. Improves performance in GPUTest's tessmark_x64 microbenchmark by 93.4426% +/- 5.35541% (n = 25) on my Lenovo X250 at 1024x768. Improves performance in Synmark's Gl40TerrainFlyTess microbenchmark by 22.74% +/- 0.309394% (n = 5). Improves performance in Shadow of Mordor at low settings with tessellation enabled at 1280x720 by 2.12197% +/- 0.478553% (n = 4). shader-db statistics for files containing tessellation shaders: total instructions in shared programs: 184358 -> 181181 (-1.72%) instructions in affected programs: 27971 -> 24794 (-11.36%) helped: 226 Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Matt Turner <mattst88@gmail.com>
2015-12-27 17:26:30 -08:00
inst->offset = imm_offset;
}
} else {
/* Indirect indexing - use per-slot offsets as well. */
/* We can only read two double components with each URB read, so
* we send two read messages in that case, each one loading up to
* two double components.
*/
unsigned num_components = instr->num_components;
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
srcs[URB_LOGICAL_SRC_HANDLE] = tes_payload().patch_urb_input;
srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
if (first_component != 0) {
unsigned read_components =
num_components + first_component;
fs_reg tmp = bld.vgrf(dest.type, read_components);
inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
srcs, ARRAY_SIZE(srcs));
for (unsigned i = 0; i < num_components; i++) {
bld.MOV(offset(dest, bld, i),
offset(tmp, bld, i + first_component));
}
} else {
inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest,
srcs, ARRAY_SIZE(srcs));
}
inst->offset = imm_offset;
inst->size_written = (num_components + first_component) *
inst->dst.component_size(inst->exec_size);
}
break;
}
default:
nir_emit_intrinsic(bld, instr);
break;
}
}
void
fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
nir_intrinsic_instr *instr)
{
assert(stage == MESA_SHADER_GEOMETRY);
fs_reg indirect_offset;
fs_reg dest;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
dest = get_nir_def(instr->def);
switch (instr->intrinsic) {
case nir_intrinsic_load_primitive_id:
assert(stage == MESA_SHADER_GEOMETRY);
assert(brw_gs_prog_data(prog_data)->include_primitive_id);
bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), gs_payload().primitive_id);
break;
case nir_intrinsic_load_input:
unreachable("load_input intrinsics are invalid for the GS stage");
case nir_intrinsic_load_per_vertex_input:
emit_gs_input_load(dest, instr->src[0], nir_intrinsic_base(instr),
instr->src[1], instr->num_components,
nir_intrinsic_component(instr));
break;
case nir_intrinsic_emit_vertex_with_counter:
emit_gs_vertex(instr->src[0], nir_intrinsic_stream_id(instr));
break;
case nir_intrinsic_end_primitive_with_counter:
emit_gs_end_primitive(instr->src[0]);
break;
case nir_intrinsic_set_vertex_and_primitive_count:
bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
break;
case nir_intrinsic_load_invocation_id: {
fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
assert(val.file != BAD_FILE);
dest.type = val.type;
bld.MOV(dest, val);
break;
}
default:
nir_emit_intrinsic(bld, instr);
break;
}
}
/**
* Fetch the current render target layer index.
*/
static fs_reg
fetch_render_target_array_index(const fs_builder &bld)
{
if (bld.shader->devinfo->ver >= 12) {
/* The render target array index is provided in the thread payload as
* bits 26:16 of r1.1.
*/
const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3),
brw_imm_uw(0x7ff));
return idx;
} else if (bld.shader->devinfo->ver >= 6) {
/* The render target array index is provided in the thread payload as
* bits 26:16 of r0.0.
*/
const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),
brw_imm_uw(0x7ff));
return idx;
} else {
/* Pre-SNB we only ever render into the first layer of the framebuffer
* since layered rendering is not implemented.
*/
return brw_imm_ud(0);
}
}
/**
* Fake non-coherent framebuffer read implemented using TXF to fetch from the
* framebuffer at the current fragment coordinates and sample index.
*/
fs_inst *
fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
unsigned target)
{
const struct intel_device_info *devinfo = bld.shader->devinfo;
assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
const brw_wm_prog_key *wm_key =
reinterpret_cast<const brw_wm_prog_key *>(key);
assert(!wm_key->coherent_fb_fetch);
/* Calculate the fragment coordinates. */
const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
bld.MOV(offset(coords, bld, 0), pixel_x);
bld.MOV(offset(coords, bld, 1), pixel_y);
bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
/* Calculate the sample index and MCS payload when multisampling. Luckily
* the MCS fetch message behaves deterministically for UMS surfaces, so it
* shouldn't be necessary to recompile based on whether the framebuffer is
* CMS or UMS.
*/
assert(wm_key->multisample_fbo == BRW_ALWAYS ||
wm_key->multisample_fbo == BRW_NEVER);
if (wm_key->multisample_fbo &&
nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup();
const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
const fs_reg mcs = wm_key->multisample_fbo ?
emit_mcs_fetch(coords, 3, brw_imm_ud(target), fs_reg()) : fs_reg();
/* Use either a normal or a CMS texel fetch message depending on whether
* the framebuffer is single or multisample. On SKL+ use the wide CMS
* message just in case the framebuffer uses 16x multisampling, it should
* be equivalent to the normal CMS fetch for lower multisampling modes.
*/
opcode op;
if (wm_key->multisample_fbo) {
/* On SKL+ use the wide CMS message just in case the framebuffer uses 16x
* multisampling, it should be equivalent to the normal CMS fetch for
* lower multisampling modes.
*
* On Gfx12HP, there is only CMS_W variant available.
*/
if (devinfo->verx10 >= 125)
op = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
else if (devinfo->ver >= 9)
op = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
else
op = SHADER_OPCODE_TXF_CMS_LOGICAL;
} else {
op = SHADER_OPCODE_TXF_LOGICAL;
}
/* Emit the instruction. */
fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
srcs[TEX_LOGICAL_SRC_COORDINATE] = coords;
srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0);
srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = sample;
srcs[TEX_LOGICAL_SRC_MCS] = mcs;
srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(target);
srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0);
srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3);
srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_ud(0);
srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(0);
fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
return inst;
}
/**
* Actual coherent framebuffer read implemented using the native render target
* read message. Requires SKL+.
*/
static fs_inst *
emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
{
assert(bld.shader->devinfo->ver >= 9);
fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
inst->target = target;
inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
return inst;
}
i965/fs: Rework representation of fragment output locations in NIR. The problem with the current approach is that driver output locations are represented as a linear offset within the nir_outputs array, which makes it rather difficult for the back-end to figure out what color output and index some nir_intrinsic_load/store_output was meant for, because the offset of a given output within the nir_output array is dependent on the type and size of all previously allocated outputs. Instead this defines the driver location of an output to be the pair formed by its GLSL-assigned location and index (I've borrowed the bitfield macros from brw_defines.h in order to represent the pair of integers as a single scalar value that can be assigned to nir_variable_data::driver_location). nir_assign_var_locations is no longer useful for fragment outputs. Because fragment outputs are now allocated independently rather than within the nir_outputs array, the get_frag_output() helper becomes necessary in order to obtain the right temporary register for a given location-index pair. The type_size helper passed to nir_lower_io is now type_size_dvec4 rather than type_size_vec4_times_4 so that output array offsets are provided in terms of whole array elements rather than in terms of scalar components (dvec4 is the largest vector type supported by the GLSL so this will cause all individual fragment outputs to have a size of one regardless of the type). Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2016-07-21 21:26:20 -07:00
static fs_reg
alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
{
if (n && regs[0].file != BAD_FILE) {
return regs[0];
} else {
const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);
for (unsigned i = 0; i < n; i++)
regs[i] = tmp;
return tmp;
}
}
static fs_reg
alloc_frag_output(fs_visitor *v, unsigned location)
i965/fs: Rework representation of fragment output locations in NIR. The problem with the current approach is that driver output locations are represented as a linear offset within the nir_outputs array, which makes it rather difficult for the back-end to figure out what color output and index some nir_intrinsic_load/store_output was meant for, because the offset of a given output within the nir_output array is dependent on the type and size of all previously allocated outputs. Instead this defines the driver location of an output to be the pair formed by its GLSL-assigned location and index (I've borrowed the bitfield macros from brw_defines.h in order to represent the pair of integers as a single scalar value that can be assigned to nir_variable_data::driver_location). nir_assign_var_locations is no longer useful for fragment outputs. Because fragment outputs are now allocated independently rather than within the nir_outputs array, the get_frag_output() helper becomes necessary in order to obtain the right temporary register for a given location-index pair. The type_size helper passed to nir_lower_io is now type_size_dvec4 rather than type_size_vec4_times_4 so that output array offsets are provided in terms of whole array elements rather than in terms of scalar components (dvec4 is the largest vector type supported by the GLSL so this will cause all individual fragment outputs to have a size of one regardless of the type). Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2016-07-21 21:26:20 -07:00
{
assert(v->stage == MESA_SHADER_FRAGMENT);
const brw_wm_prog_key *const key =
reinterpret_cast<const brw_wm_prog_key *>(v->key);
const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
i965/fs: Rework representation of fragment output locations in NIR. The problem with the current approach is that driver output locations are represented as a linear offset within the nir_outputs array, which makes it rather difficult for the back-end to figure out what color output and index some nir_intrinsic_load/store_output was meant for, because the offset of a given output within the nir_output array is dependent on the type and size of all previously allocated outputs. Instead this defines the driver location of an output to be the pair formed by its GLSL-assigned location and index (I've borrowed the bitfield macros from brw_defines.h in order to represent the pair of integers as a single scalar value that can be assigned to nir_variable_data::driver_location). nir_assign_var_locations is no longer useful for fragment outputs. Because fragment outputs are now allocated independently rather than within the nir_outputs array, the get_frag_output() helper becomes necessary in order to obtain the right temporary register for a given location-index pair. The type_size helper passed to nir_lower_io is now type_size_dvec4 rather than type_size_vec4_times_4 so that output array offsets are provided in terms of whole array elements rather than in terms of scalar components (dvec4 is the largest vector type supported by the GLSL so this will cause all individual fragment outputs to have a size of one regardless of the type). Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2016-07-21 21:26:20 -07:00
else if (l == FRAG_RESULT_COLOR)
return alloc_temporary(v->bld, 4, v->outputs,
MAX2(key->nr_color_regions, 1));
i965/fs: Rework representation of fragment output locations in NIR. The problem with the current approach is that driver output locations are represented as a linear offset within the nir_outputs array, which makes it rather difficult for the back-end to figure out what color output and index some nir_intrinsic_load/store_output was meant for, because the offset of a given output within the nir_output array is dependent on the type and size of all previously allocated outputs. Instead this defines the driver location of an output to be the pair formed by its GLSL-assigned location and index (I've borrowed the bitfield macros from brw_defines.h in order to represent the pair of integers as a single scalar value that can be assigned to nir_variable_data::driver_location). nir_assign_var_locations is no longer useful for fragment outputs. Because fragment outputs are now allocated independently rather than within the nir_outputs array, the get_frag_output() helper becomes necessary in order to obtain the right temporary register for a given location-index pair. The type_size helper passed to nir_lower_io is now type_size_dvec4 rather than type_size_vec4_times_4 so that output array offsets are provided in terms of whole array elements rather than in terms of scalar components (dvec4 is the largest vector type supported by the GLSL so this will cause all individual fragment outputs to have a size of one regardless of the type). Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2016-07-21 21:26:20 -07:00
else if (l == FRAG_RESULT_DEPTH)
return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
i965/fs: Rework representation of fragment output locations in NIR. The problem with the current approach is that driver output locations are represented as a linear offset within the nir_outputs array, which makes it rather difficult for the back-end to figure out what color output and index some nir_intrinsic_load/store_output was meant for, because the offset of a given output within the nir_output array is dependent on the type and size of all previously allocated outputs. Instead this defines the driver location of an output to be the pair formed by its GLSL-assigned location and index (I've borrowed the bitfield macros from brw_defines.h in order to represent the pair of integers as a single scalar value that can be assigned to nir_variable_data::driver_location). nir_assign_var_locations is no longer useful for fragment outputs. Because fragment outputs are now allocated independently rather than within the nir_outputs array, the get_frag_output() helper becomes necessary in order to obtain the right temporary register for a given location-index pair. The type_size helper passed to nir_lower_io is now type_size_dvec4 rather than type_size_vec4_times_4 so that output array offsets are provided in terms of whole array elements rather than in terms of scalar components (dvec4 is the largest vector type supported by the GLSL so this will cause all individual fragment outputs to have a size of one regardless of the type). Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2016-07-21 21:26:20 -07:00
else if (l == FRAG_RESULT_STENCIL)
return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
i965/fs: Rework representation of fragment output locations in NIR. The problem with the current approach is that driver output locations are represented as a linear offset within the nir_outputs array, which makes it rather difficult for the back-end to figure out what color output and index some nir_intrinsic_load/store_output was meant for, because the offset of a given output within the nir_output array is dependent on the type and size of all previously allocated outputs. Instead this defines the driver location of an output to be the pair formed by its GLSL-assigned location and index (I've borrowed the bitfield macros from brw_defines.h in order to represent the pair of integers as a single scalar value that can be assigned to nir_variable_data::driver_location). nir_assign_var_locations is no longer useful for fragment outputs. Because fragment outputs are now allocated independently rather than within the nir_outputs array, the get_frag_output() helper becomes necessary in order to obtain the right temporary register for a given location-index pair. The type_size helper passed to nir_lower_io is now type_size_dvec4 rather than type_size_vec4_times_4 so that output array offsets are provided in terms of whole array elements rather than in terms of scalar components (dvec4 is the largest vector type supported by the GLSL so this will cause all individual fragment outputs to have a size of one regardless of the type). Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2016-07-21 21:26:20 -07:00
else if (l == FRAG_RESULT_SAMPLE_MASK)
return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
i965/fs: Rework representation of fragment output locations in NIR. The problem with the current approach is that driver output locations are represented as a linear offset within the nir_outputs array, which makes it rather difficult for the back-end to figure out what color output and index some nir_intrinsic_load/store_output was meant for, because the offset of a given output within the nir_output array is dependent on the type and size of all previously allocated outputs. Instead this defines the driver location of an output to be the pair formed by its GLSL-assigned location and index (I've borrowed the bitfield macros from brw_defines.h in order to represent the pair of integers as a single scalar value that can be assigned to nir_variable_data::driver_location). nir_assign_var_locations is no longer useful for fragment outputs. Because fragment outputs are now allocated independently rather than within the nir_outputs array, the get_frag_output() helper becomes necessary in order to obtain the right temporary register for a given location-index pair. The type_size helper passed to nir_lower_io is now type_size_dvec4 rather than type_size_vec4_times_4 so that output array offsets are provided in terms of whole array elements rather than in terms of scalar components (dvec4 is the largest vector type supported by the GLSL so this will cause all individual fragment outputs to have a size of one regardless of the type). Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2016-07-21 21:26:20 -07:00
else if (l >= FRAG_RESULT_DATA0 &&
l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
return alloc_temporary(v->bld, 4,
&v->outputs[l - FRAG_RESULT_DATA0], 1);
i965/fs: Rework representation of fragment output locations in NIR. The problem with the current approach is that driver output locations are represented as a linear offset within the nir_outputs array, which makes it rather difficult for the back-end to figure out what color output and index some nir_intrinsic_load/store_output was meant for, because the offset of a given output within the nir_output array is dependent on the type and size of all previously allocated outputs. Instead this defines the driver location of an output to be the pair formed by its GLSL-assigned location and index (I've borrowed the bitfield macros from brw_defines.h in order to represent the pair of integers as a single scalar value that can be assigned to nir_variable_data::driver_location). nir_assign_var_locations is no longer useful for fragment outputs. Because fragment outputs are now allocated independently rather than within the nir_outputs array, the get_frag_output() helper becomes necessary in order to obtain the right temporary register for a given location-index pair. The type_size helper passed to nir_lower_io is now type_size_dvec4 rather than type_size_vec4_times_4 so that output array offsets are provided in terms of whole array elements rather than in terms of scalar components (dvec4 is the largest vector type supported by the GLSL so this will cause all individual fragment outputs to have a size of one regardless of the type). Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2016-07-21 21:26:20 -07:00
else
unreachable("Invalid location");
}
void
fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
nir_intrinsic_instr *instr)
{
assert(stage == MESA_SHADER_FRAGMENT);
fs_reg dest;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
dest = get_nir_def(instr->def);
switch (instr->intrinsic) {
case nir_intrinsic_load_front_face:
bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
emit_frontfacing_interpolation());
break;
case nir_intrinsic_load_sample_pos:
case nir_intrinsic_load_sample_pos_or_center: {
fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
assert(sample_pos.file != BAD_FILE);
dest.type = sample_pos.type;
bld.MOV(dest, sample_pos);
bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
break;
}
case nir_intrinsic_load_layer_id:
dest.type = BRW_REGISTER_TYPE_UD;
bld.MOV(dest, fetch_render_target_array_index(bld));
break;
case nir_intrinsic_is_helper_invocation:
emit_is_helper_invocation(dest);
break;
case nir_intrinsic_load_helper_invocation:
case nir_intrinsic_load_sample_mask_in:
case nir_intrinsic_load_sample_id:
case nir_intrinsic_load_frag_shading_rate: {
gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
fs_reg val = nir_system_values[sv];
assert(val.file != BAD_FILE);
dest.type = val.type;
bld.MOV(dest, val);
break;
}
case nir_intrinsic_store_output: {
const fs_reg src = get_nir_src(instr->src[0]);
const unsigned store_offset = nir_src_as_uint(instr->src[1]);
i965/fs: Rework representation of fragment output locations in NIR. The problem with the current approach is that driver output locations are represented as a linear offset within the nir_outputs array, which makes it rather difficult for the back-end to figure out what color output and index some nir_intrinsic_load/store_output was meant for, because the offset of a given output within the nir_output array is dependent on the type and size of all previously allocated outputs. Instead this defines the driver location of an output to be the pair formed by its GLSL-assigned location and index (I've borrowed the bitfield macros from brw_defines.h in order to represent the pair of integers as a single scalar value that can be assigned to nir_variable_data::driver_location). nir_assign_var_locations is no longer useful for fragment outputs. Because fragment outputs are now allocated independently rather than within the nir_outputs array, the get_frag_output() helper becomes necessary in order to obtain the right temporary register for a given location-index pair. The type_size helper passed to nir_lower_io is now type_size_dvec4 rather than type_size_vec4_times_4 so that output array offsets are provided in terms of whole array elements rather than in terms of scalar components (dvec4 is the largest vector type supported by the GLSL so this will cause all individual fragment outputs to have a size of one regardless of the type). Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2016-07-21 21:26:20 -07:00
const unsigned location = nir_intrinsic_base(instr) +
SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION);
const fs_reg new_dest = retype(alloc_frag_output(this, location),
i965/fs: Rework representation of fragment output locations in NIR. The problem with the current approach is that driver output locations are represented as a linear offset within the nir_outputs array, which makes it rather difficult for the back-end to figure out what color output and index some nir_intrinsic_load/store_output was meant for, because the offset of a given output within the nir_output array is dependent on the type and size of all previously allocated outputs. Instead this defines the driver location of an output to be the pair formed by its GLSL-assigned location and index (I've borrowed the bitfield macros from brw_defines.h in order to represent the pair of integers as a single scalar value that can be assigned to nir_variable_data::driver_location). nir_assign_var_locations is no longer useful for fragment outputs. Because fragment outputs are now allocated independently rather than within the nir_outputs array, the get_frag_output() helper becomes necessary in order to obtain the right temporary register for a given location-index pair. The type_size helper passed to nir_lower_io is now type_size_dvec4 rather than type_size_vec4_times_4 so that output array offsets are provided in terms of whole array elements rather than in terms of scalar components (dvec4 is the largest vector type supported by the GLSL so this will cause all individual fragment outputs to have a size of one regardless of the type). Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2016-07-21 21:26:20 -07:00
src.type);
for (unsigned j = 0; j < instr->num_components; j++)
bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
offset(src, bld, j));
break;
}
case nir_intrinsic_load_output: {
const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
BRW_NIR_FRAG_OUTPUT_LOCATION);
assert(l >= FRAG_RESULT_DATA0);
const unsigned load_offset = nir_src_as_uint(instr->src[0]);
const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
const fs_reg tmp = bld.vgrf(dest.type, 4);
if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
emit_coherent_fb_read(bld, tmp, target);
else
emit_non_coherent_fb_read(bld, tmp, target);
for (unsigned j = 0; j < instr->num_components; j++) {
bld.MOV(offset(dest, bld, j),
offset(tmp, bld, nir_intrinsic_component(instr) + j));
}
break;
}
case nir_intrinsic_demote:
case nir_intrinsic_discard:
case nir_intrinsic_terminate:
case nir_intrinsic_demote_if:
case nir_intrinsic_discard_if:
case nir_intrinsic_terminate_if: {
/* We track our discarded pixels in f0.1/f1.0. By predicating on it, we
* can update just the flag bits that aren't yet discarded. If there's
* no condition, we emit a CMP of g0 != g0, so all currently executing
* channels will get turned off.
*/
intel/fs: Improve discard_if code generation Previously we would blindly emit an sequence like: mov(1) f0.1<1>UW g1.14<0,1,0>UW ... cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */ (+f0.1) cmp.z.f0.1(16) null<1>D g7<8,8,1>D 0D The first move sets the flags based on the initial execution mask. Later discard sequences contain a predicated compare that can only remove more SIMD channels. Often times the only user of the result from the first compare is the second compare. Instead, generate a sequence like mov(1) f0.1<1>UW g1.14<0,1,0>UW ... cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */ (+f0.1) cmp.ge.f0.1(8) null<1>F g5<8,8,1>F 0x41700000F /* 15F */ If the results stored in g7 and f0.0 are not used, the comparison will be eliminated. This removes an instruction and potentially reduces register pressure. v2: Major re-write of the commit message (including fixing the assembly code). Suggested by Matt. All Gen8+ platforms had similar results. (Ice Lake shown) total instructions in shared programs: 17224434 -> 17198659 (-0.15%) instructions in affected programs: 2908125 -> 2882350 (-0.89%) helped: 18891 HURT: 5 helped stats (abs) min: 1 max: 12 x̄: 1.38 x̃: 1 helped stats (rel) min: 0.03% max: 25.00% x̄: 1.76% x̃: 1.02% HURT stats (abs) min: 9 max: 105 x̄: 51.40 x̃: 35 HURT stats (rel) min: 0.43% max: 4.92% x̄: 2.34% x̃: 1.56% 95% mean confidence interval for instructions value: -1.39 -1.34 95% mean confidence interval for instructions %-change: -1.79% -1.73% Instructions are helped. total cycles in shared programs: 361468458 -> 361170679 (-0.08%) cycles in affected programs: 38470116 -> 38172337 (-0.77%) helped: 16202 HURT: 1456 helped stats (abs) min: 1 max: 4473 x̄: 26.24 x̃: 18 helped stats (rel) min: <.01% max: 28.44% x̄: 2.90% x̃: 2.18% HURT stats (abs) min: 1 max: 5982 x̄: 87.51 x̃: 28 HURT stats (rel) min: <.01% max: 51.29% x̄: 5.48% x̃: 1.64% 95% mean confidence interval for cycles value: -18.24 -15.49 95% mean confidence interval for cycles %-change: -2.26% -2.14% Cycles are helped. total spills in shared programs: 12147 -> 12176 (0.24%) spills in affected programs: 175 -> 204 (16.57%) helped: 8 HURT: 5 total fills in shared programs: 25262 -> 25292 (0.12%) fills in affected programs: 269 -> 299 (11.15%) helped: 8 HURT: 5 Haswell total instructions in shared programs: 13530316 -> 13502647 (-0.20%) instructions in affected programs: 2507824 -> 2480155 (-1.10%) helped: 18859 HURT: 10 helped stats (abs) min: 1 max: 12 x̄: 1.48 x̃: 1 helped stats (rel) min: 0.03% max: 27.78% x̄: 2.38% x̃: 1.41% HURT stats (abs) min: 5 max: 39 x̄: 25.70 x̃: 31 HURT stats (rel) min: 0.22% max: 1.66% x̄: 1.09% x̃: 1.31% 95% mean confidence interval for instructions value: -1.49 -1.44 95% mean confidence interval for instructions %-change: -2.42% -2.34% Instructions are helped. total cycles in shared programs: 377865412 -> 377639034 (-0.06%) cycles in affected programs: 40169572 -> 39943194 (-0.56%) helped: 15550 HURT: 1938 helped stats (abs) min: 1 max: 2482 x̄: 25.67 x̃: 18 helped stats (rel) min: <.01% max: 37.77% x̄: 3.00% x̃: 2.25% HURT stats (abs) min: 1 max: 4862 x̄: 89.17 x̃: 35 HURT stats (rel) min: <.01% max: 67.67% x̄: 6.16% x̃: 2.75% 95% mean confidence interval for cycles value: -14.42 -11.47 95% mean confidence interval for cycles %-change: -2.05% -1.91% Cycles are helped. total spills in shared programs: 26769 -> 26814 (0.17%) spills in affected programs: 826 -> 871 (5.45%) helped: 9 HURT: 10 total fills in shared programs: 38383 -> 38425 (0.11%) fills in affected programs: 834 -> 876 (5.04%) helped: 9 HURT: 10 LOST: 5 GAINED: 10 Ivy Bridge total instructions in shared programs: 12079250 -> 12044139 (-0.29%) instructions in affected programs: 2409680 -> 2374569 (-1.46%) helped: 16135 HURT: 0 helped stats (abs) min: 1 max: 23 x̄: 2.18 x̃: 2 helped stats (rel) min: 0.07% max: 37.50% x̄: 2.72% x̃: 1.68% 95% mean confidence interval for instructions value: -2.21 -2.14 95% mean confidence interval for instructions %-change: -2.76% -2.67% Instructions are helped. total cycles in shared programs: 180116747 -> 179900405 (-0.12%) cycles in affected programs: 25439823 -> 25223481 (-0.85%) helped: 13817 HURT: 1499 helped stats (abs) min: 1 max: 1886 x̄: 26.40 x̃: 18 helped stats (rel) min: <.01% max: 38.84% x̄: 2.57% x̃: 1.97% HURT stats (abs) min: 1 max: 3684 x̄: 98.99 x̃: 52 HURT stats (rel) min: <.01% max: 97.01% x̄: 6.37% x̃: 3.42% 95% mean confidence interval for cycles value: -15.68 -12.57 95% mean confidence interval for cycles %-change: -1.77% -1.63% Cycles are helped. LOST: 8 GAINED: 10 Sandy Bridge total instructions in shared programs: 10878990 -> 10863659 (-0.14%) instructions in affected programs: 1806702 -> 1791371 (-0.85%) helped: 13023 HURT: 0 helped stats (abs) min: 1 max: 5 x̄: 1.18 x̃: 1 helped stats (rel) min: 0.07% max: 13.79% x̄: 1.65% x̃: 1.10% 95% mean confidence interval for instructions value: -1.18 -1.17 95% mean confidence interval for instructions %-change: -1.68% -1.62% Instructions are helped. total cycles in shared programs: 154082878 -> 153862810 (-0.14%) cycles in affected programs: 20199374 -> 19979306 (-1.09%) helped: 12048 HURT: 510 helped stats (abs) min: 1 max: 323 x̄: 20.57 x̃: 18 helped stats (rel) min: 0.03% max: 17.78% x̄: 2.05% x̃: 1.52% HURT stats (abs) min: 1 max: 448 x̄: 54.39 x̃: 16 HURT stats (rel) min: 0.02% max: 37.98% x̄: 4.13% x̃: 1.17% 95% mean confidence interval for cycles value: -17.97 -17.08 95% mean confidence interval for cycles %-change: -1.84% -1.75% Cycles are helped. LOST: 1 GAINED: 0 Iron Lake total instructions in shared programs: 8155075 -> 8142729 (-0.15%) instructions in affected programs: 949495 -> 937149 (-1.30%) helped: 5810 HURT: 0 helped stats (abs) min: 1 max: 8 x̄: 2.12 x̃: 2 helped stats (rel) min: 0.10% max: 16.67% x̄: 2.53% x̃: 1.85% 95% mean confidence interval for instructions value: -2.14 -2.11 95% mean confidence interval for instructions %-change: -2.59% -2.48% Instructions are helped. total cycles in shared programs: 188584610 -> 188549632 (-0.02%) cycles in affected programs: 17274446 -> 17239468 (-0.20%) helped: 3881 HURT: 90 helped stats (abs) min: 2 max: 168 x̄: 9.08 x̃: 6 helped stats (rel) min: <.01% max: 23.53% x̄: 0.83% x̃: 0.30% HURT stats (abs) min: 2 max: 10 x̄: 2.80 x̃: 2 HURT stats (rel) min: <.01% max: 0.60% x̄: 0.10% x̃: 0.07% 95% mean confidence interval for cycles value: -9.35 -8.27 95% mean confidence interval for cycles %-change: -0.85% -0.77% Cycles are helped. GM45 total instructions in shared programs: 5019308 -> 5013119 (-0.12%) instructions in affected programs: 489028 -> 482839 (-1.27%) helped: 2912 HURT: 0 helped stats (abs) min: 1 max: 8 x̄: 2.13 x̃: 2 helped stats (rel) min: 0.10% max: 16.67% x̄: 2.46% x̃: 1.81% 95% mean confidence interval for instructions value: -2.14 -2.11 95% mean confidence interval for instructions %-change: -2.54% -2.39% Instructions are helped. total cycles in shared programs: 129002592 -> 128977804 (-0.02%) cycles in affected programs: 12669152 -> 12644364 (-0.20%) helped: 2759 HURT: 37 helped stats (abs) min: 2 max: 168 x̄: 9.03 x̃: 4 helped stats (rel) min: <.01% max: 21.43% x̄: 0.75% x̃: 0.31% HURT stats (abs) min: 2 max: 10 x̄: 3.62 x̃: 4 HURT stats (rel) min: <.01% max: 0.41% x̄: 0.10% x̃: 0.04% 95% mean confidence interval for cycles value: -9.53 -8.20 95% mean confidence interval for cycles %-change: -0.79% -0.70% Cycles are helped. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Matt Turner <mattst88@gmail.com>
2019-05-20 17:25:01 -07:00
fs_inst *cmp = NULL;
if (instr->intrinsic == nir_intrinsic_demote_if ||
instr->intrinsic == nir_intrinsic_discard_if ||
instr->intrinsic == nir_intrinsic_terminate_if) {
intel/fs: Improve discard_if code generation Previously we would blindly emit an sequence like: mov(1) f0.1<1>UW g1.14<0,1,0>UW ... cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */ (+f0.1) cmp.z.f0.1(16) null<1>D g7<8,8,1>D 0D The first move sets the flags based on the initial execution mask. Later discard sequences contain a predicated compare that can only remove more SIMD channels. Often times the only user of the result from the first compare is the second compare. Instead, generate a sequence like mov(1) f0.1<1>UW g1.14<0,1,0>UW ... cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */ (+f0.1) cmp.ge.f0.1(8) null<1>F g5<8,8,1>F 0x41700000F /* 15F */ If the results stored in g7 and f0.0 are not used, the comparison will be eliminated. This removes an instruction and potentially reduces register pressure. v2: Major re-write of the commit message (including fixing the assembly code). Suggested by Matt. All Gen8+ platforms had similar results. (Ice Lake shown) total instructions in shared programs: 17224434 -> 17198659 (-0.15%) instructions in affected programs: 2908125 -> 2882350 (-0.89%) helped: 18891 HURT: 5 helped stats (abs) min: 1 max: 12 x̄: 1.38 x̃: 1 helped stats (rel) min: 0.03% max: 25.00% x̄: 1.76% x̃: 1.02% HURT stats (abs) min: 9 max: 105 x̄: 51.40 x̃: 35 HURT stats (rel) min: 0.43% max: 4.92% x̄: 2.34% x̃: 1.56% 95% mean confidence interval for instructions value: -1.39 -1.34 95% mean confidence interval for instructions %-change: -1.79% -1.73% Instructions are helped. total cycles in shared programs: 361468458 -> 361170679 (-0.08%) cycles in affected programs: 38470116 -> 38172337 (-0.77%) helped: 16202 HURT: 1456 helped stats (abs) min: 1 max: 4473 x̄: 26.24 x̃: 18 helped stats (rel) min: <.01% max: 28.44% x̄: 2.90% x̃: 2.18% HURT stats (abs) min: 1 max: 5982 x̄: 87.51 x̃: 28 HURT stats (rel) min: <.01% max: 51.29% x̄: 5.48% x̃: 1.64% 95% mean confidence interval for cycles value: -18.24 -15.49 95% mean confidence interval for cycles %-change: -2.26% -2.14% Cycles are helped. total spills in shared programs: 12147 -> 12176 (0.24%) spills in affected programs: 175 -> 204 (16.57%) helped: 8 HURT: 5 total fills in shared programs: 25262 -> 25292 (0.12%) fills in affected programs: 269 -> 299 (11.15%) helped: 8 HURT: 5 Haswell total instructions in shared programs: 13530316 -> 13502647 (-0.20%) instructions in affected programs: 2507824 -> 2480155 (-1.10%) helped: 18859 HURT: 10 helped stats (abs) min: 1 max: 12 x̄: 1.48 x̃: 1 helped stats (rel) min: 0.03% max: 27.78% x̄: 2.38% x̃: 1.41% HURT stats (abs) min: 5 max: 39 x̄: 25.70 x̃: 31 HURT stats (rel) min: 0.22% max: 1.66% x̄: 1.09% x̃: 1.31% 95% mean confidence interval for instructions value: -1.49 -1.44 95% mean confidence interval for instructions %-change: -2.42% -2.34% Instructions are helped. total cycles in shared programs: 377865412 -> 377639034 (-0.06%) cycles in affected programs: 40169572 -> 39943194 (-0.56%) helped: 15550 HURT: 1938 helped stats (abs) min: 1 max: 2482 x̄: 25.67 x̃: 18 helped stats (rel) min: <.01% max: 37.77% x̄: 3.00% x̃: 2.25% HURT stats (abs) min: 1 max: 4862 x̄: 89.17 x̃: 35 HURT stats (rel) min: <.01% max: 67.67% x̄: 6.16% x̃: 2.75% 95% mean confidence interval for cycles value: -14.42 -11.47 95% mean confidence interval for cycles %-change: -2.05% -1.91% Cycles are helped. total spills in shared programs: 26769 -> 26814 (0.17%) spills in affected programs: 826 -> 871 (5.45%) helped: 9 HURT: 10 total fills in shared programs: 38383 -> 38425 (0.11%) fills in affected programs: 834 -> 876 (5.04%) helped: 9 HURT: 10 LOST: 5 GAINED: 10 Ivy Bridge total instructions in shared programs: 12079250 -> 12044139 (-0.29%) instructions in affected programs: 2409680 -> 2374569 (-1.46%) helped: 16135 HURT: 0 helped stats (abs) min: 1 max: 23 x̄: 2.18 x̃: 2 helped stats (rel) min: 0.07% max: 37.50% x̄: 2.72% x̃: 1.68% 95% mean confidence interval for instructions value: -2.21 -2.14 95% mean confidence interval for instructions %-change: -2.76% -2.67% Instructions are helped. total cycles in shared programs: 180116747 -> 179900405 (-0.12%) cycles in affected programs: 25439823 -> 25223481 (-0.85%) helped: 13817 HURT: 1499 helped stats (abs) min: 1 max: 1886 x̄: 26.40 x̃: 18 helped stats (rel) min: <.01% max: 38.84% x̄: 2.57% x̃: 1.97% HURT stats (abs) min: 1 max: 3684 x̄: 98.99 x̃: 52 HURT stats (rel) min: <.01% max: 97.01% x̄: 6.37% x̃: 3.42% 95% mean confidence interval for cycles value: -15.68 -12.57 95% mean confidence interval for cycles %-change: -1.77% -1.63% Cycles are helped. LOST: 8 GAINED: 10 Sandy Bridge total instructions in shared programs: 10878990 -> 10863659 (-0.14%) instructions in affected programs: 1806702 -> 1791371 (-0.85%) helped: 13023 HURT: 0 helped stats (abs) min: 1 max: 5 x̄: 1.18 x̃: 1 helped stats (rel) min: 0.07% max: 13.79% x̄: 1.65% x̃: 1.10% 95% mean confidence interval for instructions value: -1.18 -1.17 95% mean confidence interval for instructions %-change: -1.68% -1.62% Instructions are helped. total cycles in shared programs: 154082878 -> 153862810 (-0.14%) cycles in affected programs: 20199374 -> 19979306 (-1.09%) helped: 12048 HURT: 510 helped stats (abs) min: 1 max: 323 x̄: 20.57 x̃: 18 helped stats (rel) min: 0.03% max: 17.78% x̄: 2.05% x̃: 1.52% HURT stats (abs) min: 1 max: 448 x̄: 54.39 x̃: 16 HURT stats (rel) min: 0.02% max: 37.98% x̄: 4.13% x̃: 1.17% 95% mean confidence interval for cycles value: -17.97 -17.08 95% mean confidence interval for cycles %-change: -1.84% -1.75% Cycles are helped. LOST: 1 GAINED: 0 Iron Lake total instructions in shared programs: 8155075 -> 8142729 (-0.15%) instructions in affected programs: 949495 -> 937149 (-1.30%) helped: 5810 HURT: 0 helped stats (abs) min: 1 max: 8 x̄: 2.12 x̃: 2 helped stats (rel) min: 0.10% max: 16.67% x̄: 2.53% x̃: 1.85% 95% mean confidence interval for instructions value: -2.14 -2.11 95% mean confidence interval for instructions %-change: -2.59% -2.48% Instructions are helped. total cycles in shared programs: 188584610 -> 188549632 (-0.02%) cycles in affected programs: 17274446 -> 17239468 (-0.20%) helped: 3881 HURT: 90 helped stats (abs) min: 2 max: 168 x̄: 9.08 x̃: 6 helped stats (rel) min: <.01% max: 23.53% x̄: 0.83% x̃: 0.30% HURT stats (abs) min: 2 max: 10 x̄: 2.80 x̃: 2 HURT stats (rel) min: <.01% max: 0.60% x̄: 0.10% x̃: 0.07% 95% mean confidence interval for cycles value: -9.35 -8.27 95% mean confidence interval for cycles %-change: -0.85% -0.77% Cycles are helped. GM45 total instructions in shared programs: 5019308 -> 5013119 (-0.12%) instructions in affected programs: 489028 -> 482839 (-1.27%) helped: 2912 HURT: 0 helped stats (abs) min: 1 max: 8 x̄: 2.13 x̃: 2 helped stats (rel) min: 0.10% max: 16.67% x̄: 2.46% x̃: 1.81% 95% mean confidence interval for instructions value: -2.14 -2.11 95% mean confidence interval for instructions %-change: -2.54% -2.39% Instructions are helped. total cycles in shared programs: 129002592 -> 128977804 (-0.02%) cycles in affected programs: 12669152 -> 12644364 (-0.20%) helped: 2759 HURT: 37 helped stats (abs) min: 2 max: 168 x̄: 9.03 x̃: 4 helped stats (rel) min: <.01% max: 21.43% x̄: 0.75% x̃: 0.31% HURT stats (abs) min: 2 max: 10 x̄: 3.62 x̃: 4 HURT stats (rel) min: <.01% max: 0.41% x̄: 0.10% x̃: 0.04% 95% mean confidence interval for cycles value: -9.53 -8.20 95% mean confidence interval for cycles %-change: -0.79% -0.70% Cycles are helped. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Matt Turner <mattst88@gmail.com>
2019-05-20 17:25:01 -07:00
nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]);
if (alu != NULL &&
alu->op != nir_op_bcsel &&
(devinfo->ver > 5 ||
intel/fs: Disable conditional discard optimization on Gen4 and Gen5 The CMP instruction on Gen4 and Gen5 generates one bit (the LSB) of valid data and 31 bits of junk. Results of comparisons that are used as Boolean values need to have a fixup applied to generate the proper 0/~0 values. Calling fs_visitor::nir_emit_alu with need_dest=false prevents the fixup code from being generated. This results in a sequence like: cmp.l.f0.0(16) g8<1>F g14<8,8,1>F 0x0F /* 0F */ ... cmp.l.f0.0(16) g4<1>F g6<8,8,1>F 0x0F /* 0F */ (+f0.1) or.z.f0.1(16) null<1>UD g4<8,8,1>UD g8<8,8,1>UD instead of cmp.l.f0.0(16) g8<1>F g14<8,8,1>F 0x0F /* 0F */ ... cmp.l.f0.0(16) g4<1>F g6<8,8,1>F 0x0F /* 0F */ or(16) g4<1>UD g4<8,8,1>UD g8<8,8,1>UD (+f0.1) and.z.f0.1(16) null<1>UD g4<8,8,1>UD 1UD I examined a couple of the shaders hurt by this change, and ALL of them would have been affected by this bug. :( Reviewed-by: Tapani Pälli <tapani.palli@intel.com> Closes: https://gitlab.freedesktop.org/mesa/mesa/issues/1836 Fixes: 0ba9497e66a ("intel/fs: Improve discard_if code generation") Iron Lake total instructions in shared programs: 8122757 -> 8122957 (<.01%) instructions in affected programs: 8307 -> 8507 (2.41%) helped: 0 HURT: 100 HURT stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 HURT stats (rel) min: 0.84% max: 6.67% x̄: 2.81% x̃: 2.76% 95% mean confidence interval for instructions value: 2.00 2.00 95% mean confidence interval for instructions %-change: 2.58% 3.03% Instructions are HURT. total cycles in shared programs: 188510100 -> 188510376 (<.01%) cycles in affected programs: 76018 -> 76294 (0.36%) helped: 0 HURT: 55 HURT stats (abs) min: 2 max: 12 x̄: 5.02 x̃: 4 HURT stats (rel) min: 0.07% max: 3.75% x̄: 0.86% x̃: 0.56% 95% mean confidence interval for cycles value: 4.33 5.71 95% mean confidence interval for cycles %-change: 0.60% 1.12% Cycles are HURT. GM45 total instructions in shared programs: 4994403 -> 4994503 (<.01%) instructions in affected programs: 4212 -> 4312 (2.37%) helped: 0 HURT: 50 HURT stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 HURT stats (rel) min: 0.84% max: 6.25% x̄: 2.76% x̃: 2.72% 95% mean confidence interval for instructions value: 2.00 2.00 95% mean confidence interval for instructions %-change: 2.45% 3.07% Instructions are HURT. total cycles in shared programs: 128928750 -> 128928982 (<.01%) cycles in affected programs: 67442 -> 67674 (0.34%) helped: 0 HURT: 47 HURT stats (abs) min: 2 max: 12 x̄: 4.94 x̃: 4 HURT stats (rel) min: 0.09% max: 3.75% x̄: 0.75% x̃: 0.53% 95% mean confidence interval for cycles value: 4.19 5.68 95% mean confidence interval for cycles %-change: 0.50% 1.00% Cycles are HURT.
2019-11-18 11:52:47 -08:00
(alu->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) != BRW_NIR_BOOLEAN_NEEDS_RESOLVE ||
alu->op == nir_op_fneu32 || alu->op == nir_op_feq32 ||
intel/fs: Disable conditional discard optimization on Gen4 and Gen5 The CMP instruction on Gen4 and Gen5 generates one bit (the LSB) of valid data and 31 bits of junk. Results of comparisons that are used as Boolean values need to have a fixup applied to generate the proper 0/~0 values. Calling fs_visitor::nir_emit_alu with need_dest=false prevents the fixup code from being generated. This results in a sequence like: cmp.l.f0.0(16) g8<1>F g14<8,8,1>F 0x0F /* 0F */ ... cmp.l.f0.0(16) g4<1>F g6<8,8,1>F 0x0F /* 0F */ (+f0.1) or.z.f0.1(16) null<1>UD g4<8,8,1>UD g8<8,8,1>UD instead of cmp.l.f0.0(16) g8<1>F g14<8,8,1>F 0x0F /* 0F */ ... cmp.l.f0.0(16) g4<1>F g6<8,8,1>F 0x0F /* 0F */ or(16) g4<1>UD g4<8,8,1>UD g8<8,8,1>UD (+f0.1) and.z.f0.1(16) null<1>UD g4<8,8,1>UD 1UD I examined a couple of the shaders hurt by this change, and ALL of them would have been affected by this bug. :( Reviewed-by: Tapani Pälli <tapani.palli@intel.com> Closes: https://gitlab.freedesktop.org/mesa/mesa/issues/1836 Fixes: 0ba9497e66a ("intel/fs: Improve discard_if code generation") Iron Lake total instructions in shared programs: 8122757 -> 8122957 (<.01%) instructions in affected programs: 8307 -> 8507 (2.41%) helped: 0 HURT: 100 HURT stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 HURT stats (rel) min: 0.84% max: 6.67% x̄: 2.81% x̃: 2.76% 95% mean confidence interval for instructions value: 2.00 2.00 95% mean confidence interval for instructions %-change: 2.58% 3.03% Instructions are HURT. total cycles in shared programs: 188510100 -> 188510376 (<.01%) cycles in affected programs: 76018 -> 76294 (0.36%) helped: 0 HURT: 55 HURT stats (abs) min: 2 max: 12 x̄: 5.02 x̃: 4 HURT stats (rel) min: 0.07% max: 3.75% x̄: 0.86% x̃: 0.56% 95% mean confidence interval for cycles value: 4.33 5.71 95% mean confidence interval for cycles %-change: 0.60% 1.12% Cycles are HURT. GM45 total instructions in shared programs: 4994403 -> 4994503 (<.01%) instructions in affected programs: 4212 -> 4312 (2.37%) helped: 0 HURT: 50 HURT stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2 HURT stats (rel) min: 0.84% max: 6.25% x̄: 2.76% x̃: 2.72% 95% mean confidence interval for instructions value: 2.00 2.00 95% mean confidence interval for instructions %-change: 2.45% 3.07% Instructions are HURT. total cycles in shared programs: 128928750 -> 128928982 (<.01%) cycles in affected programs: 67442 -> 67674 (0.34%) helped: 0 HURT: 47 HURT stats (abs) min: 2 max: 12 x̄: 4.94 x̃: 4 HURT stats (rel) min: 0.09% max: 3.75% x̄: 0.75% x̃: 0.53% 95% mean confidence interval for cycles value: 4.19 5.68 95% mean confidence interval for cycles %-change: 0.50% 1.00% Cycles are HURT.
2019-11-18 11:52:47 -08:00
alu->op == nir_op_flt32 || alu->op == nir_op_fge32 ||
alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 ||
alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 ||
alu->op == nir_op_ult32 || alu->op == nir_op_uge32)) {
intel/fs: Improve discard_if code generation Previously we would blindly emit an sequence like: mov(1) f0.1<1>UW g1.14<0,1,0>UW ... cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */ (+f0.1) cmp.z.f0.1(16) null<1>D g7<8,8,1>D 0D The first move sets the flags based on the initial execution mask. Later discard sequences contain a predicated compare that can only remove more SIMD channels. Often times the only user of the result from the first compare is the second compare. Instead, generate a sequence like mov(1) f0.1<1>UW g1.14<0,1,0>UW ... cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */ (+f0.1) cmp.ge.f0.1(8) null<1>F g5<8,8,1>F 0x41700000F /* 15F */ If the results stored in g7 and f0.0 are not used, the comparison will be eliminated. This removes an instruction and potentially reduces register pressure. v2: Major re-write of the commit message (including fixing the assembly code). Suggested by Matt. All Gen8+ platforms had similar results. (Ice Lake shown) total instructions in shared programs: 17224434 -> 17198659 (-0.15%) instructions in affected programs: 2908125 -> 2882350 (-0.89%) helped: 18891 HURT: 5 helped stats (abs) min: 1 max: 12 x̄: 1.38 x̃: 1 helped stats (rel) min: 0.03% max: 25.00% x̄: 1.76% x̃: 1.02% HURT stats (abs) min: 9 max: 105 x̄: 51.40 x̃: 35 HURT stats (rel) min: 0.43% max: 4.92% x̄: 2.34% x̃: 1.56% 95% mean confidence interval for instructions value: -1.39 -1.34 95% mean confidence interval for instructions %-change: -1.79% -1.73% Instructions are helped. total cycles in shared programs: 361468458 -> 361170679 (-0.08%) cycles in affected programs: 38470116 -> 38172337 (-0.77%) helped: 16202 HURT: 1456 helped stats (abs) min: 1 max: 4473 x̄: 26.24 x̃: 18 helped stats (rel) min: <.01% max: 28.44% x̄: 2.90% x̃: 2.18% HURT stats (abs) min: 1 max: 5982 x̄: 87.51 x̃: 28 HURT stats (rel) min: <.01% max: 51.29% x̄: 5.48% x̃: 1.64% 95% mean confidence interval for cycles value: -18.24 -15.49 95% mean confidence interval for cycles %-change: -2.26% -2.14% Cycles are helped. total spills in shared programs: 12147 -> 12176 (0.24%) spills in affected programs: 175 -> 204 (16.57%) helped: 8 HURT: 5 total fills in shared programs: 25262 -> 25292 (0.12%) fills in affected programs: 269 -> 299 (11.15%) helped: 8 HURT: 5 Haswell total instructions in shared programs: 13530316 -> 13502647 (-0.20%) instructions in affected programs: 2507824 -> 2480155 (-1.10%) helped: 18859 HURT: 10 helped stats (abs) min: 1 max: 12 x̄: 1.48 x̃: 1 helped stats (rel) min: 0.03% max: 27.78% x̄: 2.38% x̃: 1.41% HURT stats (abs) min: 5 max: 39 x̄: 25.70 x̃: 31 HURT stats (rel) min: 0.22% max: 1.66% x̄: 1.09% x̃: 1.31% 95% mean confidence interval for instructions value: -1.49 -1.44 95% mean confidence interval for instructions %-change: -2.42% -2.34% Instructions are helped. total cycles in shared programs: 377865412 -> 377639034 (-0.06%) cycles in affected programs: 40169572 -> 39943194 (-0.56%) helped: 15550 HURT: 1938 helped stats (abs) min: 1 max: 2482 x̄: 25.67 x̃: 18 helped stats (rel) min: <.01% max: 37.77% x̄: 3.00% x̃: 2.25% HURT stats (abs) min: 1 max: 4862 x̄: 89.17 x̃: 35 HURT stats (rel) min: <.01% max: 67.67% x̄: 6.16% x̃: 2.75% 95% mean confidence interval for cycles value: -14.42 -11.47 95% mean confidence interval for cycles %-change: -2.05% -1.91% Cycles are helped. total spills in shared programs: 26769 -> 26814 (0.17%) spills in affected programs: 826 -> 871 (5.45%) helped: 9 HURT: 10 total fills in shared programs: 38383 -> 38425 (0.11%) fills in affected programs: 834 -> 876 (5.04%) helped: 9 HURT: 10 LOST: 5 GAINED: 10 Ivy Bridge total instructions in shared programs: 12079250 -> 12044139 (-0.29%) instructions in affected programs: 2409680 -> 2374569 (-1.46%) helped: 16135 HURT: 0 helped stats (abs) min: 1 max: 23 x̄: 2.18 x̃: 2 helped stats (rel) min: 0.07% max: 37.50% x̄: 2.72% x̃: 1.68% 95% mean confidence interval for instructions value: -2.21 -2.14 95% mean confidence interval for instructions %-change: -2.76% -2.67% Instructions are helped. total cycles in shared programs: 180116747 -> 179900405 (-0.12%) cycles in affected programs: 25439823 -> 25223481 (-0.85%) helped: 13817 HURT: 1499 helped stats (abs) min: 1 max: 1886 x̄: 26.40 x̃: 18 helped stats (rel) min: <.01% max: 38.84% x̄: 2.57% x̃: 1.97% HURT stats (abs) min: 1 max: 3684 x̄: 98.99 x̃: 52 HURT stats (rel) min: <.01% max: 97.01% x̄: 6.37% x̃: 3.42% 95% mean confidence interval for cycles value: -15.68 -12.57 95% mean confidence interval for cycles %-change: -1.77% -1.63% Cycles are helped. LOST: 8 GAINED: 10 Sandy Bridge total instructions in shared programs: 10878990 -> 10863659 (-0.14%) instructions in affected programs: 1806702 -> 1791371 (-0.85%) helped: 13023 HURT: 0 helped stats (abs) min: 1 max: 5 x̄: 1.18 x̃: 1 helped stats (rel) min: 0.07% max: 13.79% x̄: 1.65% x̃: 1.10% 95% mean confidence interval for instructions value: -1.18 -1.17 95% mean confidence interval for instructions %-change: -1.68% -1.62% Instructions are helped. total cycles in shared programs: 154082878 -> 153862810 (-0.14%) cycles in affected programs: 20199374 -> 19979306 (-1.09%) helped: 12048 HURT: 510 helped stats (abs) min: 1 max: 323 x̄: 20.57 x̃: 18 helped stats (rel) min: 0.03% max: 17.78% x̄: 2.05% x̃: 1.52% HURT stats (abs) min: 1 max: 448 x̄: 54.39 x̃: 16 HURT stats (rel) min: 0.02% max: 37.98% x̄: 4.13% x̃: 1.17% 95% mean confidence interval for cycles value: -17.97 -17.08 95% mean confidence interval for cycles %-change: -1.84% -1.75% Cycles are helped. LOST: 1 GAINED: 0 Iron Lake total instructions in shared programs: 8155075 -> 8142729 (-0.15%) instructions in affected programs: 949495 -> 937149 (-1.30%) helped: 5810 HURT: 0 helped stats (abs) min: 1 max: 8 x̄: 2.12 x̃: 2 helped stats (rel) min: 0.10% max: 16.67% x̄: 2.53% x̃: 1.85% 95% mean confidence interval for instructions value: -2.14 -2.11 95% mean confidence interval for instructions %-change: -2.59% -2.48% Instructions are helped. total cycles in shared programs: 188584610 -> 188549632 (-0.02%) cycles in affected programs: 17274446 -> 17239468 (-0.20%) helped: 3881 HURT: 90 helped stats (abs) min: 2 max: 168 x̄: 9.08 x̃: 6 helped stats (rel) min: <.01% max: 23.53% x̄: 0.83% x̃: 0.30% HURT stats (abs) min: 2 max: 10 x̄: 2.80 x̃: 2 HURT stats (rel) min: <.01% max: 0.60% x̄: 0.10% x̃: 0.07% 95% mean confidence interval for cycles value: -9.35 -8.27 95% mean confidence interval for cycles %-change: -0.85% -0.77% Cycles are helped. GM45 total instructions in shared programs: 5019308 -> 5013119 (-0.12%) instructions in affected programs: 489028 -> 482839 (-1.27%) helped: 2912 HURT: 0 helped stats (abs) min: 1 max: 8 x̄: 2.13 x̃: 2 helped stats (rel) min: 0.10% max: 16.67% x̄: 2.46% x̃: 1.81% 95% mean confidence interval for instructions value: -2.14 -2.11 95% mean confidence interval for instructions %-change: -2.54% -2.39% Instructions are helped. total cycles in shared programs: 129002592 -> 128977804 (-0.02%) cycles in affected programs: 12669152 -> 12644364 (-0.20%) helped: 2759 HURT: 37 helped stats (abs) min: 2 max: 168 x̄: 9.03 x̃: 4 helped stats (rel) min: <.01% max: 21.43% x̄: 0.75% x̃: 0.31% HURT stats (abs) min: 2 max: 10 x̄: 3.62 x̃: 4 HURT stats (rel) min: <.01% max: 0.41% x̄: 0.10% x̃: 0.04% 95% mean confidence interval for cycles value: -9.53 -8.20 95% mean confidence interval for cycles %-change: -0.79% -0.70% Cycles are helped. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Matt Turner <mattst88@gmail.com>
2019-05-20 17:25:01 -07:00
/* Re-emit the instruction that generated the Boolean value, but
* do not store it. Since this instruction will be conditional,
* other instructions that want to use the real Boolean value may
* get garbage. This was a problem for piglit's fs-discard-exit-2
* test.
*
* Ideally we'd detect that the instruction cannot have a
* conditional modifier before emitting the instructions. Alas,
* that is nigh impossible. Instead, we're going to assume the
* instruction (or last instruction) generated can have a
* conditional modifier. If it cannot, fallback to the old-style
* compare, and hope dead code elimination will clean up the
* extra instructions generated.
*/
nir_emit_alu(bld, alu, false);
cmp = (fs_inst *) instructions.get_tail();
if (cmp->conditional_mod == BRW_CONDITIONAL_NONE) {
if (cmp->can_do_cmod())
cmp->conditional_mod = BRW_CONDITIONAL_Z;
else
cmp = NULL;
} else {
/* The old sequence that would have been generated is,
* basically, bool_result == false. This is equivalent to
* !bool_result, so negate the old modifier.
*/
cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);
}
}
if (cmp == NULL) {
cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
brw_imm_d(0), BRW_CONDITIONAL_Z);
}
} else {
fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
BRW_REGISTER_TYPE_UW));
cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
}
intel/fs: Improve discard_if code generation Previously we would blindly emit an sequence like: mov(1) f0.1<1>UW g1.14<0,1,0>UW ... cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */ (+f0.1) cmp.z.f0.1(16) null<1>D g7<8,8,1>D 0D The first move sets the flags based on the initial execution mask. Later discard sequences contain a predicated compare that can only remove more SIMD channels. Often times the only user of the result from the first compare is the second compare. Instead, generate a sequence like mov(1) f0.1<1>UW g1.14<0,1,0>UW ... cmp.l.f0(16) g7<1>F g5<8,8,1>F 0x41700000F /* 15F */ (+f0.1) cmp.ge.f0.1(8) null<1>F g5<8,8,1>F 0x41700000F /* 15F */ If the results stored in g7 and f0.0 are not used, the comparison will be eliminated. This removes an instruction and potentially reduces register pressure. v2: Major re-write of the commit message (including fixing the assembly code). Suggested by Matt. All Gen8+ platforms had similar results. (Ice Lake shown) total instructions in shared programs: 17224434 -> 17198659 (-0.15%) instructions in affected programs: 2908125 -> 2882350 (-0.89%) helped: 18891 HURT: 5 helped stats (abs) min: 1 max: 12 x̄: 1.38 x̃: 1 helped stats (rel) min: 0.03% max: 25.00% x̄: 1.76% x̃: 1.02% HURT stats (abs) min: 9 max: 105 x̄: 51.40 x̃: 35 HURT stats (rel) min: 0.43% max: 4.92% x̄: 2.34% x̃: 1.56% 95% mean confidence interval for instructions value: -1.39 -1.34 95% mean confidence interval for instructions %-change: -1.79% -1.73% Instructions are helped. total cycles in shared programs: 361468458 -> 361170679 (-0.08%) cycles in affected programs: 38470116 -> 38172337 (-0.77%) helped: 16202 HURT: 1456 helped stats (abs) min: 1 max: 4473 x̄: 26.24 x̃: 18 helped stats (rel) min: <.01% max: 28.44% x̄: 2.90% x̃: 2.18% HURT stats (abs) min: 1 max: 5982 x̄: 87.51 x̃: 28 HURT stats (rel) min: <.01% max: 51.29% x̄: 5.48% x̃: 1.64% 95% mean confidence interval for cycles value: -18.24 -15.49 95% mean confidence interval for cycles %-change: -2.26% -2.14% Cycles are helped. total spills in shared programs: 12147 -> 12176 (0.24%) spills in affected programs: 175 -> 204 (16.57%) helped: 8 HURT: 5 total fills in shared programs: 25262 -> 25292 (0.12%) fills in affected programs: 269 -> 299 (11.15%) helped: 8 HURT: 5 Haswell total instructions in shared programs: 13530316 -> 13502647 (-0.20%) instructions in affected programs: 2507824 -> 2480155 (-1.10%) helped: 18859 HURT: 10 helped stats (abs) min: 1 max: 12 x̄: 1.48 x̃: 1 helped stats (rel) min: 0.03% max: 27.78% x̄: 2.38% x̃: 1.41% HURT stats (abs) min: 5 max: 39 x̄: 25.70 x̃: 31 HURT stats (rel) min: 0.22% max: 1.66% x̄: 1.09% x̃: 1.31% 95% mean confidence interval for instructions value: -1.49 -1.44 95% mean confidence interval for instructions %-change: -2.42% -2.34% Instructions are helped. total cycles in shared programs: 377865412 -> 377639034 (-0.06%) cycles in affected programs: 40169572 -> 39943194 (-0.56%) helped: 15550 HURT: 1938 helped stats (abs) min: 1 max: 2482 x̄: 25.67 x̃: 18 helped stats (rel) min: <.01% max: 37.77% x̄: 3.00% x̃: 2.25% HURT stats (abs) min: 1 max: 4862 x̄: 89.17 x̃: 35 HURT stats (rel) min: <.01% max: 67.67% x̄: 6.16% x̃: 2.75% 95% mean confidence interval for cycles value: -14.42 -11.47 95% mean confidence interval for cycles %-change: -2.05% -1.91% Cycles are helped. total spills in shared programs: 26769 -> 26814 (0.17%) spills in affected programs: 826 -> 871 (5.45%) helped: 9 HURT: 10 total fills in shared programs: 38383 -> 38425 (0.11%) fills in affected programs: 834 -> 876 (5.04%) helped: 9 HURT: 10 LOST: 5 GAINED: 10 Ivy Bridge total instructions in shared programs: 12079250 -> 12044139 (-0.29%) instructions in affected programs: 2409680 -> 2374569 (-1.46%) helped: 16135 HURT: 0 helped stats (abs) min: 1 max: 23 x̄: 2.18 x̃: 2 helped stats (rel) min: 0.07% max: 37.50% x̄: 2.72% x̃: 1.68% 95% mean confidence interval for instructions value: -2.21 -2.14 95% mean confidence interval for instructions %-change: -2.76% -2.67% Instructions are helped. total cycles in shared programs: 180116747 -> 179900405 (-0.12%) cycles in affected programs: 25439823 -> 25223481 (-0.85%) helped: 13817 HURT: 1499 helped stats (abs) min: 1 max: 1886 x̄: 26.40 x̃: 18 helped stats (rel) min: <.01% max: 38.84% x̄: 2.57% x̃: 1.97% HURT stats (abs) min: 1 max: 3684 x̄: 98.99 x̃: 52 HURT stats (rel) min: <.01% max: 97.01% x̄: 6.37% x̃: 3.42% 95% mean confidence interval for cycles value: -15.68 -12.57 95% mean confidence interval for cycles %-change: -1.77% -1.63% Cycles are helped. LOST: 8 GAINED: 10 Sandy Bridge total instructions in shared programs: 10878990 -> 10863659 (-0.14%) instructions in affected programs: 1806702 -> 1791371 (-0.85%) helped: 13023 HURT: 0 helped stats (abs) min: 1 max: 5 x̄: 1.18 x̃: 1 helped stats (rel) min: 0.07% max: 13.79% x̄: 1.65% x̃: 1.10% 95% mean confidence interval for instructions value: -1.18 -1.17 95% mean confidence interval for instructions %-change: -1.68% -1.62% Instructions are helped. total cycles in shared programs: 154082878 -> 153862810 (-0.14%) cycles in affected programs: 20199374 -> 19979306 (-1.09%) helped: 12048 HURT: 510 helped stats (abs) min: 1 max: 323 x̄: 20.57 x̃: 18 helped stats (rel) min: 0.03% max: 17.78% x̄: 2.05% x̃: 1.52% HURT stats (abs) min: 1 max: 448 x̄: 54.39 x̃: 16 HURT stats (rel) min: 0.02% max: 37.98% x̄: 4.13% x̃: 1.17% 95% mean confidence interval for cycles value: -17.97 -17.08 95% mean confidence interval for cycles %-change: -1.84% -1.75% Cycles are helped. LOST: 1 GAINED: 0 Iron Lake total instructions in shared programs: 8155075 -> 8142729 (-0.15%) instructions in affected programs: 949495 -> 937149 (-1.30%) helped: 5810 HURT: 0 helped stats (abs) min: 1 max: 8 x̄: 2.12 x̃: 2 helped stats (rel) min: 0.10% max: 16.67% x̄: 2.53% x̃: 1.85% 95% mean confidence interval for instructions value: -2.14 -2.11 95% mean confidence interval for instructions %-change: -2.59% -2.48% Instructions are helped. total cycles in shared programs: 188584610 -> 188549632 (-0.02%) cycles in affected programs: 17274446 -> 17239468 (-0.20%) helped: 3881 HURT: 90 helped stats (abs) min: 2 max: 168 x̄: 9.08 x̃: 6 helped stats (rel) min: <.01% max: 23.53% x̄: 0.83% x̃: 0.30% HURT stats (abs) min: 2 max: 10 x̄: 2.80 x̃: 2 HURT stats (rel) min: <.01% max: 0.60% x̄: 0.10% x̃: 0.07% 95% mean confidence interval for cycles value: -9.35 -8.27 95% mean confidence interval for cycles %-change: -0.85% -0.77% Cycles are helped. GM45 total instructions in shared programs: 5019308 -> 5013119 (-0.12%) instructions in affected programs: 489028 -> 482839 (-1.27%) helped: 2912 HURT: 0 helped stats (abs) min: 1 max: 8 x̄: 2.13 x̃: 2 helped stats (rel) min: 0.10% max: 16.67% x̄: 2.46% x̃: 1.81% 95% mean confidence interval for instructions value: -2.14 -2.11 95% mean confidence interval for instructions %-change: -2.54% -2.39% Instructions are helped. total cycles in shared programs: 129002592 -> 128977804 (-0.02%) cycles in affected programs: 12669152 -> 12644364 (-0.20%) helped: 2759 HURT: 37 helped stats (abs) min: 2 max: 168 x̄: 9.03 x̃: 4 helped stats (rel) min: <.01% max: 21.43% x̄: 0.75% x̃: 0.31% HURT stats (abs) min: 2 max: 10 x̄: 3.62 x̃: 4 HURT stats (rel) min: <.01% max: 0.41% x̄: 0.10% x̃: 0.04% 95% mean confidence interval for cycles value: -9.53 -8.20 95% mean confidence interval for cycles %-change: -0.79% -0.70% Cycles are helped. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Matt Turner <mattst88@gmail.com>
2019-05-20 17:25:01 -07:00
cmp->predicate = BRW_PREDICATE_NORMAL;
cmp->flag_subreg = sample_mask_flag_subreg(this);
fs_inst *jump = bld.emit(BRW_OPCODE_HALT);
jump->flag_subreg = sample_mask_flag_subreg(this);
jump->predicate_inverse = true;
if (instr->intrinsic == nir_intrinsic_terminate ||
instr->intrinsic == nir_intrinsic_terminate_if) {
jump->predicate = BRW_PREDICATE_NORMAL;
} else {
/* Only jump when the whole quad is demoted. For historical
* reasons this is also used for discard.
*/
jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;
}
if (devinfo->ver < 7)
limit_dispatch_width(
16, "Fragment discard/demote not implemented in SIMD32 mode.\n");
break;
}
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
case nir_intrinsic_load_input: {
/* In Fragment Shaders load_input is used either for flat inputs or
* per-primitive inputs.
*/
assert(instr->def.bit_size == 32);
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
unsigned base = nir_intrinsic_base(instr);
unsigned comp = nir_intrinsic_component(instr);
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
unsigned num_components = instr->num_components;
const struct brw_wm_prog_key *wm_key = (brw_wm_prog_key*) this->key;
if (wm_key->mesh_input == BRW_SOMETIMES) {
assert(devinfo->verx10 >= 125);
/* The FS payload gives us the viewport and layer clamped to valid
* ranges, but the spec for gl_ViewportIndex and gl_Layer includes
* the language:
* the fragment stage will read the same value written by the
* geometry stage, even if that value is out of range.
*
* Which is why these are normally passed as regular attributes.
* This isn't tested anywhere except some GL-only piglit tests
* though, so for the case where the FS may be used against either a
* traditional pipeline or a mesh one, where the position of these
* will change depending on the previous stage, read them from the
* payload to simplify things until the requisite magic is in place.
*/
if (base == VARYING_SLOT_LAYER || base == VARYING_SLOT_VIEWPORT) {
assert(num_components == 1);
fs_reg g1(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD));
unsigned mask, shift_count;
if (base == VARYING_SLOT_LAYER) {
shift_count = 16;
mask = 0x7ff << shift_count;
} else {
shift_count = 27;
mask = 0xf << shift_count;
}
fs_reg vp_or_layer = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.AND(vp_or_layer, g1, brw_imm_ud(mask));
fs_reg shifted_value = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.SHR(shifted_value, vp_or_layer, brw_imm_ud(shift_count));
bld.MOV(offset(dest, bld, 0), retype(shifted_value, dest.type));
break;
}
}
/* TODO(mesh): Multiview. Verify and handle these special cases for Mesh. */
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
/* Special case fields in the VUE header */
if (base == VARYING_SLOT_LAYER)
comp = 1;
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
else if (base == VARYING_SLOT_VIEWPORT)
comp = 2;
if (BITFIELD64_BIT(base) & nir->info.per_primitive_inputs) {
assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
for (unsigned int i = 0; i < num_components; i++) {
bld.MOV(offset(dest, bld, i),
retype(per_primitive_reg(base, comp + i), dest.type));
}
} else {
for (unsigned int i = 0; i < num_components; i++) {
bld.MOV(offset(dest, bld, i),
retype(component(interp_reg(base, comp + i), 3), dest.type));
}
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
}
break;
}
case nir_intrinsic_load_fs_input_interp_deltas: {
assert(stage == MESA_SHADER_FRAGMENT);
assert(nir_src_as_uint(instr->src[0]) == 0);
fs_reg interp = interp_reg(nir_intrinsic_base(instr),
nir_intrinsic_component(instr));
dest.type = BRW_REGISTER_TYPE_F;
bld.MOV(offset(dest, bld, 0), component(interp, 3));
bld.MOV(offset(dest, bld, 1), component(interp, 1));
bld.MOV(offset(dest, bld, 2), component(interp, 0));
break;
}
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
case nir_intrinsic_load_barycentric_pixel:
case nir_intrinsic_load_barycentric_centroid:
case nir_intrinsic_load_barycentric_sample: {
/* Use the delta_xy values computed from the payload */
enum brw_barycentric_mode bary = brw_barycentric_mode(instr);
intel/fs: Switch to standard vector layout for barycentrics at optimization time. This involves permuting the registers of barycentric vectors to have the standard X[0-n] Y[0-n] layout at NIR translation time. Barycentrics are converted to the format expected by the PLN instruction in the lower_barycentrics() pass run after the optimization loop. Main reason is correctness of SIMD32 fragment shaders. The shuffle_from_pln_layout() and shuffle_to_pln_layout() helpers used during NIR translation are busted for SIMD32. This leads to serious corruption at present with INTEL_DEBUG=do32, especially on Gen11+ where these helpers are hit more frequently due to the lack of a hardware PLN instruction. Of course one could have chosen to fix those helpers instead, but there is another far more subtle issue that was reported during review of the SIMD32 fragment shader codegen changes: The SIMD splitting pass currently handles SIMD32 barycentric vectors as if they had the standard X[0-n] Y[0-n] layout, even though they are interleaved for the PLN instruction, which causes incorrect execution masks to be applied to the MOVs unzipping barycentric vectors in cases where a LINTERP instruction occurs under non-uniform control flow. I'm not aware of any conformance regressions due to the latter issue at present, but for our peace of mind let's move the conversion to the PLN layout into the lower_barycentrics() pass run after lower_simd_width(). This leads to the following shader-db improvements (including SIMD32 shaders) in combination with the previous back-end preparation changes -- Without them (especially the copy propagation changes) this would lead to a massive number of regressions. On ICL: total instructions in shared programs: 20662316 -> 20466903 (-0.95%) instructions in affected programs: 10538474 -> 10343061 (-1.85%) helped: 68775 HURT: 6 total spills in shared programs: 8938 -> 8748 (-2.13%) spills in affected programs: 376 -> 186 (-50.53%) helped: 9 HURT: 5 total fills in shared programs: 8965 -> 8663 (-3.37%) fills in affected programs: 965 -> 663 (-31.30%) helped: 9 HURT: 6 LOST: 146 GAINED: 43 On SKL: total instructions in shared programs: 18725867 -> 18614912 (-0.59%) instructions in affected programs: 3876590 -> 3765635 (-2.86%) helped: 27492 HURT: 2 LOST: 191 GAINED: 417 On SNB: total instructions in shared programs: 14573613 -> 13980646 (-4.07%) instructions in affected programs: 5199074 -> 4606107 (-11.41%) helped: 29998 HURT: 0 LOST: 21 GAINED: 30 Results are somewhat less impressive but still significant without SIMD32 fragment shaders enabled. On ICL: total instructions in shared programs: 16148728 -> 16061659 (-0.54%) instructions in affected programs: 6114788 -> 6027719 (-1.42%) helped: 42046 HURT: 6 total spills in shared programs: 8218 -> 8028 (-2.31%) spills in affected programs: 376 -> 186 (-50.53%) helped: 9 HURT: 5 total fills in shared programs: 8953 -> 8651 (-3.37%) fills in affected programs: 965 -> 663 (-31.30%) helped: 9 HURT: 6 LOST: 0 GAINED: 3 On SKL: total instructions in shared programs: 14927994 -> 14926738 (-0.01%) instructions in affected programs: 168850 -> 167594 (-0.74%) helped: 711 HURT: 2 On SNB: total instructions in shared programs: 10770538 -> 10734403 (-0.34%) instructions in affected programs: 2702172 -> 2666037 (-1.34%) helped: 17818 HURT: 0 All of the hurt shaders are either spilling slightly more or emitting additional NOP instructions due to the SIMD16 POW workaround for Gen8-9 combined with differences in scheduling. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2020-01-03 17:08:51 -08:00
const fs_reg srcs[] = { offset(this->delta_xy[bary], bld, 0),
offset(this->delta_xy[bary], bld, 1) };
bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
break;
}
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
case nir_intrinsic_load_barycentric_at_sample: {
const glsl_interp_mode interpolation =
(enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
fs_reg msg_data;
if (nir_src_is_const(instr->src[0])) {
msg_data = brw_imm_ud(nir_src_as_uint(instr->src[0]) << 4);
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
} else {
const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
BRW_REGISTER_TYPE_UD);
const fs_reg sample_id = bld.emit_uniformize(sample_src);
msg_data = component(bld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD), 0);
bld.exec_all().group(1, 0).SHL(msg_data, sample_id, brw_imm_ud(4u));
}
emit_pixel_interpolater_send(bld,
FS_OPCODE_INTERPOLATE_AT_SAMPLE,
dest,
fs_reg(), /* src */
msg_data,
interpolation);
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
break;
}
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
case nir_intrinsic_load_barycentric_at_offset: {
const glsl_interp_mode interpolation =
(enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
if (const_offset) {
assert(nir_src_bit_size(instr->src[0]) == 32);
unsigned off_x = const_offset[0].u32 & 0xf;
unsigned off_y = const_offset[1].u32 & 0xf;
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
emit_pixel_interpolater_send(bld,
FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
dest,
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
fs_reg(), /* src */
brw_imm_ud(off_x | (off_y << 4)),
interpolation);
} else {
fs_reg src = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_D);
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
emit_pixel_interpolater_send(bld,
opcode,
dest,
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
src,
brw_imm_ud(0u),
interpolation);
}
break;
}
case nir_intrinsic_load_frag_coord:
emit_fragcoord_interpolation(dest);
break;
case nir_intrinsic_load_interpolated_input: {
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
assert(instr->src[0].ssa &&
instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
nir_intrinsic_instr *bary_intrinsic =
nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
enum glsl_interp_mode interp_mode =
(enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
fs_reg dst_xy;
if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
intel/fs: Switch to standard vector layout for barycentrics at optimization time. This involves permuting the registers of barycentric vectors to have the standard X[0-n] Y[0-n] layout at NIR translation time. Barycentrics are converted to the format expected by the PLN instruction in the lower_barycentrics() pass run after the optimization loop. Main reason is correctness of SIMD32 fragment shaders. The shuffle_from_pln_layout() and shuffle_to_pln_layout() helpers used during NIR translation are busted for SIMD32. This leads to serious corruption at present with INTEL_DEBUG=do32, especially on Gen11+ where these helpers are hit more frequently due to the lack of a hardware PLN instruction. Of course one could have chosen to fix those helpers instead, but there is another far more subtle issue that was reported during review of the SIMD32 fragment shader codegen changes: The SIMD splitting pass currently handles SIMD32 barycentric vectors as if they had the standard X[0-n] Y[0-n] layout, even though they are interleaved for the PLN instruction, which causes incorrect execution masks to be applied to the MOVs unzipping barycentric vectors in cases where a LINTERP instruction occurs under non-uniform control flow. I'm not aware of any conformance regressions due to the latter issue at present, but for our peace of mind let's move the conversion to the PLN layout into the lower_barycentrics() pass run after lower_simd_width(). This leads to the following shader-db improvements (including SIMD32 shaders) in combination with the previous back-end preparation changes -- Without them (especially the copy propagation changes) this would lead to a massive number of regressions. On ICL: total instructions in shared programs: 20662316 -> 20466903 (-0.95%) instructions in affected programs: 10538474 -> 10343061 (-1.85%) helped: 68775 HURT: 6 total spills in shared programs: 8938 -> 8748 (-2.13%) spills in affected programs: 376 -> 186 (-50.53%) helped: 9 HURT: 5 total fills in shared programs: 8965 -> 8663 (-3.37%) fills in affected programs: 965 -> 663 (-31.30%) helped: 9 HURT: 6 LOST: 146 GAINED: 43 On SKL: total instructions in shared programs: 18725867 -> 18614912 (-0.59%) instructions in affected programs: 3876590 -> 3765635 (-2.86%) helped: 27492 HURT: 2 LOST: 191 GAINED: 417 On SNB: total instructions in shared programs: 14573613 -> 13980646 (-4.07%) instructions in affected programs: 5199074 -> 4606107 (-11.41%) helped: 29998 HURT: 0 LOST: 21 GAINED: 30 Results are somewhat less impressive but still significant without SIMD32 fragment shaders enabled. On ICL: total instructions in shared programs: 16148728 -> 16061659 (-0.54%) instructions in affected programs: 6114788 -> 6027719 (-1.42%) helped: 42046 HURT: 6 total spills in shared programs: 8218 -> 8028 (-2.31%) spills in affected programs: 376 -> 186 (-50.53%) helped: 9 HURT: 5 total fills in shared programs: 8953 -> 8651 (-3.37%) fills in affected programs: 965 -> 663 (-31.30%) helped: 9 HURT: 6 LOST: 0 GAINED: 3 On SKL: total instructions in shared programs: 14927994 -> 14926738 (-0.01%) instructions in affected programs: 168850 -> 167594 (-0.74%) helped: 711 HURT: 2 On SNB: total instructions in shared programs: 10770538 -> 10734403 (-0.34%) instructions in affected programs: 2702172 -> 2666037 (-1.34%) helped: 17818 HURT: 0 All of the hurt shaders are either spilling slightly more or emitting additional NOP instructions due to the SIMD16 POW workaround for Gen8-9 combined with differences in scheduling. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2020-01-03 17:08:51 -08:00
/* Use the result of the PI message. */
dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
} else {
/* Use the delta_xy values computed from the payload */
enum brw_barycentric_mode bary = brw_barycentric_mode(bary_intrinsic);
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
dst_xy = this->delta_xy[bary];
}
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
for (unsigned int i = 0; i < instr->num_components; i++) {
fs_reg interp =
component(interp_reg(nir_intrinsic_base(instr),
nir_intrinsic_component(instr) + i), 0);
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
interp.type = BRW_REGISTER_TYPE_F;
dest.type = BRW_REGISTER_TYPE_F;
if (devinfo->ver < 6 && interp_mode == INTERP_MODE_SMOOTH) {
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
fs_reg tmp = vgrf(glsl_type::float_type);
bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
} else {
bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
}
}
break;
}
i965: Rewrite FS input handling to use the new NIR intrinsics. This eliminates the need to walk the list of input variables, recurse into their types (via logic largely redundant with nir_lower_io), and interpolate all possible inputs up front. The backend no longer has to care about variables at all, which eliminates complications from trying to pack multiple variables into the same location. Instead, each intrinsic specifies exactly what's needed. This should unblock Timothy's work on GL_ARB_enhanced_layouts. Each load_interpolated_input intrinsic corresponds to PLN instructions, while load_barycentric_at_* intrinsics correspond to pixel interpolator messages. The pixel/centroid/sample barycentric intrinsics simply refer to payload fields (delta_xy[]), and don't actually generate any code. Because we use a single intrinsic for both centroid-qualified variables and interpolateAtCentroid(), they become indistinguishable. We stop sending pixel interpolator messages for those, and instead use the payload provided data, which should be considerably faster. On Broadwell: total instructions in shared programs: 9067751 -> 9067570 (-0.00%) instructions in affected programs: 145902 -> 145721 (-0.12%) helped: 422 HURT: 209 total spills in shared programs: 2849 -> 2899 (1.76%) spills in affected programs: 760 -> 810 (6.58%) helped: 0 HURT: 10 total fills in shared programs: 3910 -> 3950 (1.02%) fills in affected programs: 617 -> 657 (6.48%) helped: 0 HURT: 10 LOST: 3 GAINED: 3 The differences mostly appear to be slight changes in MOVs. v2: Use nir_shader_compiler_options::use_interpolated_input_intrinsics flag rather than passing it directly to nir_lower_io. Use the unreachable() macro rather than assert in one place. (Review feedback from Chris Forbes.) Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Chris Forbes <chrisforbes@google.com> Acked-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-12 03:57:25 -07:00
default:
nir_emit_intrinsic(bld, instr);
break;
}
}
void
fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
nir_intrinsic_instr *instr)
{
assert(gl_shader_stage_uses_workgroup(stage));
struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
fs_reg dest;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
dest = get_nir_def(instr->def);
switch (instr->intrinsic) {
case nir_intrinsic_barrier:
if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
nir_emit_intrinsic(bld, instr);
if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
/* The whole workgroup fits in a single HW thread, so all the
* invocations are already executed lock-step. Instead of an actual
* barrier just emit a scheduling fence, that will generate no code.
*/
if (!nir->info.workgroup_size_variable &&
workgroup_size() <= dispatch_width) {
bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE);
break;
}
emit_barrier();
cs_prog_data->uses_barrier = true;
}
break;
case nir_intrinsic_load_subgroup_id:
cs_payload().load_subgroup_id(bld, dest);
break;
case nir_intrinsic_load_local_invocation_id: {
fs_reg val = nir_system_values[SYSTEM_VALUE_LOCAL_INVOCATION_ID];
assert(val.file != BAD_FILE);
dest.type = val.type;
for (unsigned i = 0; i < 3; i++)
bld.MOV(offset(dest, bld, i), offset(val, bld, i));
break;
}
case nir_intrinsic_load_workgroup_id:
case nir_intrinsic_load_workgroup_id_zero_base: {
fs_reg val = nir_system_values[SYSTEM_VALUE_WORKGROUP_ID];
assert(val.file != BAD_FILE);
dest.type = val.type;
for (unsigned i = 0; i < 3; i++)
bld.MOV(offset(dest, bld, i), offset(val, bld, i));
break;
}
case nir_intrinsic_load_num_workgroups: {
assert(instr->def.bit_size == 32);
cs_prog_data->uses_num_work_groups = true;
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(0);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(3); /* num components */
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(0);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
fs_inst *inst =
bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
inst->size_written = 3 * dispatch_width * 4;
break;
}
case nir_intrinsic_shared_atomic:
case nir_intrinsic_shared_atomic_swap:
nir_emit_surface_atomic(bld, instr, brw_imm_ud(GFX7_BTI_SLM),
false /* bindless */);
break;
case nir_intrinsic_load_shared: {
assert(devinfo->ver >= 7);
const unsigned bit_size = instr->def.bit_size;
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM);
fs_reg addr = get_nir_src(instr->src[0]);
int base = nir_intrinsic_base(instr);
if (base) {
fs_reg addr_off = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
bld.ADD(addr_off, addr, brw_imm_d(base));
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off;
} else {
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr;
}
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
/* Make dest unsigned because that's what the temporary will be */
dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
/* Read the vector */
assert(bit_size <= 32);
assert(nir_intrinsic_align(instr) > 0);
if (bit_size == 32 &&
nir_intrinsic_align(instr) >= 4) {
assert(instr->def.num_components <= 4);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
fs_inst *inst =
bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
inst->size_written = instr->num_components * dispatch_width * 4;
} else {
assert(instr->def.num_components == 1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
bld.MOV(dest, subscript(read_result, dest.type, 0));
}
break;
}
case nir_intrinsic_store_shared: {
assert(devinfo->ver >= 7);
const unsigned bit_size = nir_src_bit_size(instr->src[0]);
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM);
fs_reg addr = get_nir_src(instr->src[1]);
int base = nir_intrinsic_base(instr);
if (base) {
fs_reg addr_off = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
bld.ADD(addr_off, addr, brw_imm_d(base));
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off;
} else {
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr;
}
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
/* No point in masking with sample mask, here we're handling compute
* intrinsics.
*/
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
fs_reg data = get_nir_src(instr->src[0]);
data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
assert(bit_size <= 32);
assert(nir_intrinsic_write_mask(instr) ==
(1u << instr->num_components) - 1);
assert(nir_intrinsic_align(instr) > 0);
if (bit_size == 32 &&
nir_intrinsic_align(instr) >= 4) {
assert(nir_src_num_components(instr->src[0]) <= 4);
srcs[SURFACE_LOGICAL_SRC_DATA] = data;
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
} else {
assert(nir_src_num_components(instr->src[0]) == 1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
}
break;
}
case nir_intrinsic_load_workgroup_size: {
/* Should have been lowered by brw_nir_lower_cs_intrinsics() or
* crocus/iris_setup_uniforms() for the variable group size case.
*/
unreachable("Should have been lowered");
break;
}
default:
nir_emit_intrinsic(bld, instr);
break;
}
}
static void
emit_rt_lsc_fence(const fs_builder &bld,
enum lsc_fence_scope scope,
enum lsc_flush_type flush_type)
{
const intel_device_info *devinfo = bld.shader->devinfo;
const fs_builder ubld = bld.exec_all().group(8, 0);
fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, tmp,
brw_imm_ud(0) /* desc */,
brw_imm_ud(0) /* ex_desc */,
brw_vec8_grf(0, 0) /* payload */);
send->sfid = GFX12_SFID_UGM;
send->desc = lsc_fence_msg_desc(devinfo, scope, flush_type, true);
send->mlen = reg_unit(devinfo); /* g0 header */
send->ex_mlen = 0;
/* Temp write for scheduling */
send->size_written = REG_SIZE * reg_unit(devinfo);
send->send_has_side_effects = true;
ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), tmp);
}
void
fs_visitor::nir_emit_bs_intrinsic(const fs_builder &bld,
nir_intrinsic_instr *instr)
{
assert(brw_shader_stage_is_bindless(stage));
const bs_thread_payload &payload = bs_payload();
fs_reg dest;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
dest = get_nir_def(instr->def);
switch (instr->intrinsic) {
case nir_intrinsic_load_btd_global_arg_addr_intel:
bld.MOV(dest, retype(payload.global_arg_ptr, dest.type));
break;
case nir_intrinsic_load_btd_local_arg_addr_intel:
bld.MOV(dest, retype(payload.local_arg_ptr, dest.type));
break;
case nir_intrinsic_load_btd_shader_type_intel:
payload.load_shader_type(bld, dest);
break;
default:
nir_emit_intrinsic(bld, instr);
break;
}
}
static fs_reg
brw_nir_reduction_op_identity(const fs_builder &bld,
nir_op op, brw_reg_type type)
{
nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
switch (type_sz(type)) {
case 1:
if (type == BRW_REGISTER_TYPE_UB) {
return brw_imm_uw(value.u8);
} else {
assert(type == BRW_REGISTER_TYPE_B);
return brw_imm_w(value.i8);
}
case 2:
return retype(brw_imm_uw(value.u16), type);
case 4:
return retype(brw_imm_ud(value.u32), type);
case 8:
if (type == BRW_REGISTER_TYPE_DF)
return setup_imm_df(bld, value.f64);
else
return retype(brw_imm_u64(value.u64), type);
default:
unreachable("Invalid type size");
}
}
static opcode
brw_op_for_nir_reduction_op(nir_op op)
{
switch (op) {
case nir_op_iadd: return BRW_OPCODE_ADD;
case nir_op_fadd: return BRW_OPCODE_ADD;
case nir_op_imul: return BRW_OPCODE_MUL;
case nir_op_fmul: return BRW_OPCODE_MUL;
case nir_op_imin: return BRW_OPCODE_SEL;
case nir_op_umin: return BRW_OPCODE_SEL;
case nir_op_fmin: return BRW_OPCODE_SEL;
case nir_op_imax: return BRW_OPCODE_SEL;
case nir_op_umax: return BRW_OPCODE_SEL;
case nir_op_fmax: return BRW_OPCODE_SEL;
case nir_op_iand: return BRW_OPCODE_AND;
case nir_op_ior: return BRW_OPCODE_OR;
case nir_op_ixor: return BRW_OPCODE_XOR;
default:
unreachable("Invalid reduction operation");
}
}
static brw_conditional_mod
brw_cond_mod_for_nir_reduction_op(nir_op op)
{
switch (op) {
case nir_op_iadd: return BRW_CONDITIONAL_NONE;
case nir_op_fadd: return BRW_CONDITIONAL_NONE;
case nir_op_imul: return BRW_CONDITIONAL_NONE;
case nir_op_fmul: return BRW_CONDITIONAL_NONE;
case nir_op_imin: return BRW_CONDITIONAL_L;
case nir_op_umin: return BRW_CONDITIONAL_L;
case nir_op_fmin: return BRW_CONDITIONAL_L;
case nir_op_imax: return BRW_CONDITIONAL_GE;
case nir_op_umax: return BRW_CONDITIONAL_GE;
case nir_op_fmax: return BRW_CONDITIONAL_GE;
case nir_op_iand: return BRW_CONDITIONAL_NONE;
case nir_op_ior: return BRW_CONDITIONAL_NONE;
case nir_op_ixor: return BRW_CONDITIONAL_NONE;
default:
unreachable("Invalid reduction operation");
}
}
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
struct rebuild_resource {
unsigned idx;
std::vector<nir_def *> array;
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
};
static bool
add_rebuild_src(nir_src *src, void *state)
{
struct rebuild_resource *res = (struct rebuild_resource *) state;
for (nir_def *def : res->array) {
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
if (def == src->ssa)
return true;
}
nir_foreach_src(src->ssa->parent_instr, add_rebuild_src, state);
res->array.push_back(src->ssa);
return true;
}
fs_reg
fs_visitor::try_rebuild_resource(const brw::fs_builder &bld, nir_def *resource_def)
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
{
/* Create a build at the location of the resource_intel intrinsic */
intel/fs: switch from SIMD 1 to 8 instructions surface/sampler rematerialization SIMD1 instructions are problematic because they are considered partial writes. This increases the liveness of the destination register written by those instructions. To workaround this we use UNDEF instructions to bound the liveness of the register. But this causing other issues like in this case : undef(1) vgrf2 mov(1) vgrf2, u4.0 add(1) vgrf3, vgrf2.0, 64UD In this case the copy propagation pass in unable to see that vgrf2 in the add() instruction can be replaced with the uniform u4.0. To fix this problem, we switch NoMask SIMD8 instructions that cover the entire register. We can drop the UNDEF instructions and now copy propagation can do its job. Good results on 2 apps : Cyberpunk 2077 : Totals from 7258 (68.80% of 10549) affected shaders: Instrs: 6332210 -> 6073833 (-4.08%); split: -4.11%, +0.03% Cycles: 130667501 -> 127351268 (-2.54%); split: -3.12%, +0.58% Subgroup size: 90320 -> 90400 (+0.09%) Spill count: 90 -> 68 (-24.44%) Fill count: 82 -> 64 (-21.95%) Scratch Memory Size: 8192 -> 6144 (-25.00%) Max live registers: 385464 -> 375152 (-2.68%) Max dispatch width: 64336 -> 64424 (+0.14%); split: +0.96%, -0.82% Gaining 60 SIMD16/SIMD32 shaders, loosing 33 Strange Brigade : Totals from 2137 (53.12% of 4023) affected shaders: Instrs: 1544031 -> 1457544 (-5.60%); split: -5.60%, +0.00% Cycles: 22292564 -> 21868978 (-1.90%); split: -2.43%, +0.53% Subgroup size: 25328 -> 25344 (+0.06%) Max live registers: 113716 -> 111214 (-2.20%) Max dispatch width: 17232 -> 18608 (+7.99%); split: +8.36%, -0.37% Gaining 138 SIMD16/SIMD32 shaders, loosing 4 On app slightly negatively affected : Dota2 : Totals from 232 (14.73% of 1575) affected shaders: Instrs: 30029 -> 28194 (-6.11%) Cycles: 385155 -> 371422 (-3.57%); split: -3.59%, +0.02% Max live registers: 6792 -> 6780 (-0.18%) Max dispatch width: 2256 -> 2160 (-4.26%) Loosing 6 SIMD32 shaders Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24554>
2023-08-07 14:03:57 +03:00
fs_builder ubld8 = bld.exec_all().group(8, 0);
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
struct rebuild_resource resources = {};
resources.idx = 0;
if (!nir_foreach_src(resource_def->parent_instr,
add_rebuild_src, &resources))
return fs_reg();
resources.array.push_back(resource_def);
if (resources.array.size() == 1) {
nir_def *def = resources.array[0];
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
if (def->parent_instr->type == nir_instr_type_load_const) {
nir_load_const_instr *load_const =
nir_instr_as_load_const(def->parent_instr);
return brw_imm_ud(load_const->value[0].i32);
} else {
assert(def->parent_instr->type == nir_instr_type_intrinsic &&
(nir_instr_as_intrinsic(def->parent_instr)->intrinsic ==
nir_intrinsic_load_uniform));
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(def->parent_instr);
unsigned base_offset = nir_intrinsic_base(intrin);
unsigned load_offset = nir_src_as_uint(intrin->src[0]);
fs_reg src(UNIFORM, base_offset / 4, BRW_REGISTER_TYPE_UD);
src.offset = load_offset + base_offset % 4;
return src;
}
}
for (unsigned i = 0; i < resources.array.size(); i++) {
nir_def *def = resources.array[i];
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
nir_instr *instr = def->parent_instr;
switch (instr->type) {
case nir_instr_type_load_const: {
nir_load_const_instr *load_const =
nir_instr_as_load_const(instr);
intel/fs: switch from SIMD 1 to 8 instructions surface/sampler rematerialization SIMD1 instructions are problematic because they are considered partial writes. This increases the liveness of the destination register written by those instructions. To workaround this we use UNDEF instructions to bound the liveness of the register. But this causing other issues like in this case : undef(1) vgrf2 mov(1) vgrf2, u4.0 add(1) vgrf3, vgrf2.0, 64UD In this case the copy propagation pass in unable to see that vgrf2 in the add() instruction can be replaced with the uniform u4.0. To fix this problem, we switch NoMask SIMD8 instructions that cover the entire register. We can drop the UNDEF instructions and now copy propagation can do its job. Good results on 2 apps : Cyberpunk 2077 : Totals from 7258 (68.80% of 10549) affected shaders: Instrs: 6332210 -> 6073833 (-4.08%); split: -4.11%, +0.03% Cycles: 130667501 -> 127351268 (-2.54%); split: -3.12%, +0.58% Subgroup size: 90320 -> 90400 (+0.09%) Spill count: 90 -> 68 (-24.44%) Fill count: 82 -> 64 (-21.95%) Scratch Memory Size: 8192 -> 6144 (-25.00%) Max live registers: 385464 -> 375152 (-2.68%) Max dispatch width: 64336 -> 64424 (+0.14%); split: +0.96%, -0.82% Gaining 60 SIMD16/SIMD32 shaders, loosing 33 Strange Brigade : Totals from 2137 (53.12% of 4023) affected shaders: Instrs: 1544031 -> 1457544 (-5.60%); split: -5.60%, +0.00% Cycles: 22292564 -> 21868978 (-1.90%); split: -2.43%, +0.53% Subgroup size: 25328 -> 25344 (+0.06%) Max live registers: 113716 -> 111214 (-2.20%) Max dispatch width: 17232 -> 18608 (+7.99%); split: +8.36%, -0.37% Gaining 138 SIMD16/SIMD32 shaders, loosing 4 On app slightly negatively affected : Dota2 : Totals from 232 (14.73% of 1575) affected shaders: Instrs: 30029 -> 28194 (-6.11%) Cycles: 385155 -> 371422 (-3.57%); split: -3.59%, +0.02% Max live registers: 6792 -> 6780 (-0.18%) Max dispatch width: 2256 -> 2160 (-4.26%) Loosing 6 SIMD32 shaders Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24554>
2023-08-07 14:03:57 +03:00
fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
nir_resource_insts[def->index] =
intel/fs: switch from SIMD 1 to 8 instructions surface/sampler rematerialization SIMD1 instructions are problematic because they are considered partial writes. This increases the liveness of the destination register written by those instructions. To workaround this we use UNDEF instructions to bound the liveness of the register. But this causing other issues like in this case : undef(1) vgrf2 mov(1) vgrf2, u4.0 add(1) vgrf3, vgrf2.0, 64UD In this case the copy propagation pass in unable to see that vgrf2 in the add() instruction can be replaced with the uniform u4.0. To fix this problem, we switch NoMask SIMD8 instructions that cover the entire register. We can drop the UNDEF instructions and now copy propagation can do its job. Good results on 2 apps : Cyberpunk 2077 : Totals from 7258 (68.80% of 10549) affected shaders: Instrs: 6332210 -> 6073833 (-4.08%); split: -4.11%, +0.03% Cycles: 130667501 -> 127351268 (-2.54%); split: -3.12%, +0.58% Subgroup size: 90320 -> 90400 (+0.09%) Spill count: 90 -> 68 (-24.44%) Fill count: 82 -> 64 (-21.95%) Scratch Memory Size: 8192 -> 6144 (-25.00%) Max live registers: 385464 -> 375152 (-2.68%) Max dispatch width: 64336 -> 64424 (+0.14%); split: +0.96%, -0.82% Gaining 60 SIMD16/SIMD32 shaders, loosing 33 Strange Brigade : Totals from 2137 (53.12% of 4023) affected shaders: Instrs: 1544031 -> 1457544 (-5.60%); split: -5.60%, +0.00% Cycles: 22292564 -> 21868978 (-1.90%); split: -2.43%, +0.53% Subgroup size: 25328 -> 25344 (+0.06%) Max live registers: 113716 -> 111214 (-2.20%) Max dispatch width: 17232 -> 18608 (+7.99%); split: +8.36%, -0.37% Gaining 138 SIMD16/SIMD32 shaders, loosing 4 On app slightly negatively affected : Dota2 : Totals from 232 (14.73% of 1575) affected shaders: Instrs: 30029 -> 28194 (-6.11%) Cycles: 385155 -> 371422 (-3.57%); split: -3.59%, +0.02% Max live registers: 6792 -> 6780 (-0.18%) Max dispatch width: 2256 -> 2160 (-4.26%) Loosing 6 SIMD32 shaders Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24554>
2023-08-07 14:03:57 +03:00
ubld8.MOV(dst, brw_imm_ud(load_const->value[0].i32));
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
break;
}
case nir_instr_type_alu: {
nir_alu_instr *alu = nir_instr_as_alu(instr);
if (nir_op_infos[alu->op].num_inputs == 2) {
if (alu->src[0].swizzle[0] != 0 ||
alu->src[1].swizzle[0] != 0)
break;
} else if (nir_op_infos[alu->op].num_inputs == 3) {
if (alu->src[0].swizzle[0] != 0 ||
alu->src[1].swizzle[0] != 0 ||
alu->src[2].swizzle[0] != 0)
break;
} else {
/* Not supported ALU input count */
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
break;
}
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
switch (alu->op) {
case nir_op_iadd: {
intel/fs: switch from SIMD 1 to 8 instructions surface/sampler rematerialization SIMD1 instructions are problematic because they are considered partial writes. This increases the liveness of the destination register written by those instructions. To workaround this we use UNDEF instructions to bound the liveness of the register. But this causing other issues like in this case : undef(1) vgrf2 mov(1) vgrf2, u4.0 add(1) vgrf3, vgrf2.0, 64UD In this case the copy propagation pass in unable to see that vgrf2 in the add() instruction can be replaced with the uniform u4.0. To fix this problem, we switch NoMask SIMD8 instructions that cover the entire register. We can drop the UNDEF instructions and now copy propagation can do its job. Good results on 2 apps : Cyberpunk 2077 : Totals from 7258 (68.80% of 10549) affected shaders: Instrs: 6332210 -> 6073833 (-4.08%); split: -4.11%, +0.03% Cycles: 130667501 -> 127351268 (-2.54%); split: -3.12%, +0.58% Subgroup size: 90320 -> 90400 (+0.09%) Spill count: 90 -> 68 (-24.44%) Fill count: 82 -> 64 (-21.95%) Scratch Memory Size: 8192 -> 6144 (-25.00%) Max live registers: 385464 -> 375152 (-2.68%) Max dispatch width: 64336 -> 64424 (+0.14%); split: +0.96%, -0.82% Gaining 60 SIMD16/SIMD32 shaders, loosing 33 Strange Brigade : Totals from 2137 (53.12% of 4023) affected shaders: Instrs: 1544031 -> 1457544 (-5.60%); split: -5.60%, +0.00% Cycles: 22292564 -> 21868978 (-1.90%); split: -2.43%, +0.53% Subgroup size: 25328 -> 25344 (+0.06%) Max live registers: 113716 -> 111214 (-2.20%) Max dispatch width: 17232 -> 18608 (+7.99%); split: +8.36%, -0.37% Gaining 138 SIMD16/SIMD32 shaders, loosing 4 On app slightly negatively affected : Dota2 : Totals from 232 (14.73% of 1575) affected shaders: Instrs: 30029 -> 28194 (-6.11%) Cycles: 385155 -> 371422 (-3.57%); split: -3.59%, +0.02% Max live registers: 6792 -> 6780 (-0.18%) Max dispatch width: 2256 -> 2160 (-4.26%) Loosing 6 SIMD32 shaders Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24554>
2023-08-07 14:03:57 +03:00
fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
fs_reg src0 = nir_resource_insts[alu->src[0].src.ssa->index]->dst;
fs_reg src1 = nir_resource_insts[alu->src[1].src.ssa->index]->dst;
assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
assert(src0.type == BRW_REGISTER_TYPE_UD);
nir_resource_insts[def->index] =
intel/fs: switch from SIMD 1 to 8 instructions surface/sampler rematerialization SIMD1 instructions are problematic because they are considered partial writes. This increases the liveness of the destination register written by those instructions. To workaround this we use UNDEF instructions to bound the liveness of the register. But this causing other issues like in this case : undef(1) vgrf2 mov(1) vgrf2, u4.0 add(1) vgrf3, vgrf2.0, 64UD In this case the copy propagation pass in unable to see that vgrf2 in the add() instruction can be replaced with the uniform u4.0. To fix this problem, we switch NoMask SIMD8 instructions that cover the entire register. We can drop the UNDEF instructions and now copy propagation can do its job. Good results on 2 apps : Cyberpunk 2077 : Totals from 7258 (68.80% of 10549) affected shaders: Instrs: 6332210 -> 6073833 (-4.08%); split: -4.11%, +0.03% Cycles: 130667501 -> 127351268 (-2.54%); split: -3.12%, +0.58% Subgroup size: 90320 -> 90400 (+0.09%) Spill count: 90 -> 68 (-24.44%) Fill count: 82 -> 64 (-21.95%) Scratch Memory Size: 8192 -> 6144 (-25.00%) Max live registers: 385464 -> 375152 (-2.68%) Max dispatch width: 64336 -> 64424 (+0.14%); split: +0.96%, -0.82% Gaining 60 SIMD16/SIMD32 shaders, loosing 33 Strange Brigade : Totals from 2137 (53.12% of 4023) affected shaders: Instrs: 1544031 -> 1457544 (-5.60%); split: -5.60%, +0.00% Cycles: 22292564 -> 21868978 (-1.90%); split: -2.43%, +0.53% Subgroup size: 25328 -> 25344 (+0.06%) Max live registers: 113716 -> 111214 (-2.20%) Max dispatch width: 17232 -> 18608 (+7.99%); split: +8.36%, -0.37% Gaining 138 SIMD16/SIMD32 shaders, loosing 4 On app slightly negatively affected : Dota2 : Totals from 232 (14.73% of 1575) affected shaders: Instrs: 30029 -> 28194 (-6.11%) Cycles: 385155 -> 371422 (-3.57%); split: -3.59%, +0.02% Max live registers: 6792 -> 6780 (-0.18%) Max dispatch width: 2256 -> 2160 (-4.26%) Loosing 6 SIMD32 shaders Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24554>
2023-08-07 14:03:57 +03:00
ubld8.ADD(dst,
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
src0.file != IMM ? src0 : src1,
src0.file != IMM ? src1 : src0);
break;
}
case nir_op_iadd3: {
intel/fs: switch from SIMD 1 to 8 instructions surface/sampler rematerialization SIMD1 instructions are problematic because they are considered partial writes. This increases the liveness of the destination register written by those instructions. To workaround this we use UNDEF instructions to bound the liveness of the register. But this causing other issues like in this case : undef(1) vgrf2 mov(1) vgrf2, u4.0 add(1) vgrf3, vgrf2.0, 64UD In this case the copy propagation pass in unable to see that vgrf2 in the add() instruction can be replaced with the uniform u4.0. To fix this problem, we switch NoMask SIMD8 instructions that cover the entire register. We can drop the UNDEF instructions and now copy propagation can do its job. Good results on 2 apps : Cyberpunk 2077 : Totals from 7258 (68.80% of 10549) affected shaders: Instrs: 6332210 -> 6073833 (-4.08%); split: -4.11%, +0.03% Cycles: 130667501 -> 127351268 (-2.54%); split: -3.12%, +0.58% Subgroup size: 90320 -> 90400 (+0.09%) Spill count: 90 -> 68 (-24.44%) Fill count: 82 -> 64 (-21.95%) Scratch Memory Size: 8192 -> 6144 (-25.00%) Max live registers: 385464 -> 375152 (-2.68%) Max dispatch width: 64336 -> 64424 (+0.14%); split: +0.96%, -0.82% Gaining 60 SIMD16/SIMD32 shaders, loosing 33 Strange Brigade : Totals from 2137 (53.12% of 4023) affected shaders: Instrs: 1544031 -> 1457544 (-5.60%); split: -5.60%, +0.00% Cycles: 22292564 -> 21868978 (-1.90%); split: -2.43%, +0.53% Subgroup size: 25328 -> 25344 (+0.06%) Max live registers: 113716 -> 111214 (-2.20%) Max dispatch width: 17232 -> 18608 (+7.99%); split: +8.36%, -0.37% Gaining 138 SIMD16/SIMD32 shaders, loosing 4 On app slightly negatively affected : Dota2 : Totals from 232 (14.73% of 1575) affected shaders: Instrs: 30029 -> 28194 (-6.11%) Cycles: 385155 -> 371422 (-3.57%); split: -3.59%, +0.02% Max live registers: 6792 -> 6780 (-0.18%) Max dispatch width: 2256 -> 2160 (-4.26%) Loosing 6 SIMD32 shaders Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24554>
2023-08-07 14:03:57 +03:00
fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
fs_reg src0 = nir_resource_insts[alu->src[0].src.ssa->index]->dst;
fs_reg src1 = nir_resource_insts[alu->src[1].src.ssa->index]->dst;
fs_reg src2 = nir_resource_insts[alu->src[2].src.ssa->index]->dst;
assert(src0.file != BAD_FILE && src1.file != BAD_FILE && src2.file != BAD_FILE);
assert(src0.type == BRW_REGISTER_TYPE_UD);
nir_resource_insts[def->index] =
intel/fs: switch from SIMD 1 to 8 instructions surface/sampler rematerialization SIMD1 instructions are problematic because they are considered partial writes. This increases the liveness of the destination register written by those instructions. To workaround this we use UNDEF instructions to bound the liveness of the register. But this causing other issues like in this case : undef(1) vgrf2 mov(1) vgrf2, u4.0 add(1) vgrf3, vgrf2.0, 64UD In this case the copy propagation pass in unable to see that vgrf2 in the add() instruction can be replaced with the uniform u4.0. To fix this problem, we switch NoMask SIMD8 instructions that cover the entire register. We can drop the UNDEF instructions and now copy propagation can do its job. Good results on 2 apps : Cyberpunk 2077 : Totals from 7258 (68.80% of 10549) affected shaders: Instrs: 6332210 -> 6073833 (-4.08%); split: -4.11%, +0.03% Cycles: 130667501 -> 127351268 (-2.54%); split: -3.12%, +0.58% Subgroup size: 90320 -> 90400 (+0.09%) Spill count: 90 -> 68 (-24.44%) Fill count: 82 -> 64 (-21.95%) Scratch Memory Size: 8192 -> 6144 (-25.00%) Max live registers: 385464 -> 375152 (-2.68%) Max dispatch width: 64336 -> 64424 (+0.14%); split: +0.96%, -0.82% Gaining 60 SIMD16/SIMD32 shaders, loosing 33 Strange Brigade : Totals from 2137 (53.12% of 4023) affected shaders: Instrs: 1544031 -> 1457544 (-5.60%); split: -5.60%, +0.00% Cycles: 22292564 -> 21868978 (-1.90%); split: -2.43%, +0.53% Subgroup size: 25328 -> 25344 (+0.06%) Max live registers: 113716 -> 111214 (-2.20%) Max dispatch width: 17232 -> 18608 (+7.99%); split: +8.36%, -0.37% Gaining 138 SIMD16/SIMD32 shaders, loosing 4 On app slightly negatively affected : Dota2 : Totals from 232 (14.73% of 1575) affected shaders: Instrs: 30029 -> 28194 (-6.11%) Cycles: 385155 -> 371422 (-3.57%); split: -3.59%, +0.02% Max live registers: 6792 -> 6780 (-0.18%) Max dispatch width: 2256 -> 2160 (-4.26%) Loosing 6 SIMD32 shaders Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24554>
2023-08-07 14:03:57 +03:00
ubld8.ADD3(dst,
src1.file == IMM ? src1 : src0,
src1.file == IMM ? src0 : src1,
src2);
break;
}
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
case nir_op_ushr: {
intel/fs: switch from SIMD 1 to 8 instructions surface/sampler rematerialization SIMD1 instructions are problematic because they are considered partial writes. This increases the liveness of the destination register written by those instructions. To workaround this we use UNDEF instructions to bound the liveness of the register. But this causing other issues like in this case : undef(1) vgrf2 mov(1) vgrf2, u4.0 add(1) vgrf3, vgrf2.0, 64UD In this case the copy propagation pass in unable to see that vgrf2 in the add() instruction can be replaced with the uniform u4.0. To fix this problem, we switch NoMask SIMD8 instructions that cover the entire register. We can drop the UNDEF instructions and now copy propagation can do its job. Good results on 2 apps : Cyberpunk 2077 : Totals from 7258 (68.80% of 10549) affected shaders: Instrs: 6332210 -> 6073833 (-4.08%); split: -4.11%, +0.03% Cycles: 130667501 -> 127351268 (-2.54%); split: -3.12%, +0.58% Subgroup size: 90320 -> 90400 (+0.09%) Spill count: 90 -> 68 (-24.44%) Fill count: 82 -> 64 (-21.95%) Scratch Memory Size: 8192 -> 6144 (-25.00%) Max live registers: 385464 -> 375152 (-2.68%) Max dispatch width: 64336 -> 64424 (+0.14%); split: +0.96%, -0.82% Gaining 60 SIMD16/SIMD32 shaders, loosing 33 Strange Brigade : Totals from 2137 (53.12% of 4023) affected shaders: Instrs: 1544031 -> 1457544 (-5.60%); split: -5.60%, +0.00% Cycles: 22292564 -> 21868978 (-1.90%); split: -2.43%, +0.53% Subgroup size: 25328 -> 25344 (+0.06%) Max live registers: 113716 -> 111214 (-2.20%) Max dispatch width: 17232 -> 18608 (+7.99%); split: +8.36%, -0.37% Gaining 138 SIMD16/SIMD32 shaders, loosing 4 On app slightly negatively affected : Dota2 : Totals from 232 (14.73% of 1575) affected shaders: Instrs: 30029 -> 28194 (-6.11%) Cycles: 385155 -> 371422 (-3.57%); split: -3.59%, +0.02% Max live registers: 6792 -> 6780 (-0.18%) Max dispatch width: 2256 -> 2160 (-4.26%) Loosing 6 SIMD32 shaders Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24554>
2023-08-07 14:03:57 +03:00
fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
fs_reg src0 = nir_resource_insts[alu->src[0].src.ssa->index]->dst;
fs_reg src1 = nir_resource_insts[alu->src[1].src.ssa->index]->dst;
assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
assert(src0.type == BRW_REGISTER_TYPE_UD);
intel/fs: switch from SIMD 1 to 8 instructions surface/sampler rematerialization SIMD1 instructions are problematic because they are considered partial writes. This increases the liveness of the destination register written by those instructions. To workaround this we use UNDEF instructions to bound the liveness of the register. But this causing other issues like in this case : undef(1) vgrf2 mov(1) vgrf2, u4.0 add(1) vgrf3, vgrf2.0, 64UD In this case the copy propagation pass in unable to see that vgrf2 in the add() instruction can be replaced with the uniform u4.0. To fix this problem, we switch NoMask SIMD8 instructions that cover the entire register. We can drop the UNDEF instructions and now copy propagation can do its job. Good results on 2 apps : Cyberpunk 2077 : Totals from 7258 (68.80% of 10549) affected shaders: Instrs: 6332210 -> 6073833 (-4.08%); split: -4.11%, +0.03% Cycles: 130667501 -> 127351268 (-2.54%); split: -3.12%, +0.58% Subgroup size: 90320 -> 90400 (+0.09%) Spill count: 90 -> 68 (-24.44%) Fill count: 82 -> 64 (-21.95%) Scratch Memory Size: 8192 -> 6144 (-25.00%) Max live registers: 385464 -> 375152 (-2.68%) Max dispatch width: 64336 -> 64424 (+0.14%); split: +0.96%, -0.82% Gaining 60 SIMD16/SIMD32 shaders, loosing 33 Strange Brigade : Totals from 2137 (53.12% of 4023) affected shaders: Instrs: 1544031 -> 1457544 (-5.60%); split: -5.60%, +0.00% Cycles: 22292564 -> 21868978 (-1.90%); split: -2.43%, +0.53% Subgroup size: 25328 -> 25344 (+0.06%) Max live registers: 113716 -> 111214 (-2.20%) Max dispatch width: 17232 -> 18608 (+7.99%); split: +8.36%, -0.37% Gaining 138 SIMD16/SIMD32 shaders, loosing 4 On app slightly negatively affected : Dota2 : Totals from 232 (14.73% of 1575) affected shaders: Instrs: 30029 -> 28194 (-6.11%) Cycles: 385155 -> 371422 (-3.57%); split: -3.59%, +0.02% Max live registers: 6792 -> 6780 (-0.18%) Max dispatch width: 2256 -> 2160 (-4.26%) Loosing 6 SIMD32 shaders Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24554>
2023-08-07 14:03:57 +03:00
nir_resource_insts[def->index] = ubld8.SHR(dst, src0, src1);
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
break;
}
case nir_op_ishl: {
intel/fs: switch from SIMD 1 to 8 instructions surface/sampler rematerialization SIMD1 instructions are problematic because they are considered partial writes. This increases the liveness of the destination register written by those instructions. To workaround this we use UNDEF instructions to bound the liveness of the register. But this causing other issues like in this case : undef(1) vgrf2 mov(1) vgrf2, u4.0 add(1) vgrf3, vgrf2.0, 64UD In this case the copy propagation pass in unable to see that vgrf2 in the add() instruction can be replaced with the uniform u4.0. To fix this problem, we switch NoMask SIMD8 instructions that cover the entire register. We can drop the UNDEF instructions and now copy propagation can do its job. Good results on 2 apps : Cyberpunk 2077 : Totals from 7258 (68.80% of 10549) affected shaders: Instrs: 6332210 -> 6073833 (-4.08%); split: -4.11%, +0.03% Cycles: 130667501 -> 127351268 (-2.54%); split: -3.12%, +0.58% Subgroup size: 90320 -> 90400 (+0.09%) Spill count: 90 -> 68 (-24.44%) Fill count: 82 -> 64 (-21.95%) Scratch Memory Size: 8192 -> 6144 (-25.00%) Max live registers: 385464 -> 375152 (-2.68%) Max dispatch width: 64336 -> 64424 (+0.14%); split: +0.96%, -0.82% Gaining 60 SIMD16/SIMD32 shaders, loosing 33 Strange Brigade : Totals from 2137 (53.12% of 4023) affected shaders: Instrs: 1544031 -> 1457544 (-5.60%); split: -5.60%, +0.00% Cycles: 22292564 -> 21868978 (-1.90%); split: -2.43%, +0.53% Subgroup size: 25328 -> 25344 (+0.06%) Max live registers: 113716 -> 111214 (-2.20%) Max dispatch width: 17232 -> 18608 (+7.99%); split: +8.36%, -0.37% Gaining 138 SIMD16/SIMD32 shaders, loosing 4 On app slightly negatively affected : Dota2 : Totals from 232 (14.73% of 1575) affected shaders: Instrs: 30029 -> 28194 (-6.11%) Cycles: 385155 -> 371422 (-3.57%); split: -3.59%, +0.02% Max live registers: 6792 -> 6780 (-0.18%) Max dispatch width: 2256 -> 2160 (-4.26%) Loosing 6 SIMD32 shaders Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24554>
2023-08-07 14:03:57 +03:00
fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
fs_reg src0 = nir_resource_insts[alu->src[0].src.ssa->index]->dst;
fs_reg src1 = nir_resource_insts[alu->src[1].src.ssa->index]->dst;
assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
assert(src0.type == BRW_REGISTER_TYPE_UD);
intel/fs: switch from SIMD 1 to 8 instructions surface/sampler rematerialization SIMD1 instructions are problematic because they are considered partial writes. This increases the liveness of the destination register written by those instructions. To workaround this we use UNDEF instructions to bound the liveness of the register. But this causing other issues like in this case : undef(1) vgrf2 mov(1) vgrf2, u4.0 add(1) vgrf3, vgrf2.0, 64UD In this case the copy propagation pass in unable to see that vgrf2 in the add() instruction can be replaced with the uniform u4.0. To fix this problem, we switch NoMask SIMD8 instructions that cover the entire register. We can drop the UNDEF instructions and now copy propagation can do its job. Good results on 2 apps : Cyberpunk 2077 : Totals from 7258 (68.80% of 10549) affected shaders: Instrs: 6332210 -> 6073833 (-4.08%); split: -4.11%, +0.03% Cycles: 130667501 -> 127351268 (-2.54%); split: -3.12%, +0.58% Subgroup size: 90320 -> 90400 (+0.09%) Spill count: 90 -> 68 (-24.44%) Fill count: 82 -> 64 (-21.95%) Scratch Memory Size: 8192 -> 6144 (-25.00%) Max live registers: 385464 -> 375152 (-2.68%) Max dispatch width: 64336 -> 64424 (+0.14%); split: +0.96%, -0.82% Gaining 60 SIMD16/SIMD32 shaders, loosing 33 Strange Brigade : Totals from 2137 (53.12% of 4023) affected shaders: Instrs: 1544031 -> 1457544 (-5.60%); split: -5.60%, +0.00% Cycles: 22292564 -> 21868978 (-1.90%); split: -2.43%, +0.53% Subgroup size: 25328 -> 25344 (+0.06%) Max live registers: 113716 -> 111214 (-2.20%) Max dispatch width: 17232 -> 18608 (+7.99%); split: +8.36%, -0.37% Gaining 138 SIMD16/SIMD32 shaders, loosing 4 On app slightly negatively affected : Dota2 : Totals from 232 (14.73% of 1575) affected shaders: Instrs: 30029 -> 28194 (-6.11%) Cycles: 385155 -> 371422 (-3.57%); split: -3.59%, +0.02% Max live registers: 6792 -> 6780 (-0.18%) Max dispatch width: 2256 -> 2160 (-4.26%) Loosing 6 SIMD32 shaders Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24554>
2023-08-07 14:03:57 +03:00
nir_resource_insts[def->index] = ubld8.SHL(dst, src0, src1);
break;
}
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
case nir_op_mov: {
break;
}
default:
break;
}
break;
}
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_resource_intel:
nir_resource_insts[def->index] =
nir_resource_insts[intrin->src[1].ssa->index];
break;
case nir_intrinsic_load_uniform: {
if (!nir_src_is_const(intrin->src[0]))
break;
unsigned base_offset = nir_intrinsic_base(intrin);
unsigned load_offset = nir_src_as_uint(intrin->src[0]);
intel/fs: switch from SIMD 1 to 8 instructions surface/sampler rematerialization SIMD1 instructions are problematic because they are considered partial writes. This increases the liveness of the destination register written by those instructions. To workaround this we use UNDEF instructions to bound the liveness of the register. But this causing other issues like in this case : undef(1) vgrf2 mov(1) vgrf2, u4.0 add(1) vgrf3, vgrf2.0, 64UD In this case the copy propagation pass in unable to see that vgrf2 in the add() instruction can be replaced with the uniform u4.0. To fix this problem, we switch NoMask SIMD8 instructions that cover the entire register. We can drop the UNDEF instructions and now copy propagation can do its job. Good results on 2 apps : Cyberpunk 2077 : Totals from 7258 (68.80% of 10549) affected shaders: Instrs: 6332210 -> 6073833 (-4.08%); split: -4.11%, +0.03% Cycles: 130667501 -> 127351268 (-2.54%); split: -3.12%, +0.58% Subgroup size: 90320 -> 90400 (+0.09%) Spill count: 90 -> 68 (-24.44%) Fill count: 82 -> 64 (-21.95%) Scratch Memory Size: 8192 -> 6144 (-25.00%) Max live registers: 385464 -> 375152 (-2.68%) Max dispatch width: 64336 -> 64424 (+0.14%); split: +0.96%, -0.82% Gaining 60 SIMD16/SIMD32 shaders, loosing 33 Strange Brigade : Totals from 2137 (53.12% of 4023) affected shaders: Instrs: 1544031 -> 1457544 (-5.60%); split: -5.60%, +0.00% Cycles: 22292564 -> 21868978 (-1.90%); split: -2.43%, +0.53% Subgroup size: 25328 -> 25344 (+0.06%) Max live registers: 113716 -> 111214 (-2.20%) Max dispatch width: 17232 -> 18608 (+7.99%); split: +8.36%, -0.37% Gaining 138 SIMD16/SIMD32 shaders, loosing 4 On app slightly negatively affected : Dota2 : Totals from 232 (14.73% of 1575) affected shaders: Instrs: 30029 -> 28194 (-6.11%) Cycles: 385155 -> 371422 (-3.57%); split: -3.59%, +0.02% Max live registers: 6792 -> 6780 (-0.18%) Max dispatch width: 2256 -> 2160 (-4.26%) Loosing 6 SIMD32 shaders Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24554>
2023-08-07 14:03:57 +03:00
fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
fs_reg src(UNIFORM, base_offset / 4, BRW_REGISTER_TYPE_UD);
src.offset = load_offset + base_offset % 4;
intel/fs: switch from SIMD 1 to 8 instructions surface/sampler rematerialization SIMD1 instructions are problematic because they are considered partial writes. This increases the liveness of the destination register written by those instructions. To workaround this we use UNDEF instructions to bound the liveness of the register. But this causing other issues like in this case : undef(1) vgrf2 mov(1) vgrf2, u4.0 add(1) vgrf3, vgrf2.0, 64UD In this case the copy propagation pass in unable to see that vgrf2 in the add() instruction can be replaced with the uniform u4.0. To fix this problem, we switch NoMask SIMD8 instructions that cover the entire register. We can drop the UNDEF instructions and now copy propagation can do its job. Good results on 2 apps : Cyberpunk 2077 : Totals from 7258 (68.80% of 10549) affected shaders: Instrs: 6332210 -> 6073833 (-4.08%); split: -4.11%, +0.03% Cycles: 130667501 -> 127351268 (-2.54%); split: -3.12%, +0.58% Subgroup size: 90320 -> 90400 (+0.09%) Spill count: 90 -> 68 (-24.44%) Fill count: 82 -> 64 (-21.95%) Scratch Memory Size: 8192 -> 6144 (-25.00%) Max live registers: 385464 -> 375152 (-2.68%) Max dispatch width: 64336 -> 64424 (+0.14%); split: +0.96%, -0.82% Gaining 60 SIMD16/SIMD32 shaders, loosing 33 Strange Brigade : Totals from 2137 (53.12% of 4023) affected shaders: Instrs: 1544031 -> 1457544 (-5.60%); split: -5.60%, +0.00% Cycles: 22292564 -> 21868978 (-1.90%); split: -2.43%, +0.53% Subgroup size: 25328 -> 25344 (+0.06%) Max live registers: 113716 -> 111214 (-2.20%) Max dispatch width: 17232 -> 18608 (+7.99%); split: +8.36%, -0.37% Gaining 138 SIMD16/SIMD32 shaders, loosing 4 On app slightly negatively affected : Dota2 : Totals from 232 (14.73% of 1575) affected shaders: Instrs: 30029 -> 28194 (-6.11%) Cycles: 385155 -> 371422 (-3.57%); split: -3.59%, +0.02% Max live registers: 6792 -> 6780 (-0.18%) Max dispatch width: 2256 -> 2160 (-4.26%) Loosing 6 SIMD32 shaders Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24554>
2023-08-07 14:03:57 +03:00
nir_resource_insts[def->index] = ubld8.MOV(dst, src);
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
break;
}
default:
break;
}
break;
}
default:
break;
}
if (nir_resource_insts[def->index] == NULL)
return fs_reg();
}
assert(nir_resource_insts[resource_def->index] != NULL);
return component(nir_resource_insts[resource_def->index]->dst, 0);
}
anv,i965: Lower away image derefs in the driver Previously, the back-end compiler turn image access into magic uniform reads and there was a complex contract between back-end compiler and driver about setting up and filling out those params. As of this commit, both drivers now lower image_deref_load_param_intel intrinsics to load_uniform intrinsics controlled by the driver and lower the other image_deref_* intrinsics to image_* intrinsics which take an actual binding table index. There are still "magic" uniforms but they are now added and controlled entirely by the driver and that contract no longer spans components. This also has the side-effect of making most image use compile-time binding table indices. Previously, all image access pulled the binding table index from a uniform. Part of the reason for this was that the magic uniforms made it difficult to decouple binding table indices from the uniforms and, since they are indexed completely differently (especially in Vulkan), it was hard to pull them apart. Now that the driver is handling both, it's trivial to decouple the two and provide actual binding table indices. Shader-db results on Kaby Lake: total instructions in shared programs: 15166872 -> 15164293 (-0.02%) instructions in affected programs: 115834 -> 113255 (-2.23%) helped: 191 HURT: 0 total cycles in shared programs: 571311495 -> 571196465 (-0.02%) cycles in affected programs: 4757115 -> 4642085 (-2.42%) helped: 73 HURT: 67 total spills in shared programs: 10951 -> 10926 (-0.23%) spills in affected programs: 742 -> 717 (-3.37%) helped: 7 HURT: 0 total fills in shared programs: 22226 -> 22201 (-0.11%) fills in affected programs: 1146 -> 1121 (-2.18%) helped: 7 HURT: 0 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-08-16 16:23:10 -05:00
fs_reg
fs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld,
nir_intrinsic_instr *instr)
{
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
if (is_resource_src(instr->src[0])) {
fs_reg surf_index = get_resource_nir_src(instr->src[0]);
if (surf_index.file != BAD_FILE)
return surf_index;
}
anv,i965: Lower away image derefs in the driver Previously, the back-end compiler turn image access into magic uniform reads and there was a complex contract between back-end compiler and driver about setting up and filling out those params. As of this commit, both drivers now lower image_deref_load_param_intel intrinsics to load_uniform intrinsics controlled by the driver and lower the other image_deref_* intrinsics to image_* intrinsics which take an actual binding table index. There are still "magic" uniforms but they are now added and controlled entirely by the driver and that contract no longer spans components. This also has the side-effect of making most image use compile-time binding table indices. Previously, all image access pulled the binding table index from a uniform. Part of the reason for this was that the magic uniforms made it difficult to decouple binding table indices from the uniforms and, since they are indexed completely differently (especially in Vulkan), it was hard to pull them apart. Now that the driver is handling both, it's trivial to decouple the two and provide actual binding table indices. Shader-db results on Kaby Lake: total instructions in shared programs: 15166872 -> 15164293 (-0.02%) instructions in affected programs: 115834 -> 113255 (-2.23%) helped: 191 HURT: 0 total cycles in shared programs: 571311495 -> 571196465 (-0.02%) cycles in affected programs: 4757115 -> 4642085 (-2.42%) helped: 73 HURT: 67 total spills in shared programs: 10951 -> 10926 (-0.23%) spills in affected programs: 742 -> 717 (-3.37%) helped: 7 HURT: 0 total fills in shared programs: 22226 -> 22201 (-0.11%) fills in affected programs: 1146 -> 1121 (-2.18%) helped: 7 HURT: 0 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-08-16 16:23:10 -05:00
fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD);
fs_reg surf_index = image;
anv,i965: Lower away image derefs in the driver Previously, the back-end compiler turn image access into magic uniform reads and there was a complex contract between back-end compiler and driver about setting up and filling out those params. As of this commit, both drivers now lower image_deref_load_param_intel intrinsics to load_uniform intrinsics controlled by the driver and lower the other image_deref_* intrinsics to image_* intrinsics which take an actual binding table index. There are still "magic" uniforms but they are now added and controlled entirely by the driver and that contract no longer spans components. This also has the side-effect of making most image use compile-time binding table indices. Previously, all image access pulled the binding table index from a uniform. Part of the reason for this was that the magic uniforms made it difficult to decouple binding table indices from the uniforms and, since they are indexed completely differently (especially in Vulkan), it was hard to pull them apart. Now that the driver is handling both, it's trivial to decouple the two and provide actual binding table indices. Shader-db results on Kaby Lake: total instructions in shared programs: 15166872 -> 15164293 (-0.02%) instructions in affected programs: 115834 -> 113255 (-2.23%) helped: 191 HURT: 0 total cycles in shared programs: 571311495 -> 571196465 (-0.02%) cycles in affected programs: 4757115 -> 4642085 (-2.42%) helped: 73 HURT: 67 total spills in shared programs: 10951 -> 10926 (-0.23%) spills in affected programs: 742 -> 717 (-3.37%) helped: 7 HURT: 0 total fills in shared programs: 22226 -> 22201 (-0.11%) fills in affected programs: 1146 -> 1121 (-2.18%) helped: 7 HURT: 0 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-08-16 16:23:10 -05:00
return bld.emit_uniformize(surf_index);
anv,i965: Lower away image derefs in the driver Previously, the back-end compiler turn image access into magic uniform reads and there was a complex contract between back-end compiler and driver about setting up and filling out those params. As of this commit, both drivers now lower image_deref_load_param_intel intrinsics to load_uniform intrinsics controlled by the driver and lower the other image_deref_* intrinsics to image_* intrinsics which take an actual binding table index. There are still "magic" uniforms but they are now added and controlled entirely by the driver and that contract no longer spans components. This also has the side-effect of making most image use compile-time binding table indices. Previously, all image access pulled the binding table index from a uniform. Part of the reason for this was that the magic uniforms made it difficult to decouple binding table indices from the uniforms and, since they are indexed completely differently (especially in Vulkan), it was hard to pull them apart. Now that the driver is handling both, it's trivial to decouple the two and provide actual binding table indices. Shader-db results on Kaby Lake: total instructions in shared programs: 15166872 -> 15164293 (-0.02%) instructions in affected programs: 115834 -> 113255 (-2.23%) helped: 191 HURT: 0 total cycles in shared programs: 571311495 -> 571196465 (-0.02%) cycles in affected programs: 4757115 -> 4642085 (-2.42%) helped: 73 HURT: 67 total spills in shared programs: 10951 -> 10926 (-0.23%) spills in affected programs: 742 -> 717 (-3.37%) helped: 7 HURT: 0 total fills in shared programs: 22226 -> 22201 (-0.11%) fills in affected programs: 1146 -> 1121 (-2.18%) helped: 7 HURT: 0 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-08-16 16:23:10 -05:00
}
fs_reg
fs_visitor::get_nir_buffer_intrinsic_index(const brw::fs_builder &bld,
nir_intrinsic_instr *instr)
{
/* SSBO stores are weird in that their index is in src[1] */
const bool is_store =
instr->intrinsic == nir_intrinsic_store_ssbo ||
instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
nir_src src = is_store ? instr->src[1] : instr->src[0];
if (nir_src_is_const(src)) {
return brw_imm_ud(nir_src_as_uint(src));
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
} else if (is_resource_src(src)) {
fs_reg surf_index = get_resource_nir_src(src);
if (surf_index.file != BAD_FILE)
return surf_index;
}
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
return bld.emit_uniformize(get_nir_src(src));
}
/**
* The offsets we get from NIR act as if each SIMD channel has it's own blob
* of contiguous space. However, if we actually place each SIMD channel in
* it's own space, we end up with terrible cache performance because each SIMD
* channel accesses a different cache line even when they're all accessing the
* same byte offset. To deal with this problem, we swizzle the address using
* a simple algorithm which ensures that any time a SIMD message reads or
* writes the same address, it's all in the same cache line. We have to keep
* the bottom two bits fixed so that we can read/write up to a dword at a time
* and the individual element is contiguous. We do this by splitting the
* address as follows:
*
* 31 4-6 2 0
* +-------------------------------+------------+----------+
* | Hi address bits | chan index | addr low |
* +-------------------------------+------------+----------+
*
* In other words, the bottom two address bits stay, and the top 30 get
* shifted up so that we can stick the SIMD channel index in the middle. This
* way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit
* at the same logical offset, the scratch read/write instruction acts on
* continuous elements and we get good cache locality.
*/
fs_reg
fs_visitor::swizzle_nir_scratch_addr(const brw::fs_builder &bld,
const fs_reg &nir_addr,
bool in_dwords)
{
const fs_reg &chan_index =
nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
const unsigned chan_index_bits = ffs(dispatch_width) - 1;
fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD);
if (in_dwords) {
/* In this case, we know the address is aligned to a DWORD and we want
* the final address in DWORDs.
*/
bld.SHL(addr, nir_addr, brw_imm_ud(chan_index_bits - 2));
bld.OR(addr, addr, chan_index);
} else {
/* This case substantially more annoying because we have to pay
* attention to those pesky two bottom bits.
*/
fs_reg addr_hi = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.AND(addr_hi, nir_addr, brw_imm_ud(~0x3u));
bld.SHL(addr_hi, addr_hi, brw_imm_ud(chan_index_bits));
fs_reg chan_addr = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.SHL(chan_addr, chan_index, brw_imm_ud(2));
bld.AND(addr, nir_addr, brw_imm_ud(0x3u));
bld.OR(addr, addr, addr_hi);
bld.OR(addr, addr, chan_addr);
}
return addr;
}
static unsigned
choose_oword_block_size_dwords(const struct intel_device_info *devinfo,
unsigned dwords)
{
unsigned block;
if (devinfo->has_lsc && dwords >= 64) {
block = 64;
} else if (dwords >= 32) {
block = 32;
} else if (dwords >= 16) {
block = 16;
} else {
block = 8;
}
assert(block <= dwords);
return block;
}
static void
increment_a64_address(const fs_builder &bld, fs_reg address, uint32_t v)
{
if (bld.shader->devinfo->has_64bit_int) {
bld.ADD(address, address, brw_imm_ud(v));
} else {
fs_reg low = retype(address, BRW_REGISTER_TYPE_UD);
fs_reg high = offset(low, bld, 1);
/* Add low and if that overflows, add carry to high. */
bld.ADD(low, low, brw_imm_ud(v))->conditional_mod = BRW_CONDITIONAL_O;
bld.ADD(high, high, brw_imm_ud(0x1))->predicate = BRW_PREDICATE_NORMAL;
}
}
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
static fs_reg
emit_fence(const fs_builder &bld, enum opcode opcode,
uint8_t sfid, uint32_t desc,
bool commit_enable, uint8_t bti)
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
{
assert(opcode == SHADER_OPCODE_INTERLOCK ||
opcode == SHADER_OPCODE_MEMORY_FENCE);
fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
fs_inst *fence = bld.emit(opcode, dst, brw_vec8_grf(0, 0),
brw_imm_ud(commit_enable),
brw_imm_ud(bti));
fence->sfid = sfid;
fence->desc = desc;
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
return dst;
}
static uint32_t
lsc_fence_descriptor_for_intrinsic(const struct intel_device_info *devinfo,
nir_intrinsic_instr *instr)
{
assert(devinfo->has_lsc);
enum lsc_fence_scope scope = LSC_FENCE_LOCAL;
enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE;
if (nir_intrinsic_has_memory_scope(instr)) {
switch (nir_intrinsic_memory_scope(instr)) {
case SCOPE_DEVICE:
case SCOPE_QUEUE_FAMILY:
scope = LSC_FENCE_TILE;
flush_type = LSC_FLUSH_TYPE_EVICT;
break;
case SCOPE_WORKGROUP:
scope = LSC_FENCE_THREADGROUP;
break;
case SCOPE_SHADER_CALL:
case SCOPE_INVOCATION:
case SCOPE_SUBGROUP:
case SCOPE_NONE:
break;
}
} else {
/* No scope defined. */
scope = LSC_FENCE_TILE;
flush_type = LSC_FLUSH_TYPE_EVICT;
}
return lsc_fence_msg_desc(devinfo, scope, flush_type, true);
}
void
fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
{
/* We handle this as a special case */
if (instr->intrinsic == nir_intrinsic_decl_reg) {
assert(nir_intrinsic_num_array_elems(instr) == 0);
unsigned bit_size = nir_intrinsic_bit_size(instr);
unsigned num_components = nir_intrinsic_num_components(instr);
const brw_reg_type reg_type =
brw_reg_type_from_bit_size(bit_size, bit_size == 8 ?
BRW_REGISTER_TYPE_D :
BRW_REGISTER_TYPE_F);
/* Re-use the destination's slot in the table for the register */
nir_ssa_values[instr->def.index] =
bld.vgrf(reg_type, num_components);
return;
}
fs_reg dest;
if (nir_intrinsic_infos[instr->intrinsic].has_dest)
dest = get_nir_def(instr->def);
switch (instr->intrinsic) {
case nir_intrinsic_resource_intel:
nir_ssa_bind_infos[instr->def.index].valid = true;
nir_ssa_bind_infos[instr->def.index].bindless =
(nir_intrinsic_resource_access_intel(instr) &
nir_resource_intel_bindless) != 0;
nir_ssa_bind_infos[instr->def.index].block =
nir_intrinsic_resource_block_intel(instr);
nir_ssa_bind_infos[instr->def.index].set =
nir_intrinsic_desc_set(instr);
nir_ssa_bind_infos[instr->def.index].binding =
nir_intrinsic_binding(instr);
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
if (nir_intrinsic_resource_access_intel(instr) &
nir_resource_intel_non_uniform) {
nir_resource_values[instr->def.index] = fs_reg();
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
} else {
nir_resource_values[instr->def.index] =
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
try_rebuild_resource(bld, instr->src[1].ssa);
}
nir_ssa_values[instr->def.index] =
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
nir_ssa_values[instr->src[1].ssa->index];
break;
case nir_intrinsic_load_reg:
case nir_intrinsic_store_reg:
/* Nothing to do with these. */
break;
anv,i965: Lower away image derefs in the driver Previously, the back-end compiler turn image access into magic uniform reads and there was a complex contract between back-end compiler and driver about setting up and filling out those params. As of this commit, both drivers now lower image_deref_load_param_intel intrinsics to load_uniform intrinsics controlled by the driver and lower the other image_deref_* intrinsics to image_* intrinsics which take an actual binding table index. There are still "magic" uniforms but they are now added and controlled entirely by the driver and that contract no longer spans components. This also has the side-effect of making most image use compile-time binding table indices. Previously, all image access pulled the binding table index from a uniform. Part of the reason for this was that the magic uniforms made it difficult to decouple binding table indices from the uniforms and, since they are indexed completely differently (especially in Vulkan), it was hard to pull them apart. Now that the driver is handling both, it's trivial to decouple the two and provide actual binding table indices. Shader-db results on Kaby Lake: total instructions in shared programs: 15166872 -> 15164293 (-0.02%) instructions in affected programs: 115834 -> 113255 (-2.23%) helped: 191 HURT: 0 total cycles in shared programs: 571311495 -> 571196465 (-0.02%) cycles in affected programs: 4757115 -> 4642085 (-2.42%) helped: 73 HURT: 67 total spills in shared programs: 10951 -> 10926 (-0.23%) spills in affected programs: 742 -> 717 (-3.37%) helped: 7 HURT: 0 total fills in shared programs: 22226 -> 22201 (-0.11%) fills in affected programs: 1146 -> 1121 (-2.18%) helped: 7 HURT: 0 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-08-16 16:23:10 -05:00
case nir_intrinsic_image_load:
case nir_intrinsic_image_store:
case nir_intrinsic_image_atomic:
case nir_intrinsic_image_atomic_swap:
case nir_intrinsic_bindless_image_load:
case nir_intrinsic_bindless_image_store:
case nir_intrinsic_bindless_image_atomic:
case nir_intrinsic_bindless_image_atomic_swap: {
/* Get some metadata from the image intrinsic. */
const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
switch (instr->intrinsic) {
case nir_intrinsic_image_load:
case nir_intrinsic_image_store:
case nir_intrinsic_image_atomic:
case nir_intrinsic_image_atomic_swap:
srcs[SURFACE_LOGICAL_SRC_SURFACE] =
get_nir_image_intrinsic_image(bld, instr);
break;
default:
/* Bindless */
srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] =
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
get_nir_image_intrinsic_image(bld, instr);
break;
}
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] =
brw_imm_ud(nir_image_intrinsic_coord_components(instr));
/* Emit an image load, store or atomic op. */
if (instr->intrinsic == nir_intrinsic_image_load ||
instr->intrinsic == nir_intrinsic_bindless_image_load) {
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
fs_inst *inst =
bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
inst->size_written = instr->num_components * dispatch_width * 4;
} else if (instr->intrinsic == nir_intrinsic_image_store ||
instr->intrinsic == nir_intrinsic_bindless_image_store) {
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
} else {
2018-08-24 17:41:01 -07:00
unsigned num_srcs = info->num_srcs;
int op = lsc_aop_for_nir_intrinsic(instr);
if (op == LSC_OP_ATOMIC_INC || op == LSC_OP_ATOMIC_DEC) {
2018-08-24 17:41:01 -07:00
assert(num_srcs == 4);
num_srcs = 3;
}
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
fs_reg data;
if (num_srcs >= 4)
data = get_nir_src(instr->src[3]);
if (num_srcs >= 5) {
fs_reg tmp = bld.vgrf(data.type, 2);
fs_reg sources[2] = { data, get_nir_src(instr->src[4]) };
bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
data = tmp;
}
srcs[SURFACE_LOGICAL_SRC_DATA] = data;
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
}
break;
}
case nir_intrinsic_image_size:
case nir_intrinsic_bindless_image_size: {
/* Cube image sizes should have previously been lowered to a 2D array */
assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE);
/* Unlike the [un]typed load and store opcodes, the TXS that this turns
* into will handle the binding table index for us in the geneerator.
* Incidentally, this means that we can handle bindless with exactly the
* same code.
*/
fs_reg image = retype(get_nir_src_imm(instr->src[0]),
BRW_REGISTER_TYPE_UD);
image = bld.emit_uniformize(image);
assert(nir_src_as_uint(instr->src[1]) == 0);
fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
if (instr->intrinsic == nir_intrinsic_image_size)
srcs[TEX_LOGICAL_SRC_SURFACE] = image;
else
srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;
srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0);
srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_d(0);
/* Since the image size is always uniform, we can just emit a SIMD8
* query instruction and splat the result out.
*/
const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL,
tmp, srcs, ARRAY_SIZE(srcs));
inst->size_written = 4 * REG_SIZE;
for (unsigned c = 0; c < instr->def.num_components; ++c) {
bld.MOV(offset(retype(dest, tmp.type), bld, c),
component(offset(tmp, ubld, c), 0));
}
break;
}
anv,i965: Lower away image derefs in the driver Previously, the back-end compiler turn image access into magic uniform reads and there was a complex contract between back-end compiler and driver about setting up and filling out those params. As of this commit, both drivers now lower image_deref_load_param_intel intrinsics to load_uniform intrinsics controlled by the driver and lower the other image_deref_* intrinsics to image_* intrinsics which take an actual binding table index. There are still "magic" uniforms but they are now added and controlled entirely by the driver and that contract no longer spans components. This also has the side-effect of making most image use compile-time binding table indices. Previously, all image access pulled the binding table index from a uniform. Part of the reason for this was that the magic uniforms made it difficult to decouple binding table indices from the uniforms and, since they are indexed completely differently (especially in Vulkan), it was hard to pull them apart. Now that the driver is handling both, it's trivial to decouple the two and provide actual binding table indices. Shader-db results on Kaby Lake: total instructions in shared programs: 15166872 -> 15164293 (-0.02%) instructions in affected programs: 115834 -> 113255 (-2.23%) helped: 191 HURT: 0 total cycles in shared programs: 571311495 -> 571196465 (-0.02%) cycles in affected programs: 4757115 -> 4642085 (-2.42%) helped: 73 HURT: 67 total spills in shared programs: 10951 -> 10926 (-0.23%) spills in affected programs: 742 -> 717 (-3.37%) helped: 7 HURT: 0 total fills in shared programs: 22226 -> 22201 (-0.11%) fills in affected programs: 1146 -> 1121 (-2.18%) helped: 7 HURT: 0 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-08-16 16:23:10 -05:00
case nir_intrinsic_image_load_raw_intel: {
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
srcs[SURFACE_LOGICAL_SRC_SURFACE] =
get_nir_image_intrinsic_image(bld, instr);
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
fs_inst *inst =
bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
inst->size_written = instr->num_components * dispatch_width * 4;
break;
}
anv,i965: Lower away image derefs in the driver Previously, the back-end compiler turn image access into magic uniform reads and there was a complex contract between back-end compiler and driver about setting up and filling out those params. As of this commit, both drivers now lower image_deref_load_param_intel intrinsics to load_uniform intrinsics controlled by the driver and lower the other image_deref_* intrinsics to image_* intrinsics which take an actual binding table index. There are still "magic" uniforms but they are now added and controlled entirely by the driver and that contract no longer spans components. This also has the side-effect of making most image use compile-time binding table indices. Previously, all image access pulled the binding table index from a uniform. Part of the reason for this was that the magic uniforms made it difficult to decouple binding table indices from the uniforms and, since they are indexed completely differently (especially in Vulkan), it was hard to pull them apart. Now that the driver is handling both, it's trivial to decouple the two and provide actual binding table indices. Shader-db results on Kaby Lake: total instructions in shared programs: 15166872 -> 15164293 (-0.02%) instructions in affected programs: 115834 -> 113255 (-2.23%) helped: 191 HURT: 0 total cycles in shared programs: 571311495 -> 571196465 (-0.02%) cycles in affected programs: 4757115 -> 4642085 (-2.42%) helped: 73 HURT: 67 total spills in shared programs: 10951 -> 10926 (-0.23%) spills in affected programs: 742 -> 717 (-3.37%) helped: 7 HURT: 0 total fills in shared programs: 22226 -> 22201 (-0.11%) fills in affected programs: 1146 -> 1121 (-2.18%) helped: 7 HURT: 0 Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2018-08-16 16:23:10 -05:00
case nir_intrinsic_image_store_raw_intel: {
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
srcs[SURFACE_LOGICAL_SRC_SURFACE] =
get_nir_image_intrinsic_image(bld, instr);
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[2]);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
break;
}
case nir_intrinsic_barrier:
intel/fs,vec4: Pull stall logic for memory fences up into the IR Instead of emitting the stall MOV "inside" the SHADER_OPCODE_MEMORY_FENCE generation, use the scheduling fences when creating the IR. For IvyBridge, every (data cache) fence is accompained by a render cache fence, that now is explicit in the IR, two SHADER_OPCODE_MEMORY_FENCEs are emitted (with different SFIDs). Because Begin and End interlock intrinsics are effectively memory barriers, move its handling alongside the other memory barrier intrinsics. The SHADER_OPCODE_INTERLOCK is still used to distinguish if we are going to use a SENDC (for Begin) or regular SEND (for End). This change is a preparation to allow emitting both SENDs in Gen11+ before we can stall on them. Shader-db results for IVB (i965): total instructions in shared programs: 11971190 -> 11971200 (<.01%) instructions in affected programs: 11482 -> 11492 (0.09%) helped: 0 HURT: 8 HURT stats (abs) min: 1 max: 3 x̄: 1.25 x̃: 1 HURT stats (rel) min: 0.03% max: 0.50% x̄: 0.14% x̃: 0.10% 95% mean confidence interval for instructions value: 0.66 1.84 95% mean confidence interval for instructions %-change: 0.01% 0.27% Instructions are HURT. Unlike the previous code, that used the `mov g1 g2` trick to force both `g1` and `g2` to stall, the scheduling fence will generate `mov null g1` and `mov null g2`. During review it was decided it was not worth keeping the special codepath for the small effect will have. Shader-db results for HSW (i965), BDW and SKL don't have a change on instruction count, but do report changes in cycles count, showing SKL results below total cycles in shared programs: 341738444 -> 341710570 (<.01%) cycles in affected programs: 7240002 -> 7212128 (-0.38%) helped: 46 HURT: 5 helped stats (abs) min: 14 max: 1940 x̄: 676.22 x̃: 154 helped stats (rel) min: <.01% max: 2.62% x̄: 1.28% x̃: 0.95% HURT stats (abs) min: 2 max: 1768 x̄: 646.40 x̃: 362 HURT stats (rel) min: <.01% max: 0.83% x̄: 0.28% x̃: 0.08% 95% mean confidence interval for cycles value: -777.71 -315.38 95% mean confidence interval for cycles %-change: -1.42% -0.83% Cycles are helped. This seems to be the effect of allocating two registers separatedly instead of a single one with size 2, which causes different register allocation, affecting the cycle estimates. while ICL also has not change on instruction count but report changes negative changes in cycles total cycles in shared programs: 352665369 -> 352707484 (0.01%) cycles in affected programs: 9608288 -> 9650403 (0.44%) helped: 4 HURT: 104 helped stats (abs) min: 24 max: 128 x̄: 88.50 x̃: 101 helped stats (rel) min: <.01% max: 0.85% x̄: 0.46% x̃: 0.49% HURT stats (abs) min: 2 max: 2016 x̄: 408.36 x̃: 48 HURT stats (rel) min: <.01% max: 3.31% x̄: 0.88% x̃: 0.45% 95% mean confidence interval for cycles value: 256.67 523.24 95% mean confidence interval for cycles %-change: 0.63% 1.03% Cycles are HURT. AFAICT this is the result of the case above. Shader-db results for TGL have similar cycles result as ICL, but also affect instructions total instructions in shared programs: 17690586 -> 17690597 (<.01%) instructions in affected programs: 64617 -> 64628 (0.02%) helped: 55 HURT: 32 helped stats (abs) min: 1 max: 16 x̄: 4.13 x̃: 3 helped stats (rel) min: 0.05% max: 2.78% x̄: 0.86% x̃: 0.74% HURT stats (abs) min: 1 max: 65 x̄: 7.44 x̃: 2 HURT stats (rel) min: 0.05% max: 4.58% x̄: 1.13% x̃: 0.69% 95% mean confidence interval for instructions value: -2.03 2.28 95% mean confidence interval for instructions %-change: -0.41% 0.15% Inconclusive result (value mean confidence interval includes 0). Now that more is done in the IR, more dependencies are visible and more SWSB annotations are emitted. Mixed with different register allocation decisions like above, some shaders will see more `sync nops` while others able to avoid them. Most of the new `sync nops` are also redundant and could be dropped, which will be fixed in a separate change. Reviewed-by: Francisco Jerez <currojerez@riseup.net> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3278>
2020-01-17 15:07:44 -08:00
case nir_intrinsic_begin_invocation_interlock:
case nir_intrinsic_end_invocation_interlock: {
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
bool ugm_fence, slm_fence, tgm_fence, urb_fence;
enum opcode opcode = BRW_OPCODE_NOP;
/* Handling interlock intrinsics here will allow the logic for IVB
* render cache (see below) to be reused.
*/
intel/fs,vec4: Pull stall logic for memory fences up into the IR Instead of emitting the stall MOV "inside" the SHADER_OPCODE_MEMORY_FENCE generation, use the scheduling fences when creating the IR. For IvyBridge, every (data cache) fence is accompained by a render cache fence, that now is explicit in the IR, two SHADER_OPCODE_MEMORY_FENCEs are emitted (with different SFIDs). Because Begin and End interlock intrinsics are effectively memory barriers, move its handling alongside the other memory barrier intrinsics. The SHADER_OPCODE_INTERLOCK is still used to distinguish if we are going to use a SENDC (for Begin) or regular SEND (for End). This change is a preparation to allow emitting both SENDs in Gen11+ before we can stall on them. Shader-db results for IVB (i965): total instructions in shared programs: 11971190 -> 11971200 (<.01%) instructions in affected programs: 11482 -> 11492 (0.09%) helped: 0 HURT: 8 HURT stats (abs) min: 1 max: 3 x̄: 1.25 x̃: 1 HURT stats (rel) min: 0.03% max: 0.50% x̄: 0.14% x̃: 0.10% 95% mean confidence interval for instructions value: 0.66 1.84 95% mean confidence interval for instructions %-change: 0.01% 0.27% Instructions are HURT. Unlike the previous code, that used the `mov g1 g2` trick to force both `g1` and `g2` to stall, the scheduling fence will generate `mov null g1` and `mov null g2`. During review it was decided it was not worth keeping the special codepath for the small effect will have. Shader-db results for HSW (i965), BDW and SKL don't have a change on instruction count, but do report changes in cycles count, showing SKL results below total cycles in shared programs: 341738444 -> 341710570 (<.01%) cycles in affected programs: 7240002 -> 7212128 (-0.38%) helped: 46 HURT: 5 helped stats (abs) min: 14 max: 1940 x̄: 676.22 x̃: 154 helped stats (rel) min: <.01% max: 2.62% x̄: 1.28% x̃: 0.95% HURT stats (abs) min: 2 max: 1768 x̄: 646.40 x̃: 362 HURT stats (rel) min: <.01% max: 0.83% x̄: 0.28% x̃: 0.08% 95% mean confidence interval for cycles value: -777.71 -315.38 95% mean confidence interval for cycles %-change: -1.42% -0.83% Cycles are helped. This seems to be the effect of allocating two registers separatedly instead of a single one with size 2, which causes different register allocation, affecting the cycle estimates. while ICL also has not change on instruction count but report changes negative changes in cycles total cycles in shared programs: 352665369 -> 352707484 (0.01%) cycles in affected programs: 9608288 -> 9650403 (0.44%) helped: 4 HURT: 104 helped stats (abs) min: 24 max: 128 x̄: 88.50 x̃: 101 helped stats (rel) min: <.01% max: 0.85% x̄: 0.46% x̃: 0.49% HURT stats (abs) min: 2 max: 2016 x̄: 408.36 x̃: 48 HURT stats (rel) min: <.01% max: 3.31% x̄: 0.88% x̃: 0.45% 95% mean confidence interval for cycles value: 256.67 523.24 95% mean confidence interval for cycles %-change: 0.63% 1.03% Cycles are HURT. AFAICT this is the result of the case above. Shader-db results for TGL have similar cycles result as ICL, but also affect instructions total instructions in shared programs: 17690586 -> 17690597 (<.01%) instructions in affected programs: 64617 -> 64628 (0.02%) helped: 55 HURT: 32 helped stats (abs) min: 1 max: 16 x̄: 4.13 x̃: 3 helped stats (rel) min: 0.05% max: 2.78% x̄: 0.86% x̃: 0.74% HURT stats (abs) min: 1 max: 65 x̄: 7.44 x̃: 2 HURT stats (rel) min: 0.05% max: 4.58% x̄: 1.13% x̃: 0.69% 95% mean confidence interval for instructions value: -2.03 2.28 95% mean confidence interval for instructions %-change: -0.41% 0.15% Inconclusive result (value mean confidence interval includes 0). Now that more is done in the IR, more dependencies are visible and more SWSB annotations are emitted. Mixed with different register allocation decisions like above, some shaders will see more `sync nops` while others able to avoid them. Most of the new `sync nops` are also redundant and could be dropped, which will be fixed in a separate change. Reviewed-by: Francisco Jerez <currojerez@riseup.net> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3278>
2020-01-17 15:07:44 -08:00
switch (instr->intrinsic) {
case nir_intrinsic_barrier: {
/* Note we only care about the memory part of the
* barrier. The execution part will be taken care
* of by the stage specific intrinsic handler functions.
*/
nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global);
slm_fence = modes & nir_var_mem_shared;
tgm_fence = modes & nir_var_image;
urb_fence = modes & (nir_var_shader_out | nir_var_mem_task_payload);
if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
opcode = SHADER_OPCODE_MEMORY_FENCE;
intel/fs,vec4: Pull stall logic for memory fences up into the IR Instead of emitting the stall MOV "inside" the SHADER_OPCODE_MEMORY_FENCE generation, use the scheduling fences when creating the IR. For IvyBridge, every (data cache) fence is accompained by a render cache fence, that now is explicit in the IR, two SHADER_OPCODE_MEMORY_FENCEs are emitted (with different SFIDs). Because Begin and End interlock intrinsics are effectively memory barriers, move its handling alongside the other memory barrier intrinsics. The SHADER_OPCODE_INTERLOCK is still used to distinguish if we are going to use a SENDC (for Begin) or regular SEND (for End). This change is a preparation to allow emitting both SENDs in Gen11+ before we can stall on them. Shader-db results for IVB (i965): total instructions in shared programs: 11971190 -> 11971200 (<.01%) instructions in affected programs: 11482 -> 11492 (0.09%) helped: 0 HURT: 8 HURT stats (abs) min: 1 max: 3 x̄: 1.25 x̃: 1 HURT stats (rel) min: 0.03% max: 0.50% x̄: 0.14% x̃: 0.10% 95% mean confidence interval for instructions value: 0.66 1.84 95% mean confidence interval for instructions %-change: 0.01% 0.27% Instructions are HURT. Unlike the previous code, that used the `mov g1 g2` trick to force both `g1` and `g2` to stall, the scheduling fence will generate `mov null g1` and `mov null g2`. During review it was decided it was not worth keeping the special codepath for the small effect will have. Shader-db results for HSW (i965), BDW and SKL don't have a change on instruction count, but do report changes in cycles count, showing SKL results below total cycles in shared programs: 341738444 -> 341710570 (<.01%) cycles in affected programs: 7240002 -> 7212128 (-0.38%) helped: 46 HURT: 5 helped stats (abs) min: 14 max: 1940 x̄: 676.22 x̃: 154 helped stats (rel) min: <.01% max: 2.62% x̄: 1.28% x̃: 0.95% HURT stats (abs) min: 2 max: 1768 x̄: 646.40 x̃: 362 HURT stats (rel) min: <.01% max: 0.83% x̄: 0.28% x̃: 0.08% 95% mean confidence interval for cycles value: -777.71 -315.38 95% mean confidence interval for cycles %-change: -1.42% -0.83% Cycles are helped. This seems to be the effect of allocating two registers separatedly instead of a single one with size 2, which causes different register allocation, affecting the cycle estimates. while ICL also has not change on instruction count but report changes negative changes in cycles total cycles in shared programs: 352665369 -> 352707484 (0.01%) cycles in affected programs: 9608288 -> 9650403 (0.44%) helped: 4 HURT: 104 helped stats (abs) min: 24 max: 128 x̄: 88.50 x̃: 101 helped stats (rel) min: <.01% max: 0.85% x̄: 0.46% x̃: 0.49% HURT stats (abs) min: 2 max: 2016 x̄: 408.36 x̃: 48 HURT stats (rel) min: <.01% max: 3.31% x̄: 0.88% x̃: 0.45% 95% mean confidence interval for cycles value: 256.67 523.24 95% mean confidence interval for cycles %-change: 0.63% 1.03% Cycles are HURT. AFAICT this is the result of the case above. Shader-db results for TGL have similar cycles result as ICL, but also affect instructions total instructions in shared programs: 17690586 -> 17690597 (<.01%) instructions in affected programs: 64617 -> 64628 (0.02%) helped: 55 HURT: 32 helped stats (abs) min: 1 max: 16 x̄: 4.13 x̃: 3 helped stats (rel) min: 0.05% max: 2.78% x̄: 0.86% x̃: 0.74% HURT stats (abs) min: 1 max: 65 x̄: 7.44 x̃: 2 HURT stats (rel) min: 0.05% max: 4.58% x̄: 1.13% x̃: 0.69% 95% mean confidence interval for instructions value: -2.03 2.28 95% mean confidence interval for instructions %-change: -0.41% 0.15% Inconclusive result (value mean confidence interval includes 0). Now that more is done in the IR, more dependencies are visible and more SWSB annotations are emitted. Mixed with different register allocation decisions like above, some shaders will see more `sync nops` while others able to avoid them. Most of the new `sync nops` are also redundant and could be dropped, which will be fixed in a separate change. Reviewed-by: Francisco Jerez <currojerez@riseup.net> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3278>
2020-01-17 15:07:44 -08:00
break;
}
case nir_intrinsic_begin_invocation_interlock:
/* For beginInvocationInterlockARB(), we will generate a memory fence
* but with a different opcode so that generator can pick SENDC
* instead of SEND.
*/
assert(stage == MESA_SHADER_FRAGMENT);
ugm_fence = tgm_fence = true;
slm_fence = urb_fence = false;
opcode = SHADER_OPCODE_INTERLOCK;
break;
case nir_intrinsic_end_invocation_interlock:
/* For endInvocationInterlockARB(), we need to insert a memory fence which
intel/fs,vec4: Pull stall logic for memory fences up into the IR Instead of emitting the stall MOV "inside" the SHADER_OPCODE_MEMORY_FENCE generation, use the scheduling fences when creating the IR. For IvyBridge, every (data cache) fence is accompained by a render cache fence, that now is explicit in the IR, two SHADER_OPCODE_MEMORY_FENCEs are emitted (with different SFIDs). Because Begin and End interlock intrinsics are effectively memory barriers, move its handling alongside the other memory barrier intrinsics. The SHADER_OPCODE_INTERLOCK is still used to distinguish if we are going to use a SENDC (for Begin) or regular SEND (for End). This change is a preparation to allow emitting both SENDs in Gen11+ before we can stall on them. Shader-db results for IVB (i965): total instructions in shared programs: 11971190 -> 11971200 (<.01%) instructions in affected programs: 11482 -> 11492 (0.09%) helped: 0 HURT: 8 HURT stats (abs) min: 1 max: 3 x̄: 1.25 x̃: 1 HURT stats (rel) min: 0.03% max: 0.50% x̄: 0.14% x̃: 0.10% 95% mean confidence interval for instructions value: 0.66 1.84 95% mean confidence interval for instructions %-change: 0.01% 0.27% Instructions are HURT. Unlike the previous code, that used the `mov g1 g2` trick to force both `g1` and `g2` to stall, the scheduling fence will generate `mov null g1` and `mov null g2`. During review it was decided it was not worth keeping the special codepath for the small effect will have. Shader-db results for HSW (i965), BDW and SKL don't have a change on instruction count, but do report changes in cycles count, showing SKL results below total cycles in shared programs: 341738444 -> 341710570 (<.01%) cycles in affected programs: 7240002 -> 7212128 (-0.38%) helped: 46 HURT: 5 helped stats (abs) min: 14 max: 1940 x̄: 676.22 x̃: 154 helped stats (rel) min: <.01% max: 2.62% x̄: 1.28% x̃: 0.95% HURT stats (abs) min: 2 max: 1768 x̄: 646.40 x̃: 362 HURT stats (rel) min: <.01% max: 0.83% x̄: 0.28% x̃: 0.08% 95% mean confidence interval for cycles value: -777.71 -315.38 95% mean confidence interval for cycles %-change: -1.42% -0.83% Cycles are helped. This seems to be the effect of allocating two registers separatedly instead of a single one with size 2, which causes different register allocation, affecting the cycle estimates. while ICL also has not change on instruction count but report changes negative changes in cycles total cycles in shared programs: 352665369 -> 352707484 (0.01%) cycles in affected programs: 9608288 -> 9650403 (0.44%) helped: 4 HURT: 104 helped stats (abs) min: 24 max: 128 x̄: 88.50 x̃: 101 helped stats (rel) min: <.01% max: 0.85% x̄: 0.46% x̃: 0.49% HURT stats (abs) min: 2 max: 2016 x̄: 408.36 x̃: 48 HURT stats (rel) min: <.01% max: 3.31% x̄: 0.88% x̃: 0.45% 95% mean confidence interval for cycles value: 256.67 523.24 95% mean confidence interval for cycles %-change: 0.63% 1.03% Cycles are HURT. AFAICT this is the result of the case above. Shader-db results for TGL have similar cycles result as ICL, but also affect instructions total instructions in shared programs: 17690586 -> 17690597 (<.01%) instructions in affected programs: 64617 -> 64628 (0.02%) helped: 55 HURT: 32 helped stats (abs) min: 1 max: 16 x̄: 4.13 x̃: 3 helped stats (rel) min: 0.05% max: 2.78% x̄: 0.86% x̃: 0.74% HURT stats (abs) min: 1 max: 65 x̄: 7.44 x̃: 2 HURT stats (rel) min: 0.05% max: 4.58% x̄: 1.13% x̃: 0.69% 95% mean confidence interval for instructions value: -2.03 2.28 95% mean confidence interval for instructions %-change: -0.41% 0.15% Inconclusive result (value mean confidence interval includes 0). Now that more is done in the IR, more dependencies are visible and more SWSB annotations are emitted. Mixed with different register allocation decisions like above, some shaders will see more `sync nops` while others able to avoid them. Most of the new `sync nops` are also redundant and could be dropped, which will be fixed in a separate change. Reviewed-by: Francisco Jerez <currojerez@riseup.net> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3278>
2020-01-17 15:07:44 -08:00
* stalls in the shader until the memory transactions prior to that
* fence are complete. This ensures that the shader does not end before
* any writes from its critical section have landed. Otherwise, you can
* end up with a case where the next invocation on that pixel properly
* stalls for previous FS invocation on its pixel to complete but
* doesn't actually wait for the dataport memory transactions from that
* thread to land before submitting its own.
*/
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
assert(stage == MESA_SHADER_FRAGMENT);
ugm_fence = tgm_fence = true;
slm_fence = urb_fence = false;
opcode = SHADER_OPCODE_MEMORY_FENCE;
intel/fs,vec4: Pull stall logic for memory fences up into the IR Instead of emitting the stall MOV "inside" the SHADER_OPCODE_MEMORY_FENCE generation, use the scheduling fences when creating the IR. For IvyBridge, every (data cache) fence is accompained by a render cache fence, that now is explicit in the IR, two SHADER_OPCODE_MEMORY_FENCEs are emitted (with different SFIDs). Because Begin and End interlock intrinsics are effectively memory barriers, move its handling alongside the other memory barrier intrinsics. The SHADER_OPCODE_INTERLOCK is still used to distinguish if we are going to use a SENDC (for Begin) or regular SEND (for End). This change is a preparation to allow emitting both SENDs in Gen11+ before we can stall on them. Shader-db results for IVB (i965): total instructions in shared programs: 11971190 -> 11971200 (<.01%) instructions in affected programs: 11482 -> 11492 (0.09%) helped: 0 HURT: 8 HURT stats (abs) min: 1 max: 3 x̄: 1.25 x̃: 1 HURT stats (rel) min: 0.03% max: 0.50% x̄: 0.14% x̃: 0.10% 95% mean confidence interval for instructions value: 0.66 1.84 95% mean confidence interval for instructions %-change: 0.01% 0.27% Instructions are HURT. Unlike the previous code, that used the `mov g1 g2` trick to force both `g1` and `g2` to stall, the scheduling fence will generate `mov null g1` and `mov null g2`. During review it was decided it was not worth keeping the special codepath for the small effect will have. Shader-db results for HSW (i965), BDW and SKL don't have a change on instruction count, but do report changes in cycles count, showing SKL results below total cycles in shared programs: 341738444 -> 341710570 (<.01%) cycles in affected programs: 7240002 -> 7212128 (-0.38%) helped: 46 HURT: 5 helped stats (abs) min: 14 max: 1940 x̄: 676.22 x̃: 154 helped stats (rel) min: <.01% max: 2.62% x̄: 1.28% x̃: 0.95% HURT stats (abs) min: 2 max: 1768 x̄: 646.40 x̃: 362 HURT stats (rel) min: <.01% max: 0.83% x̄: 0.28% x̃: 0.08% 95% mean confidence interval for cycles value: -777.71 -315.38 95% mean confidence interval for cycles %-change: -1.42% -0.83% Cycles are helped. This seems to be the effect of allocating two registers separatedly instead of a single one with size 2, which causes different register allocation, affecting the cycle estimates. while ICL also has not change on instruction count but report changes negative changes in cycles total cycles in shared programs: 352665369 -> 352707484 (0.01%) cycles in affected programs: 9608288 -> 9650403 (0.44%) helped: 4 HURT: 104 helped stats (abs) min: 24 max: 128 x̄: 88.50 x̃: 101 helped stats (rel) min: <.01% max: 0.85% x̄: 0.46% x̃: 0.49% HURT stats (abs) min: 2 max: 2016 x̄: 408.36 x̃: 48 HURT stats (rel) min: <.01% max: 3.31% x̄: 0.88% x̃: 0.45% 95% mean confidence interval for cycles value: 256.67 523.24 95% mean confidence interval for cycles %-change: 0.63% 1.03% Cycles are HURT. AFAICT this is the result of the case above. Shader-db results for TGL have similar cycles result as ICL, but also affect instructions total instructions in shared programs: 17690586 -> 17690597 (<.01%) instructions in affected programs: 64617 -> 64628 (0.02%) helped: 55 HURT: 32 helped stats (abs) min: 1 max: 16 x̄: 4.13 x̃: 3 helped stats (rel) min: 0.05% max: 2.78% x̄: 0.86% x̃: 0.74% HURT stats (abs) min: 1 max: 65 x̄: 7.44 x̃: 2 HURT stats (rel) min: 0.05% max: 4.58% x̄: 1.13% x̃: 0.69% 95% mean confidence interval for instructions value: -2.03 2.28 95% mean confidence interval for instructions %-change: -0.41% 0.15% Inconclusive result (value mean confidence interval includes 0). Now that more is done in the IR, more dependencies are visible and more SWSB annotations are emitted. Mixed with different register allocation decisions like above, some shaders will see more `sync nops` while others able to avoid them. Most of the new `sync nops` are also redundant and could be dropped, which will be fixed in a separate change. Reviewed-by: Francisco Jerez <currojerez@riseup.net> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3278>
2020-01-17 15:07:44 -08:00
break;
default:
unreachable("invalid intrinsic");
intel/fs: Add support for SLM fence in Gen11 Gen11 SLM is not on L3 anymore, so now the hardware has two separate fences. Add a way to control which fence types to use. At this time, we don't have enough information in NIR to control the visibility of the memory being fenced, so for now be conservative and assume that fences will need a stall. With more information later we'll be able to reduce those. Fixes Vulkan CTS tests in ICL: dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.workgroup.guard_local.buffer.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_local.buffer.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_local.image.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.buffer.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.image.guard_nonlocal.workgroup.comp The whole set of supported tests in dEQP-VK.memory_model.* group should be passing in ICL now. v2: Pass BTI around instead of having an enum. (Jason) Emit two SHADER_OPCODE_MEMORY_FENCE instead of one that gets transformed into two. (Jason) List tests fixed. (Lionel) v3: For clarity, split the decision of which fences to emit from the emission code. (Jason) Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> Acked-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
2019-07-10 12:02:23 -07:00
}
if (opcode == BRW_OPCODE_NOP)
break;
if (nir->info.shared_size > 0) {
assert(gl_shader_stage_uses_workgroup(stage));
} else {
slm_fence = false;
}
/* If the workgroup fits in a single HW thread, the messages for SLM are
* processed in-order and the shader itself is already synchronized so
* the memory fence is not necessary.
*
* TODO: Check if applies for many HW threads sharing same Data Port.
*/
if (!nir->info.workgroup_size_variable &&
slm_fence && workgroup_size() <= dispatch_width)
slm_fence = false;
switch (stage) {
case MESA_SHADER_TESS_CTRL:
case MESA_SHADER_TASK:
case MESA_SHADER_MESH:
break;
default:
urb_fence = false;
break;
}
intel/fs: Add support for SLM fence in Gen11 Gen11 SLM is not on L3 anymore, so now the hardware has two separate fences. Add a way to control which fence types to use. At this time, we don't have enough information in NIR to control the visibility of the memory being fenced, so for now be conservative and assume that fences will need a stall. With more information later we'll be able to reduce those. Fixes Vulkan CTS tests in ICL: dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.workgroup.guard_local.buffer.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_local.buffer.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_local.image.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.buffer.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.image.guard_nonlocal.workgroup.comp The whole set of supported tests in dEQP-VK.memory_model.* group should be passing in ICL now. v2: Pass BTI around instead of having an enum. (Jason) Emit two SHADER_OPCODE_MEMORY_FENCE instead of one that gets transformed into two. (Jason) List tests fixed. (Lionel) v3: For clarity, split the decision of which fences to emit from the emission code. (Jason) Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> Acked-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
2019-07-10 12:02:23 -07:00
unsigned fence_regs_count = 0;
fs_reg fence_regs[4] = {};
const fs_builder ubld = bld.group(8, 0);
intel/fs: Add support for SLM fence in Gen11 Gen11 SLM is not on L3 anymore, so now the hardware has two separate fences. Add a way to control which fence types to use. At this time, we don't have enough information in NIR to control the visibility of the memory being fenced, so for now be conservative and assume that fences will need a stall. With more information later we'll be able to reduce those. Fixes Vulkan CTS tests in ICL: dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.workgroup.guard_local.buffer.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_local.buffer.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_local.image.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.buffer.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.image.guard_nonlocal.workgroup.comp The whole set of supported tests in dEQP-VK.memory_model.* group should be passing in ICL now. v2: Pass BTI around instead of having an enum. (Jason) Emit two SHADER_OPCODE_MEMORY_FENCE instead of one that gets transformed into two. (Jason) List tests fixed. (Lionel) v3: For clarity, split the decision of which fences to emit from the emission code. (Jason) Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> Acked-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
2019-07-10 12:02:23 -07:00
intel/fs/gfx12: Ensure that prior reads have executed before barrier with acquire semantics. This avoids a violation of the Vulkan memory model that was leading to intermittent failures of at least 8k test-cases of the Vulkan CTS (within the group dEQP-VK.memory_model.*) on TGL and DG2 platforms. In theory the issue may be reproducible on earlier platforms like IVB and ICL, but the SYNC.ALLWR instruction is not available on those platforms so a different (likely costlier) fix will be needed. The issue occurs within the sequence we emit for a NIR memory barrier with acquire semantics requiring the synchronization of multiple caches, e.g. in pseudocode for a barrier involving the TGM and UGM caches on DG2: x <- load.ugm // Atomic read sequenced-before the barrier y <- fence.ugm z <- fence.tgm wait(y, z) w <- load.tgm // Read sequenced-after the barrier In the example we must provide the guarantee that the memory load for x is completed before the one for w, however this ordering can be reversed with the intervention of a concurrent thread, since the UGM fence will block on the prior UGM load and potentially take a long time, while the TGM fence may complete and invalidate the TGM cache immediately, so a concurrent thread could pollute the TGM cache with stale contents for the w location *before* the UGM load has completed, leading to an inversion of the expected memory ordering. v2: Apply the workaround regardless of whether the NIR barrier intrinsic specifies multiple storage classes or a single one, since an acquire barrier is required to order subsequent requests relative to previous atomic requests of unknown storage class not necessarily specified by the memory scope information of the intrinsic. Cc: mesa-stable Reviewed-by: Ivan Briano <ivan.briano@intel.com> Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20690>
2023-01-09 15:31:33 -08:00
/* A memory barrier with acquire semantics requires us to
* guarantee that memory operations of the specified storage
* class sequenced-after the barrier aren't reordered before the
* barrier, nor before any previous atomic operation
* sequenced-before the barrier which may be synchronizing this
* acquire barrier with a prior release sequence.
*
* In order to guarantee the latter we must make sure that any
* such previous operation has completed execution before
* invalidating the relevant caches, since otherwise some cache
* could be polluted by a concurrent thread after its
* invalidation but before the previous atomic completes, which
* could lead to a violation of the expected memory ordering if
* a subsequent memory read hits the polluted cacheline, which
* would return a stale value read from memory before the
* completion of the atomic sequenced-before the barrier.
*
* This ordering inversion can be avoided trivially if the
* operations we need to order are all handled by a single
* in-order cache, since the flush implied by the memory fence
* occurs after any pending operations have completed, however
* that doesn't help us when dealing with multiple caches
* processing requests out of order, in which case we need to
* explicitly stall the EU until any pending memory operations
* have executed.
*
* Note that that might be somewhat heavy handed in some cases.
* In particular when this memory fence was inserted by
* spirv_to_nir() lowering an atomic with acquire semantics into
* an atomic+barrier sequence we could do a better job by
* synchronizing with respect to that one atomic *only*, but
* that would require additional information not currently
* available to the backend.
*
* XXX - Use an alternative workaround on IVB and ICL, since
* SYNC.ALLWR is only available on Gfx12+.
*/
if (devinfo->ver >= 12 &&
(!nir_intrinsic_has_memory_scope(instr) ||
(nir_intrinsic_memory_semantics(instr) & NIR_MEMORY_ACQUIRE))) {
ubld.exec_all().group(1, 0).emit(
BRW_OPCODE_SYNC, ubld.null_reg_ud(), brw_imm_ud(TGL_SYNC_ALLWR));
}
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
if (devinfo->has_lsc) {
assert(devinfo->verx10 >= 125);
uint32_t desc =
lsc_fence_descriptor_for_intrinsic(devinfo, instr);
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
if (ugm_fence) {
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX12_SFID_UGM, desc,
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
true /* commit_enable */,
0 /* bti; ignored for LSC */);
intel/fs,vec4: Pull stall logic for memory fences up into the IR Instead of emitting the stall MOV "inside" the SHADER_OPCODE_MEMORY_FENCE generation, use the scheduling fences when creating the IR. For IvyBridge, every (data cache) fence is accompained by a render cache fence, that now is explicit in the IR, two SHADER_OPCODE_MEMORY_FENCEs are emitted (with different SFIDs). Because Begin and End interlock intrinsics are effectively memory barriers, move its handling alongside the other memory barrier intrinsics. The SHADER_OPCODE_INTERLOCK is still used to distinguish if we are going to use a SENDC (for Begin) or regular SEND (for End). This change is a preparation to allow emitting both SENDs in Gen11+ before we can stall on them. Shader-db results for IVB (i965): total instructions in shared programs: 11971190 -> 11971200 (<.01%) instructions in affected programs: 11482 -> 11492 (0.09%) helped: 0 HURT: 8 HURT stats (abs) min: 1 max: 3 x̄: 1.25 x̃: 1 HURT stats (rel) min: 0.03% max: 0.50% x̄: 0.14% x̃: 0.10% 95% mean confidence interval for instructions value: 0.66 1.84 95% mean confidence interval for instructions %-change: 0.01% 0.27% Instructions are HURT. Unlike the previous code, that used the `mov g1 g2` trick to force both `g1` and `g2` to stall, the scheduling fence will generate `mov null g1` and `mov null g2`. During review it was decided it was not worth keeping the special codepath for the small effect will have. Shader-db results for HSW (i965), BDW and SKL don't have a change on instruction count, but do report changes in cycles count, showing SKL results below total cycles in shared programs: 341738444 -> 341710570 (<.01%) cycles in affected programs: 7240002 -> 7212128 (-0.38%) helped: 46 HURT: 5 helped stats (abs) min: 14 max: 1940 x̄: 676.22 x̃: 154 helped stats (rel) min: <.01% max: 2.62% x̄: 1.28% x̃: 0.95% HURT stats (abs) min: 2 max: 1768 x̄: 646.40 x̃: 362 HURT stats (rel) min: <.01% max: 0.83% x̄: 0.28% x̃: 0.08% 95% mean confidence interval for cycles value: -777.71 -315.38 95% mean confidence interval for cycles %-change: -1.42% -0.83% Cycles are helped. This seems to be the effect of allocating two registers separatedly instead of a single one with size 2, which causes different register allocation, affecting the cycle estimates. while ICL also has not change on instruction count but report changes negative changes in cycles total cycles in shared programs: 352665369 -> 352707484 (0.01%) cycles in affected programs: 9608288 -> 9650403 (0.44%) helped: 4 HURT: 104 helped stats (abs) min: 24 max: 128 x̄: 88.50 x̃: 101 helped stats (rel) min: <.01% max: 0.85% x̄: 0.46% x̃: 0.49% HURT stats (abs) min: 2 max: 2016 x̄: 408.36 x̃: 48 HURT stats (rel) min: <.01% max: 3.31% x̄: 0.88% x̃: 0.45% 95% mean confidence interval for cycles value: 256.67 523.24 95% mean confidence interval for cycles %-change: 0.63% 1.03% Cycles are HURT. AFAICT this is the result of the case above. Shader-db results for TGL have similar cycles result as ICL, but also affect instructions total instructions in shared programs: 17690586 -> 17690597 (<.01%) instructions in affected programs: 64617 -> 64628 (0.02%) helped: 55 HURT: 32 helped stats (abs) min: 1 max: 16 x̄: 4.13 x̃: 3 helped stats (rel) min: 0.05% max: 2.78% x̄: 0.86% x̃: 0.74% HURT stats (abs) min: 1 max: 65 x̄: 7.44 x̃: 2 HURT stats (rel) min: 0.05% max: 4.58% x̄: 1.13% x̃: 0.69% 95% mean confidence interval for instructions value: -2.03 2.28 95% mean confidence interval for instructions %-change: -0.41% 0.15% Inconclusive result (value mean confidence interval includes 0). Now that more is done in the IR, more dependencies are visible and more SWSB annotations are emitted. Mixed with different register allocation decisions like above, some shaders will see more `sync nops` while others able to avoid them. Most of the new `sync nops` are also redundant and could be dropped, which will be fixed in a separate change. Reviewed-by: Francisco Jerez <currojerez@riseup.net> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3278>
2020-01-17 15:07:44 -08:00
}
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
if (tgm_fence) {
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX12_SFID_TGM, desc,
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
true /* commit_enable */,
0 /* bti; ignored for LSC */);
}
intel/fs: Add support for SLM fence in Gen11 Gen11 SLM is not on L3 anymore, so now the hardware has two separate fences. Add a way to control which fence types to use. At this time, we don't have enough information in NIR to control the visibility of the memory being fenced, so for now be conservative and assume that fences will need a stall. With more information later we'll be able to reduce those. Fixes Vulkan CTS tests in ICL: dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.workgroup.guard_local.buffer.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_local.buffer.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_local.image.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.buffer.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.image.guard_nonlocal.workgroup.comp The whole set of supported tests in dEQP-VK.memory_model.* group should be passing in ICL now. v2: Pass BTI around instead of having an enum. (Jason) Emit two SHADER_OPCODE_MEMORY_FENCE instead of one that gets transformed into two. (Jason) List tests fixed. (Lionel) v3: For clarity, split the decision of which fences to emit from the emission code. (Jason) Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> Acked-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
2019-07-10 12:02:23 -07:00
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
if (slm_fence) {
assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
if (intel_needs_workaround(devinfo, 14014063774)) {
/* Wa_14014063774
*
* Before SLM fence compiler needs to insert SYNC.ALLWR in order
* to avoid the SLM data race.
*/
ubld.exec_all().group(1, 0).emit(
BRW_OPCODE_SYNC, ubld.null_reg_ud(),
brw_imm_ud(TGL_SYNC_ALLWR));
}
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX12_SFID_SLM, desc,
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
true /* commit_enable */,
0 /* BTI; ignored for LSC */);
}
if (urb_fence) {
assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, BRW_SFID_URB, desc,
true /* commit_enable */,
0 /* BTI; ignored for LSC */);
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
}
} else if (devinfo->ver >= 11) {
if (tgm_fence || ugm_fence || urb_fence) {
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
true /* commit_enable HSD ES # 1404612949 */,
0 /* BTI = 0 means data cache */);
}
if (slm_fence) {
assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
true /* commit_enable HSD ES # 1404612949 */,
GFX7_BTI_SLM);
}
} else {
/* Prior to Icelake, they're all lumped into a single cache except on
* Ivy Bridge and Bay Trail where typed messages actually go through
* the render cache. There, we need both fences because we may
* access storage images as either typed or untyped.
*/
const bool render_fence = tgm_fence && devinfo->verx10 == 70;
/* Simulation also complains on Gfx9 if we do not enable commit.
*/
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
const bool commit_enable = render_fence ||
instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
devinfo->ver == 9;
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
if (tgm_fence || ugm_fence || slm_fence || urb_fence) {
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
commit_enable, 0 /* BTI */);
}
if (render_fence) {
fence_regs[fence_regs_count++] =
emit_fence(ubld, opcode, GFX6_SFID_DATAPORT_RENDER_CACHE, 0,
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
commit_enable, /* bti */ 0);
}
intel/fs: Add support for SLM fence in Gen11 Gen11 SLM is not on L3 anymore, so now the hardware has two separate fences. Add a way to control which fence types to use. At this time, we don't have enough information in NIR to control the visibility of the memory being fenced, so for now be conservative and assume that fences will need a stall. With more information later we'll be able to reduce those. Fixes Vulkan CTS tests in ICL: dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.workgroup.guard_local.buffer.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_local.buffer.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_local.image.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.buffer.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.image.guard_nonlocal.workgroup.comp The whole set of supported tests in dEQP-VK.memory_model.* group should be passing in ICL now. v2: Pass BTI around instead of having an enum. (Jason) Emit two SHADER_OPCODE_MEMORY_FENCE instead of one that gets transformed into two. (Jason) List tests fixed. (Lionel) v3: For clarity, split the decision of which fences to emit from the emission code. (Jason) Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> Acked-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
2019-07-10 12:02:23 -07:00
}
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
assert(fence_regs_count <= ARRAY_SIZE(fence_regs));
/* Be conservative in Gen11+ and always stall in a fence. Since
* there are two different fences, and shader might want to
* synchronize between them.
*
* TODO: Use scope and visibility information for the barriers from NIR
* to make a better decision on whether we need to stall.
*/
bool force_stall = devinfo->ver >= 11;
/* There are four cases where we want to insert a stall:
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
*
* 1. If we're a nir_intrinsic_end_invocation_interlock. This is
* required to ensure that the shader EOT doesn't happen until
* after the fence returns. Otherwise, we might end up with the
* next shader invocation for that pixel not respecting our fence
* because it may happen on a different HW thread.
*
* 2. If we have multiple fences. This is required to ensure that
* they all complete and nothing gets weirdly out-of-order.
*
* 3. If we have no fences. In this case, we need at least a
* scheduling barrier to keep the compiler from moving things
* around in an invalid way.
*
* 4. On Gen11+ and platforms with LSC, we have multiple fence types,
* without further information about the fence, we need to force a
* stall.
intel/fs: Rework fence handling in brw_fs_nir.cpp Start off making everything look like LSC where we have three types of fences: TGM, UGM, and SLM. Then, emit the actual code in a generation- aware way. There are three HW generation cases we care about: XeHP+ (LSC), ICL-TGL, and IVB-SKL. Even though it looks like there's a lot to deduplicate, it only increases the number of ubld.emit() calls from 5 to 7 and entirely gets rid of the SFID juggling and other weirdness we've introduced along the way to make those cases "general". While we're here, also clean up the code for stalling after fences and clearly document every case where we insert a stall. There are only three known functional changes from this commit: 1. We now avoid the render cache fence on IVB if we don't need image barriers. 2. On ICL+, we no longer unconditionally stall on barriers. We still stall if we have more than one to help tie them together but independent barriers are independent. Barrier instructions will still operate in write-commit mode and still be scheduling barriers but won't necessarily stall. 3. We now assert-fail for URB fences on LSC platforms. We'll be adding in the new URB fence message for those platforms in a follow-on commit. It is a big enough refactor, however, that other minor changes may be present. Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> Reviewed-by: Sagar Ghuge <sagar.ghuge@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13092>
2021-09-15 12:58:04 -05:00
*/
if (instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
fence_regs_count != 1 || devinfo->has_lsc || force_stall) {
ubld.exec_all().group(1, 0).emit(
FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
fence_regs, fence_regs_count);
}
break;
}
case nir_intrinsic_shader_clock: {
/* We cannot do anything if there is an event, so ignore it for now */
const fs_reg shader_clock = get_timestamp(bld);
const fs_reg srcs[] = { component(shader_clock, 0),
component(shader_clock, 1) };
bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
break;
}
case nir_intrinsic_load_reloc_const_intel: {
uint32_t id = nir_intrinsic_param_idx(instr);
/* Emit the reloc in the smallest SIMD size to limit register usage. */
const fs_builder ubld = bld.exec_all().group(1, 0);
fs_reg small_dest = ubld.vgrf(dest.type);
ubld.UNDEF(small_dest);
ubld.exec_all().group(1, 0).emit(SHADER_OPCODE_MOV_RELOC_IMM,
small_dest, brw_imm_ud(id));
/* Copy propagation will get rid of this MOV. */
bld.MOV(dest, component(small_dest, 0));
break;
}
case nir_intrinsic_load_uniform: {
/* Offsets are in bytes but they should always aligned to
* the type size
*/
unsigned base_offset = nir_intrinsic_base(instr);
assert(base_offset % 4 == 0 || base_offset % type_sz(dest.type) == 0);
fs_reg src(UNIFORM, base_offset / 4, dest.type);
if (nir_src_is_const(instr->src[0])) {
unsigned load_offset = nir_src_as_uint(instr->src[0]);
assert(load_offset % type_sz(dest.type) == 0);
/* The base offset can only handle 32-bit units, so for 16-bit
* data take the modulo of the offset with 4 bytes and add it to
* the offset to read from within the source register.
*/
src.offset = load_offset + base_offset % 4;
for (unsigned j = 0; j < instr->num_components; j++) {
bld.MOV(offset(dest, bld, j), offset(src, bld, j));
}
nir: Get rid of *_indirect variants of input/output load/store intrinsics There is some special-casing needed in a competent back-end. However, they can do their special-casing easily enough based on whether or not the offset is a constant. In the mean time, having the *_indirect variants adds special cases a number of places where they don't need to be and, in general, only complicates things. To complicate matters, NIR had no way to convdert an indirect load/store to a direct one in the case that the indirect was a constant so we would still not really get what the back-ends wanted. The best solution seems to be to get rid of the *_indirect variants entirely. This commit is a bunch of different changes squashed together: - nir: Get rid of *_indirect variants of input/output load/store intrinsics - nir/glsl: Stop handling UBO/SSBO load/stores differently depending on indirect - nir/lower_io: Get rid of load/store_foo_indirect - i965/fs: Get rid of load/store_foo_indirect - i965/vec4: Get rid of load/store_foo_indirect - tgsi_to_nir: Get rid of load/store_foo_indirect - ir3/nir: Use the new unified io intrinsics - vc4: Do all uniform loads with byte offsets - vc4/nir: Use the new unified io intrinsics - vc4: Fix load_user_clip_plane crash - vc4: add missing src for store outputs - vc4: Fix state uniforms - nir/lower_clip: Update to the new load/store intrinsics - nir/lower_two_sided_color: Update to the new load intrinsic NIR and i965 changes are Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> NIR indirect declarations and vc4 changes are Reviewed-by: Eric Anholt <eric@anholt.net> ir3 changes are Reviewed-by: Rob Clark <robdclark@gmail.com> NIR changes are Acked-by: Rob Clark <robdclark@gmail.com>
2015-11-25 14:14:05 -08:00
} else {
fs_reg indirect = retype(get_nir_src(instr->src[0]),
BRW_REGISTER_TYPE_UD);
/* We need to pass a size to the MOV_INDIRECT but we don't want it to
* go past the end of the uniform. In order to keep the n'th
* component from running past, we subtract off the size of all but
* one component of the vector.
*/
assert(nir_intrinsic_range(instr) >=
instr->num_components * type_sz(dest.type));
unsigned read_size = nir_intrinsic_range(instr) -
(instr->num_components - 1) * type_sz(dest.type);
bool supports_64bit_indirects =
devinfo->platform != INTEL_PLATFORM_CHV && !intel_device_info_is_9lp(devinfo);
if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
for (unsigned j = 0; j < instr->num_components; j++) {
bld.emit(SHADER_OPCODE_MOV_INDIRECT,
offset(dest, bld, j), offset(src, bld, j),
indirect, brw_imm_ud(read_size));
}
} else {
const unsigned num_mov_indirects =
type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
/* We read a little bit less per MOV INDIRECT, as they are now
* 32-bits ones instead of 64-bit. Fix read_size then.
*/
const unsigned read_size_32bit = read_size -
(num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
for (unsigned j = 0; j < instr->num_components; j++) {
for (unsigned i = 0; i < num_mov_indirects; i++) {
bld.emit(SHADER_OPCODE_MOV_INDIRECT,
subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
indirect, brw_imm_ud(read_size_32bit));
}
}
}
}
break;
}
intel/fs: make use of load_ubo_uniform_block_intel The principle is the same as the load_ssbo_uniform_block_intel. Whenever we see a uniform offset, load the data only once in GRFs to reduce register pressure. Iris shader-db run on DG2 : total instructions in shared programs: 23001325 -> 23094969 (0.41%) instructions in affected programs: 1775989 -> 1869633 (5.27%) helped: 764 HURT: 2097 helped stats (abs) min: 1 max: 102 x̄: 6.96 x̃: 2 helped stats (rel) min: 0.03% max: 16.91% x̄: 1.36% x̃: 0.63% HURT stats (abs) min: 1 max: 2461 x̄: 47.19 x̃: 7 HURT stats (rel) min: <.01% max: 199.34% x̄: 5.91% x̃: 2.60% 95% mean confidence interval for instructions value: 25.43 40.03 95% mean confidence interval for instructions %-change: 3.60% 4.33% Instructions are HURT. total loops in shared programs: 5847 -> 5847 (0.00%) loops in affected programs: 0 -> 0 helped: 0 HURT: 0 total cycles in shared programs: 839329852 -> 845491482 (0.73%) cycles in affected programs: 130229434 -> 136391064 (4.73%) helped: 1098 HURT: 2228 helped stats (abs) min: 1 max: 130102 x̄: 1340.64 x̃: 22 helped stats (rel) min: <.01% max: 64.25% x̄: 4.03% x̃: 0.71% HURT stats (abs) min: 1 max: 185309 x̄: 3426.24 x̃: 87 HURT stats (rel) min: <.01% max: 92.85% x̄: 8.12% x̃: 3.82% 95% mean confidence interval for cycles value: 1342.16 2362.97 95% mean confidence interval for cycles %-change: 3.70% 4.52% Cycles are HURT. total spills in shared programs: 10768 -> 11856 (10.10%) spills in affected programs: 9717 -> 10805 (11.20%) helped: 25 HURT: 28 total fills in shared programs: 13720 -> 16258 (18.50%) fills in affected programs: 12016 -> 14554 (21.12%) helped: 25 HURT: 28 total sends in shared programs: 1034790 -> 1031266 (-0.34%) sends in affected programs: 33416 -> 29892 (-10.55%) helped: 1005 HURT: 0 helped stats (abs) min: 1 max: 22 x̄: 3.51 x̃: 3 helped stats (rel) min: 1.69% max: 60.00% x̄: 15.20% x̃: 14.08% 95% mean confidence interval for sends value: -3.72 -3.29 95% mean confidence interval for sends %-change: -15.82% -14.57% Sends are helped. LOST: 26 GAINED: 183 shader-db on a number of VK/DX titles on DG2 : PERCENTAGE DELTAS Shaders Instrs Cycles age_of_wonders_III 1928 +0.02% -0.19% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Max live registers Max dispatch width assassins_creed_odyssey 2119 +1.12% -0.42% -0.03% -0.29% -9.10% -4.26% -0.64% +0.65% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Max live registers aztec_ruins_high 269 -0.05% -0.45% -0.29% -7.27% -0.33% PERCENTAGE DELTAS Shaders Instrs Cycles Max live registers Max dispatch width dark_souls_3_dxvk_g2 1420 +0.09% +0.24% +0.21% +0.12% (stats look bad, but it's just one shader affected) PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers fallout_4_dxvk_g2 1638 +0.67% +8.32% +16.02% +7.17% +100.00% +0.48% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Spill count Fill count Max live registers Max dispatch width red_dead_redemption2 5969 +0.16% -0.04% -0.04% +0.01% +0.05% -0.20% +0.04% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12129 +2.19% +1.36% -1.23% -0.36% +2.04% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers shooter-game 693 +0.07% -0.89% -0.09% -0.09% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers Max dispatch width talos_g2 1140 +0.37% +3.80% -0.86% -0.67% +0.19% PERCENTAGE DELTAS Shaders Instrs Cycles Max live registers Max dispatch width total_war_warhammer2 477 +0.25% +0.66% -0.17% +0.10% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers Max dispatch width witcher_3_dxvk_g2 1074 +0.75% -10.45% -0.15% -0.16% -0.16% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers wolfenstein_youngblood 1111 +0.52% +0.66% -0.59% -0.03% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23477>
2023-06-06 18:03:26 +03:00
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_ubo_uniform_block_intel: {
fs_reg surface, surface_handle;
if (get_nir_src_bindless(instr->src[0]))
surface_handle = get_nir_buffer_intrinsic_index(bld, instr);
else
surface = get_nir_buffer_intrinsic_index(bld, instr);
if (!nir_src_is_const(instr->src[1])) {
intel/fs: make use of load_ubo_uniform_block_intel The principle is the same as the load_ssbo_uniform_block_intel. Whenever we see a uniform offset, load the data only once in GRFs to reduce register pressure. Iris shader-db run on DG2 : total instructions in shared programs: 23001325 -> 23094969 (0.41%) instructions in affected programs: 1775989 -> 1869633 (5.27%) helped: 764 HURT: 2097 helped stats (abs) min: 1 max: 102 x̄: 6.96 x̃: 2 helped stats (rel) min: 0.03% max: 16.91% x̄: 1.36% x̃: 0.63% HURT stats (abs) min: 1 max: 2461 x̄: 47.19 x̃: 7 HURT stats (rel) min: <.01% max: 199.34% x̄: 5.91% x̃: 2.60% 95% mean confidence interval for instructions value: 25.43 40.03 95% mean confidence interval for instructions %-change: 3.60% 4.33% Instructions are HURT. total loops in shared programs: 5847 -> 5847 (0.00%) loops in affected programs: 0 -> 0 helped: 0 HURT: 0 total cycles in shared programs: 839329852 -> 845491482 (0.73%) cycles in affected programs: 130229434 -> 136391064 (4.73%) helped: 1098 HURT: 2228 helped stats (abs) min: 1 max: 130102 x̄: 1340.64 x̃: 22 helped stats (rel) min: <.01% max: 64.25% x̄: 4.03% x̃: 0.71% HURT stats (abs) min: 1 max: 185309 x̄: 3426.24 x̃: 87 HURT stats (rel) min: <.01% max: 92.85% x̄: 8.12% x̃: 3.82% 95% mean confidence interval for cycles value: 1342.16 2362.97 95% mean confidence interval for cycles %-change: 3.70% 4.52% Cycles are HURT. total spills in shared programs: 10768 -> 11856 (10.10%) spills in affected programs: 9717 -> 10805 (11.20%) helped: 25 HURT: 28 total fills in shared programs: 13720 -> 16258 (18.50%) fills in affected programs: 12016 -> 14554 (21.12%) helped: 25 HURT: 28 total sends in shared programs: 1034790 -> 1031266 (-0.34%) sends in affected programs: 33416 -> 29892 (-10.55%) helped: 1005 HURT: 0 helped stats (abs) min: 1 max: 22 x̄: 3.51 x̃: 3 helped stats (rel) min: 1.69% max: 60.00% x̄: 15.20% x̃: 14.08% 95% mean confidence interval for sends value: -3.72 -3.29 95% mean confidence interval for sends %-change: -15.82% -14.57% Sends are helped. LOST: 26 GAINED: 183 shader-db on a number of VK/DX titles on DG2 : PERCENTAGE DELTAS Shaders Instrs Cycles age_of_wonders_III 1928 +0.02% -0.19% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Max live registers Max dispatch width assassins_creed_odyssey 2119 +1.12% -0.42% -0.03% -0.29% -9.10% -4.26% -0.64% +0.65% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Max live registers aztec_ruins_high 269 -0.05% -0.45% -0.29% -7.27% -0.33% PERCENTAGE DELTAS Shaders Instrs Cycles Max live registers Max dispatch width dark_souls_3_dxvk_g2 1420 +0.09% +0.24% +0.21% +0.12% (stats look bad, but it's just one shader affected) PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers fallout_4_dxvk_g2 1638 +0.67% +8.32% +16.02% +7.17% +100.00% +0.48% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Spill count Fill count Max live registers Max dispatch width red_dead_redemption2 5969 +0.16% -0.04% -0.04% +0.01% +0.05% -0.20% +0.04% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12129 +2.19% +1.36% -1.23% -0.36% +2.04% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers shooter-game 693 +0.07% -0.89% -0.09% -0.09% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers Max dispatch width talos_g2 1140 +0.37% +3.80% -0.86% -0.67% +0.19% PERCENTAGE DELTAS Shaders Instrs Cycles Max live registers Max dispatch width total_war_warhammer2 477 +0.25% +0.66% -0.17% +0.10% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers Max dispatch width witcher_3_dxvk_g2 1074 +0.75% -10.45% -0.15% -0.16% -0.16% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers wolfenstein_youngblood 1111 +0.52% +0.66% -0.59% -0.03% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23477>
2023-06-06 18:03:26 +03:00
if (instr->intrinsic == nir_intrinsic_load_ubo) {
/* load_ubo with non-uniform offset */
fs_reg base_offset = retype(get_nir_src(instr->src[1]),
BRW_REGISTER_TYPE_UD);
for (int i = 0; i < instr->num_components; i++)
VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i),
surface, surface_handle,
base_offset, i * type_sz(dest.type),
instr->def.bit_size / 8);
intel/fs: make use of load_ubo_uniform_block_intel The principle is the same as the load_ssbo_uniform_block_intel. Whenever we see a uniform offset, load the data only once in GRFs to reduce register pressure. Iris shader-db run on DG2 : total instructions in shared programs: 23001325 -> 23094969 (0.41%) instructions in affected programs: 1775989 -> 1869633 (5.27%) helped: 764 HURT: 2097 helped stats (abs) min: 1 max: 102 x̄: 6.96 x̃: 2 helped stats (rel) min: 0.03% max: 16.91% x̄: 1.36% x̃: 0.63% HURT stats (abs) min: 1 max: 2461 x̄: 47.19 x̃: 7 HURT stats (rel) min: <.01% max: 199.34% x̄: 5.91% x̃: 2.60% 95% mean confidence interval for instructions value: 25.43 40.03 95% mean confidence interval for instructions %-change: 3.60% 4.33% Instructions are HURT. total loops in shared programs: 5847 -> 5847 (0.00%) loops in affected programs: 0 -> 0 helped: 0 HURT: 0 total cycles in shared programs: 839329852 -> 845491482 (0.73%) cycles in affected programs: 130229434 -> 136391064 (4.73%) helped: 1098 HURT: 2228 helped stats (abs) min: 1 max: 130102 x̄: 1340.64 x̃: 22 helped stats (rel) min: <.01% max: 64.25% x̄: 4.03% x̃: 0.71% HURT stats (abs) min: 1 max: 185309 x̄: 3426.24 x̃: 87 HURT stats (rel) min: <.01% max: 92.85% x̄: 8.12% x̃: 3.82% 95% mean confidence interval for cycles value: 1342.16 2362.97 95% mean confidence interval for cycles %-change: 3.70% 4.52% Cycles are HURT. total spills in shared programs: 10768 -> 11856 (10.10%) spills in affected programs: 9717 -> 10805 (11.20%) helped: 25 HURT: 28 total fills in shared programs: 13720 -> 16258 (18.50%) fills in affected programs: 12016 -> 14554 (21.12%) helped: 25 HURT: 28 total sends in shared programs: 1034790 -> 1031266 (-0.34%) sends in affected programs: 33416 -> 29892 (-10.55%) helped: 1005 HURT: 0 helped stats (abs) min: 1 max: 22 x̄: 3.51 x̃: 3 helped stats (rel) min: 1.69% max: 60.00% x̄: 15.20% x̃: 14.08% 95% mean confidence interval for sends value: -3.72 -3.29 95% mean confidence interval for sends %-change: -15.82% -14.57% Sends are helped. LOST: 26 GAINED: 183 shader-db on a number of VK/DX titles on DG2 : PERCENTAGE DELTAS Shaders Instrs Cycles age_of_wonders_III 1928 +0.02% -0.19% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Max live registers Max dispatch width assassins_creed_odyssey 2119 +1.12% -0.42% -0.03% -0.29% -9.10% -4.26% -0.64% +0.65% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Max live registers aztec_ruins_high 269 -0.05% -0.45% -0.29% -7.27% -0.33% PERCENTAGE DELTAS Shaders Instrs Cycles Max live registers Max dispatch width dark_souls_3_dxvk_g2 1420 +0.09% +0.24% +0.21% +0.12% (stats look bad, but it's just one shader affected) PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers fallout_4_dxvk_g2 1638 +0.67% +8.32% +16.02% +7.17% +100.00% +0.48% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Spill count Fill count Max live registers Max dispatch width red_dead_redemption2 5969 +0.16% -0.04% -0.04% +0.01% +0.05% -0.20% +0.04% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12129 +2.19% +1.36% -1.23% -0.36% +2.04% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers shooter-game 693 +0.07% -0.89% -0.09% -0.09% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers Max dispatch width talos_g2 1140 +0.37% +3.80% -0.86% -0.67% +0.19% PERCENTAGE DELTAS Shaders Instrs Cycles Max live registers Max dispatch width total_war_warhammer2 477 +0.25% +0.66% -0.17% +0.10% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers Max dispatch width witcher_3_dxvk_g2 1074 +0.75% -10.45% -0.15% -0.16% -0.16% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers wolfenstein_youngblood 1111 +0.52% +0.66% -0.59% -0.03% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23477>
2023-06-06 18:03:26 +03:00
prog_data->has_ubo_pull = true;
} else {
/* load_ubo with uniform offset */
const fs_builder ubld1 = bld.exec_all().group(1, 0);
const fs_builder ubld8 = bld.exec_all().group(8, 0);
const fs_builder ubld16 = bld.exec_all().group(16, 0);
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
srcs[SURFACE_LOGICAL_SRC_SURFACE] = surface;
srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = surface_handle;
const nir_src load_offset = instr->src[1];
if (nir_src_is_const(load_offset)) {
fs_reg addr = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
ubld8.MOV(addr, brw_imm_ud(nir_src_as_uint(load_offset)));
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0);
} else {
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
bld.emit_uniformize(get_nir_src(load_offset));
}
const unsigned total_dwords = ALIGN(instr->num_components, REG_SIZE / 4);
unsigned loaded_dwords = 0;
const fs_reg packed_consts =
ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords);
intel/fs: make use of load_ubo_uniform_block_intel The principle is the same as the load_ssbo_uniform_block_intel. Whenever we see a uniform offset, load the data only once in GRFs to reduce register pressure. Iris shader-db run on DG2 : total instructions in shared programs: 23001325 -> 23094969 (0.41%) instructions in affected programs: 1775989 -> 1869633 (5.27%) helped: 764 HURT: 2097 helped stats (abs) min: 1 max: 102 x̄: 6.96 x̃: 2 helped stats (rel) min: 0.03% max: 16.91% x̄: 1.36% x̃: 0.63% HURT stats (abs) min: 1 max: 2461 x̄: 47.19 x̃: 7 HURT stats (rel) min: <.01% max: 199.34% x̄: 5.91% x̃: 2.60% 95% mean confidence interval for instructions value: 25.43 40.03 95% mean confidence interval for instructions %-change: 3.60% 4.33% Instructions are HURT. total loops in shared programs: 5847 -> 5847 (0.00%) loops in affected programs: 0 -> 0 helped: 0 HURT: 0 total cycles in shared programs: 839329852 -> 845491482 (0.73%) cycles in affected programs: 130229434 -> 136391064 (4.73%) helped: 1098 HURT: 2228 helped stats (abs) min: 1 max: 130102 x̄: 1340.64 x̃: 22 helped stats (rel) min: <.01% max: 64.25% x̄: 4.03% x̃: 0.71% HURT stats (abs) min: 1 max: 185309 x̄: 3426.24 x̃: 87 HURT stats (rel) min: <.01% max: 92.85% x̄: 8.12% x̃: 3.82% 95% mean confidence interval for cycles value: 1342.16 2362.97 95% mean confidence interval for cycles %-change: 3.70% 4.52% Cycles are HURT. total spills in shared programs: 10768 -> 11856 (10.10%) spills in affected programs: 9717 -> 10805 (11.20%) helped: 25 HURT: 28 total fills in shared programs: 13720 -> 16258 (18.50%) fills in affected programs: 12016 -> 14554 (21.12%) helped: 25 HURT: 28 total sends in shared programs: 1034790 -> 1031266 (-0.34%) sends in affected programs: 33416 -> 29892 (-10.55%) helped: 1005 HURT: 0 helped stats (abs) min: 1 max: 22 x̄: 3.51 x̃: 3 helped stats (rel) min: 1.69% max: 60.00% x̄: 15.20% x̃: 14.08% 95% mean confidence interval for sends value: -3.72 -3.29 95% mean confidence interval for sends %-change: -15.82% -14.57% Sends are helped. LOST: 26 GAINED: 183 shader-db on a number of VK/DX titles on DG2 : PERCENTAGE DELTAS Shaders Instrs Cycles age_of_wonders_III 1928 +0.02% -0.19% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Max live registers Max dispatch width assassins_creed_odyssey 2119 +1.12% -0.42% -0.03% -0.29% -9.10% -4.26% -0.64% +0.65% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Max live registers aztec_ruins_high 269 -0.05% -0.45% -0.29% -7.27% -0.33% PERCENTAGE DELTAS Shaders Instrs Cycles Max live registers Max dispatch width dark_souls_3_dxvk_g2 1420 +0.09% +0.24% +0.21% +0.12% (stats look bad, but it's just one shader affected) PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers fallout_4_dxvk_g2 1638 +0.67% +8.32% +16.02% +7.17% +100.00% +0.48% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Spill count Fill count Max live registers Max dispatch width red_dead_redemption2 5969 +0.16% -0.04% -0.04% +0.01% +0.05% -0.20% +0.04% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12129 +2.19% +1.36% -1.23% -0.36% +2.04% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers shooter-game 693 +0.07% -0.89% -0.09% -0.09% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers Max dispatch width talos_g2 1140 +0.37% +3.80% -0.86% -0.67% +0.19% PERCENTAGE DELTAS Shaders Instrs Cycles Max live registers Max dispatch width total_war_warhammer2 477 +0.25% +0.66% -0.17% +0.10% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers Max dispatch width witcher_3_dxvk_g2 1074 +0.75% -10.45% -0.15% -0.16% -0.16% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers wolfenstein_youngblood 1111 +0.52% +0.66% -0.59% -0.03% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23477>
2023-06-06 18:03:26 +03:00
while (loaded_dwords < total_dwords) {
const unsigned block =
choose_oword_block_size_dwords(devinfo,
total_dwords - loaded_dwords);
const unsigned block_bytes = block * 4;
intel/fs: make use of load_ubo_uniform_block_intel The principle is the same as the load_ssbo_uniform_block_intel. Whenever we see a uniform offset, load the data only once in GRFs to reduce register pressure. Iris shader-db run on DG2 : total instructions in shared programs: 23001325 -> 23094969 (0.41%) instructions in affected programs: 1775989 -> 1869633 (5.27%) helped: 764 HURT: 2097 helped stats (abs) min: 1 max: 102 x̄: 6.96 x̃: 2 helped stats (rel) min: 0.03% max: 16.91% x̄: 1.36% x̃: 0.63% HURT stats (abs) min: 1 max: 2461 x̄: 47.19 x̃: 7 HURT stats (rel) min: <.01% max: 199.34% x̄: 5.91% x̃: 2.60% 95% mean confidence interval for instructions value: 25.43 40.03 95% mean confidence interval for instructions %-change: 3.60% 4.33% Instructions are HURT. total loops in shared programs: 5847 -> 5847 (0.00%) loops in affected programs: 0 -> 0 helped: 0 HURT: 0 total cycles in shared programs: 839329852 -> 845491482 (0.73%) cycles in affected programs: 130229434 -> 136391064 (4.73%) helped: 1098 HURT: 2228 helped stats (abs) min: 1 max: 130102 x̄: 1340.64 x̃: 22 helped stats (rel) min: <.01% max: 64.25% x̄: 4.03% x̃: 0.71% HURT stats (abs) min: 1 max: 185309 x̄: 3426.24 x̃: 87 HURT stats (rel) min: <.01% max: 92.85% x̄: 8.12% x̃: 3.82% 95% mean confidence interval for cycles value: 1342.16 2362.97 95% mean confidence interval for cycles %-change: 3.70% 4.52% Cycles are HURT. total spills in shared programs: 10768 -> 11856 (10.10%) spills in affected programs: 9717 -> 10805 (11.20%) helped: 25 HURT: 28 total fills in shared programs: 13720 -> 16258 (18.50%) fills in affected programs: 12016 -> 14554 (21.12%) helped: 25 HURT: 28 total sends in shared programs: 1034790 -> 1031266 (-0.34%) sends in affected programs: 33416 -> 29892 (-10.55%) helped: 1005 HURT: 0 helped stats (abs) min: 1 max: 22 x̄: 3.51 x̃: 3 helped stats (rel) min: 1.69% max: 60.00% x̄: 15.20% x̃: 14.08% 95% mean confidence interval for sends value: -3.72 -3.29 95% mean confidence interval for sends %-change: -15.82% -14.57% Sends are helped. LOST: 26 GAINED: 183 shader-db on a number of VK/DX titles on DG2 : PERCENTAGE DELTAS Shaders Instrs Cycles age_of_wonders_III 1928 +0.02% -0.19% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Max live registers Max dispatch width assassins_creed_odyssey 2119 +1.12% -0.42% -0.03% -0.29% -9.10% -4.26% -0.64% +0.65% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Max live registers aztec_ruins_high 269 -0.05% -0.45% -0.29% -7.27% -0.33% PERCENTAGE DELTAS Shaders Instrs Cycles Max live registers Max dispatch width dark_souls_3_dxvk_g2 1420 +0.09% +0.24% +0.21% +0.12% (stats look bad, but it's just one shader affected) PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers fallout_4_dxvk_g2 1638 +0.67% +8.32% +16.02% +7.17% +100.00% +0.48% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Spill count Fill count Max live registers Max dispatch width red_dead_redemption2 5969 +0.16% -0.04% -0.04% +0.01% +0.05% -0.20% +0.04% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12129 +2.19% +1.36% -1.23% -0.36% +2.04% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers shooter-game 693 +0.07% -0.89% -0.09% -0.09% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers Max dispatch width talos_g2 1140 +0.37% +3.80% -0.86% -0.67% +0.19% PERCENTAGE DELTAS Shaders Instrs Cycles Max live registers Max dispatch width total_war_warhammer2 477 +0.25% +0.66% -0.17% +0.10% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers Max dispatch width witcher_3_dxvk_g2 1074 +0.75% -10.45% -0.15% -0.16% -0.16% PERCENTAGE DELTAS Shaders Instrs Cycles Send messages Max live registers wolfenstein_youngblood 1111 +0.52% +0.66% -0.59% -0.03% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23477>
2023-06-06 18:03:26 +03:00
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block);
const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD),
srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = align(block_bytes, REG_SIZE);
loaded_dwords += block;
ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
srcs[SURFACE_LOGICAL_SRC_ADDRESS],
brw_imm_ud(block_bytes));
}
for (unsigned c = 0; c < instr->num_components; c++) {
bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD),
component(packed_consts, c));
}
prog_data->has_ubo_pull = true;
}
} else {
/* Even if we are loading doubles, a pull constant load will load
* a 32-bit vec4, so should only reserve vgrf space for that. If we
* need to load a full dvec4 we will have to emit 2 loads. This is
* similar to demote_pull_constants(), except that in that case we
* see individual accesses to each component of the vector and then
* we let CSE deal with duplicate loads. Here we see a vector access
* and we have to split it if necessary.
*/
const unsigned type_size = type_sz(dest.type);
const unsigned load_offset = nir_src_as_uint(instr->src[1]);
const unsigned ubo_block =
brw_nir_ubo_surface_index_get_push_block(instr->src[0]);
const unsigned offset_256b = load_offset / 32;
const unsigned end_256b =
DIV_ROUND_UP(load_offset + type_size * instr->num_components, 32);
/* See if we've selected this as a push constant candidate */
fs_reg push_reg;
for (int i = 0; i < 4; i++) {
const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
if (range->block == ubo_block &&
offset_256b >= range->start &&
end_256b <= range->start + range->length) {
push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type);
push_reg.offset = load_offset - 32 * range->start;
break;
}
}
if (push_reg.file != BAD_FILE) {
for (unsigned i = 0; i < instr->num_components; i++) {
bld.MOV(offset(dest, bld, i),
byte_offset(push_reg, i * type_size));
}
break;
}
prog_data->has_ubo_pull = true;
i965/fs: Fetch one cacheline of pull constants at a time. Asking the DC for less than one cacheline (4 owords) of data for uniform pull constants is suboptimal because the DC cannot request less than that from L3, resulting in wasted bandwidth and unnecessary message dispatch overhead, and exacerbating the IVB L3 serialization bug. The following table summarizes the overall framerate improvement (with statistical significance of 5% and sample size ~10) from the whole series up to this patch for several benchmarks and hardware generations: | SKL | BDW | HSW SynMark2 OglShMapPcf | 24.63% ±0.45% | 4.01% ±0.70% | 10.31% ±0.38% GfxBench4 gl_manhattan31 | 5.93% ±0.35% | 3.92% ±0.31% | 6.62% ±0.22% GfxBench4 gl_4 | 2.52% ±0.44% | 1.23% ±0.10% | N/A Unigine Valley | 0.83% ±0.17% | 0.23% ±0.05% | 0.74% ±0.45% Note that there are two versions of the Manhattan demo shipped with GfxBench4, one of them is the original gl_manhattan demo which doesn't use UBOs, so this patch will have no effect on it, and another one is the gl_manhattan31 demo based on GL 4.3/GLES 3.1, which this patch benefits as shown above. I haven't observed any statistically significant regressions in the benchmarks I have at hand. Note that the comparatively huge improvement on SKL in the OglShMapPcf test case is due to the combined effect of this patch and the register pressure benefit on SKL+ of "i965/fs: Switch to the constant cache for uniform pull constants.", part of the same series. Going up to 8 oword blocks would improve performance of pull constants even more, but at the cost of some additional bandwidth and register pressure, so it would have to be done on-demand based on the number of constants actually used by the shader. v2: Fix for Gen4 and 5. v3: Non-trivial rebase. Rework to allow the visitor specifiy arbitrary pull constant block sizes. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2016-12-08 19:18:00 -08:00
const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
for (unsigned c = 0; c < instr->num_components;) {
const unsigned base = load_offset + c * type_size;
i965/fs: Fetch one cacheline of pull constants at a time. Asking the DC for less than one cacheline (4 owords) of data for uniform pull constants is suboptimal because the DC cannot request less than that from L3, resulting in wasted bandwidth and unnecessary message dispatch overhead, and exacerbating the IVB L3 serialization bug. The following table summarizes the overall framerate improvement (with statistical significance of 5% and sample size ~10) from the whole series up to this patch for several benchmarks and hardware generations: | SKL | BDW | HSW SynMark2 OglShMapPcf | 24.63% ±0.45% | 4.01% ±0.70% | 10.31% ±0.38% GfxBench4 gl_manhattan31 | 5.93% ±0.35% | 3.92% ±0.31% | 6.62% ±0.22% GfxBench4 gl_4 | 2.52% ±0.44% | 1.23% ±0.10% | N/A Unigine Valley | 0.83% ±0.17% | 0.23% ±0.05% | 0.74% ±0.45% Note that there are two versions of the Manhattan demo shipped with GfxBench4, one of them is the original gl_manhattan demo which doesn't use UBOs, so this patch will have no effect on it, and another one is the gl_manhattan31 demo based on GL 4.3/GLES 3.1, which this patch benefits as shown above. I haven't observed any statistically significant regressions in the benchmarks I have at hand. Note that the comparatively huge improvement on SKL in the OglShMapPcf test case is due to the combined effect of this patch and the register pressure benefit on SKL+ of "i965/fs: Switch to the constant cache for uniform pull constants.", part of the same series. Going up to 8 oword blocks would improve performance of pull constants even more, but at the cost of some additional bandwidth and register pressure, so it would have to be done on-demand based on the number of constants actually used by the shader. v2: Fix for Gen4 and 5. v3: Non-trivial rebase. Rework to allow the visitor specifiy arbitrary pull constant block sizes. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2016-12-08 19:18:00 -08:00
/* Number of usable components in the next block-aligned load. */
const unsigned count = MIN2(instr->num_components - c,
i965/fs: Fetch one cacheline of pull constants at a time. Asking the DC for less than one cacheline (4 owords) of data for uniform pull constants is suboptimal because the DC cannot request less than that from L3, resulting in wasted bandwidth and unnecessary message dispatch overhead, and exacerbating the IVB L3 serialization bug. The following table summarizes the overall framerate improvement (with statistical significance of 5% and sample size ~10) from the whole series up to this patch for several benchmarks and hardware generations: | SKL | BDW | HSW SynMark2 OglShMapPcf | 24.63% ±0.45% | 4.01% ±0.70% | 10.31% ±0.38% GfxBench4 gl_manhattan31 | 5.93% ±0.35% | 3.92% ±0.31% | 6.62% ±0.22% GfxBench4 gl_4 | 2.52% ±0.44% | 1.23% ±0.10% | N/A Unigine Valley | 0.83% ±0.17% | 0.23% ±0.05% | 0.74% ±0.45% Note that there are two versions of the Manhattan demo shipped with GfxBench4, one of them is the original gl_manhattan demo which doesn't use UBOs, so this patch will have no effect on it, and another one is the gl_manhattan31 demo based on GL 4.3/GLES 3.1, which this patch benefits as shown above. I haven't observed any statistically significant regressions in the benchmarks I have at hand. Note that the comparatively huge improvement on SKL in the OglShMapPcf test case is due to the combined effect of this patch and the register pressure benefit on SKL+ of "i965/fs: Switch to the constant cache for uniform pull constants.", part of the same series. Going up to 8 oword blocks would improve performance of pull constants even more, but at the cost of some additional bandwidth and register pressure, so it would have to be done on-demand based on the number of constants actually used by the shader. v2: Fix for Gen4 and 5. v3: Non-trivial rebase. Rework to allow the visitor specifiy arbitrary pull constant block sizes. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2016-12-08 19:18:00 -08:00
(block_sz - base % block_sz) / type_size);
const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = surface;
srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1));
srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz);
ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
srcs, PULL_UNIFORM_CONSTANT_SRCS);
const fs_reg consts =
i965/fs: Fetch one cacheline of pull constants at a time. Asking the DC for less than one cacheline (4 owords) of data for uniform pull constants is suboptimal because the DC cannot request less than that from L3, resulting in wasted bandwidth and unnecessary message dispatch overhead, and exacerbating the IVB L3 serialization bug. The following table summarizes the overall framerate improvement (with statistical significance of 5% and sample size ~10) from the whole series up to this patch for several benchmarks and hardware generations: | SKL | BDW | HSW SynMark2 OglShMapPcf | 24.63% ±0.45% | 4.01% ±0.70% | 10.31% ±0.38% GfxBench4 gl_manhattan31 | 5.93% ±0.35% | 3.92% ±0.31% | 6.62% ±0.22% GfxBench4 gl_4 | 2.52% ±0.44% | 1.23% ±0.10% | N/A Unigine Valley | 0.83% ±0.17% | 0.23% ±0.05% | 0.74% ±0.45% Note that there are two versions of the Manhattan demo shipped with GfxBench4, one of them is the original gl_manhattan demo which doesn't use UBOs, so this patch will have no effect on it, and another one is the gl_manhattan31 demo based on GL 4.3/GLES 3.1, which this patch benefits as shown above. I haven't observed any statistically significant regressions in the benchmarks I have at hand. Note that the comparatively huge improvement on SKL in the OglShMapPcf test case is due to the combined effect of this patch and the register pressure benefit on SKL+ of "i965/fs: Switch to the constant cache for uniform pull constants.", part of the same series. Going up to 8 oword blocks would improve performance of pull constants even more, but at the cost of some additional bandwidth and register pressure, so it would have to be done on-demand based on the number of constants actually used by the shader. v2: Fix for Gen4 and 5. v3: Non-trivial rebase. Rework to allow the visitor specifiy arbitrary pull constant block sizes. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2016-12-08 19:18:00 -08:00
retype(byte_offset(packed_consts, base & (block_sz - 1)),
dest.type);
for (unsigned d = 0; d < count; d++)
bld.MOV(offset(dest, bld, c + d), component(consts, d));
c += count;
}
}
break;
}
case nir_intrinsic_load_global:
case nir_intrinsic_load_global_constant: {
assert(devinfo->ver >= 8);
assert(instr->def.bit_size <= 32);
assert(nir_intrinsic_align(instr) > 0);
fs_reg srcs[A64_LOGICAL_NUM_SRCS];
srcs[A64_LOGICAL_ADDRESS] = get_nir_src(instr->src[0]);
srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */
srcs[A64_LOGICAL_ENABLE_HELPERS] =
brw_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
if (instr->def.bit_size == 32 &&
nir_intrinsic_align(instr) >= 4) {
assert(instr->def.num_components <= 4);
srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components);
fs_inst *inst =
bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, dest,
srcs, A64_LOGICAL_NUM_SRCS);
inst->size_written = instr->num_components *
inst->dst.component_size(inst->exec_size);
} else {
const unsigned bit_size = instr->def.bit_size;
assert(instr->def.num_components == 1);
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
srcs[A64_LOGICAL_ARG] = brw_imm_ud(bit_size);
bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, tmp,
srcs, A64_LOGICAL_NUM_SRCS);
bld.MOV(dest, subscript(tmp, dest.type, 0));
}
break;
}
case nir_intrinsic_store_global: {
assert(devinfo->ver >= 8);
assert(nir_src_bit_size(instr->src[0]) <= 32);
assert(nir_intrinsic_write_mask(instr) ==
(1u << instr->num_components) - 1);
assert(nir_intrinsic_align(instr) > 0);
fs_reg srcs[A64_LOGICAL_NUM_SRCS];
srcs[A64_LOGICAL_ADDRESS] = get_nir_src(instr->src[1]);
srcs[A64_LOGICAL_ENABLE_HELPERS] =
brw_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
if (nir_src_bit_size(instr->src[0]) == 32 &&
nir_intrinsic_align(instr) >= 4) {
assert(nir_src_num_components(instr->src[0]) <= 4);
srcs[A64_LOGICAL_SRC] = get_nir_src(instr->src[0]); /* Data */
srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components);
bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, fs_reg(),
srcs, A64_LOGICAL_NUM_SRCS);
} else {
assert(nir_src_num_components(instr->src[0]) == 1);
const unsigned bit_size = nir_src_bit_size(instr->src[0]);
brw_reg_type data_type =
brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type));
srcs[A64_LOGICAL_SRC] = tmp;
srcs[A64_LOGICAL_ARG] = brw_imm_ud(bit_size);
bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, fs_reg(),
srcs, A64_LOGICAL_NUM_SRCS);
}
break;
}
case nir_intrinsic_global_atomic:
case nir_intrinsic_global_atomic_swap:
nir_emit_global_atomic(bld, instr);
break;
case nir_intrinsic_load_global_const_block_intel: {
assert(instr->def.bit_size == 32);
assert(instr->num_components == 8 || instr->num_components == 16);
const fs_builder ubld = bld.exec_all().group(instr->num_components, 0);
fs_reg load_val;
bool is_pred_const = nir_src_is_const(instr->src[1]);
if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) {
/* In this case, we don't want the UBO load at all. We really
* shouldn't get here but it's possible.
*/
load_val = brw_imm_ud(0);
} else {
/* The uniform process may stomp the flag so do this first */
fs_reg addr = bld.emit_uniformize(get_nir_src(instr->src[0]));
load_val = ubld.vgrf(BRW_REGISTER_TYPE_UD);
/* If the predicate is constant and we got here, then it's non-zero
* and we don't need the predicate at all.
*/
if (!is_pred_const) {
/* Load the predicate */
fs_reg pred = bld.emit_uniformize(get_nir_src(instr->src[1]));
fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred);
mov->conditional_mod = BRW_CONDITIONAL_NZ;
/* Stomp the destination with 0 if we're OOB */
mov = ubld.MOV(load_val, brw_imm_ud(0));
mov->predicate = BRW_PREDICATE_NORMAL;
mov->predicate_inverse = true;
}
fs_reg srcs[A64_LOGICAL_NUM_SRCS];
srcs[A64_LOGICAL_ADDRESS] = addr;
srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */
srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components);
/* This intrinsic loads memory from a uniform address, sometimes
* shared across lanes. We never need to mask it.
*/
srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL,
load_val, srcs, A64_LOGICAL_NUM_SRCS);
if (!is_pred_const)
load->predicate = BRW_PREDICATE_NORMAL;
}
/* From the HW perspective, we just did a single SIMD16 instruction
* which loaded a dword in each SIMD channel. From NIR's perspective,
* this instruction returns a vec16. Any users of this data in the
* back-end will expect a vec16 per SIMD channel so we have to emit a
* pile of MOVs to resolve this discrepancy. Fortunately, copy-prop
* will generally clean them up for us.
*/
for (unsigned i = 0; i < instr->num_components; i++) {
bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),
component(load_val, i));
}
break;
}
intel/fs: handle load_global_constant_uniform_block_intel Again, load the data just once in GRF, share it across lanes. Shader-db on dg2: total instructions in shared programs: 23214555 -> 23215400 (<.01%) instructions in affected programs: 199977 -> 200822 (0.42%) helped: 3 HURT: 38 helped stats (abs) min: 5 max: 670 x̄: 283.67 x̃: 176 helped stats (rel) min: 1.34% max: 49.41% x̄: 22.15% x̃: 15.70% HURT stats (abs) min: 1 max: 185 x̄: 44.63 x̃: 32 HURT stats (rel) min: 0.13% max: 42.86% x̄: 10.25% x̃: 9.30% 95% mean confidence interval for instructions value: -18.65 59.87 95% mean confidence interval for instructions %-change: 3.29% 12.47% Inconclusive result (value mean confidence interval includes 0). total loops in shared programs: 5928 -> 5928 (0.00%) loops in affected programs: 0 -> 0 helped: 0 HURT: 0 total cycles in shared programs: 851137495 -> 851152449 (<.01%) cycles in affected programs: 16406137 -> 16421091 (0.09%) helped: 9 HURT: 32 helped stats (abs) min: 10 max: 13498 x̄: 6443.22 x̃: 5581 helped stats (rel) min: 0.11% max: 4.75% x̄: 1.45% x̃: 0.34% HURT stats (abs) min: 3 max: 15056 x̄: 2279.47 x̃: 735 HURT stats (rel) min: 0.10% max: 23.71% x̄: 4.58% x̃: 4.65% 95% mean confidence interval for cycles value: -1315.40 2044.87 95% mean confidence interval for cycles %-change: 1.71% 4.80% Inconclusive result (value mean confidence interval includes 0). total spills in shared programs: 11856 -> 11825 (-0.26%) spills in affected programs: 2368 -> 2337 (-1.31%) helped: 4 HURT: 0 total fills in shared programs: 16258 -> 16207 (-0.31%) fills in affected programs: 2930 -> 2879 (-1.74%) helped: 4 HURT: 0 total sends in shared programs: 1038194 -> 1038185 (<.01%) sends in affected programs: 40 -> 31 (-22.50%) helped: 4 HURT: 0 helped stats (abs) min: 1 max: 4 x̄: 2.25 x̃: 2 helped stats (rel) min: 10.00% max: 33.33% x̄: 21.46% x̃: 21.25% 95% mean confidence interval for sends value: -4.64 0.14 95% mean confidence interval for sends %-change: -40.41% -2.51% Inconclusive result (value mean confidence interval includes 0). LOST: 0 GAINED: 0 Some VK/DX titles result (on DG2 only), it's mostly additional instruction counts except for the unity spaceship demo where a CS shader gets additional SIMDness. The reason for additional instructions is that since we're doing block loads, we need to find the live channels in control flow to select a single lane value that is valid. aztec_ruins_high: Totals from 3 (1.12% of 269) affected shaders: Instrs: 17732 -> 17896 (+0.92%) Cycles: 796518 -> 819302 (+2.86%) cyberpunk_2077: Totals from 17 (0.17% of 10301) affected shaders: Instrs: 10848 -> 11658 (+7.47%) Cycles: 248243 -> 259168 (+4.40%); split: -0.57%, +4.97% fallout_4_dxvk_g2: Totals from 2 (0.12% of 1638) affected shaders: Instrs: 3157 -> 3368 (+6.68%) Cycles: 487807 -> 490426 (+0.54%); split: -0.26%, +0.79% Max live registers: 139 -> 141 (+1.44%) red_dead_redemption2: Totals from 68 (1.14% of 5970) affected shaders: Instrs: 34871 -> 36486 (+4.63%) Cycles: 551430 -> 565211 (+2.50%) Send messages: 2074 -> 2072 (-0.10%) Max live registers: 5078 -> 5077 (-0.02%) total_war_warhammer2: Totals from 5 (1.05% of 478) affected shaders: Instrs: 6905 -> 6971 (+0.96%); split: -0.16%, +1.12% Cycles: 97035 -> 97989 (+0.98%); split: -0.07%, +1.05% unity spaceship demo (instruction count going up due to a CS shader bump from SIMD8->16): Totals from 53 (9.71% of 546) affected shaders: Instrs: 223748 -> 233223 (+4.23%); split: -0.01%, +4.25% Cycles: 23134697 -> 25207080 (+8.96%); split: -0.17%, +9.13% Subgroup size: 480 -> 488 (+1.67%) Spill count: 2156 -> 2242 (+3.99%); split: -0.19%, +4.17% Fill count: 4617 -> 4845 (+4.94%); split: -0.09%, +5.02% Max live registers: 5991 -> 6050 (+0.98%); split: -0.40%, +1.39% Max dispatch width: 480 -> 488 (+1.67%) witcher_3_dxvk_g2: Totals from 27 (2.51% of 1074) affected shaders: Instrs: 57067 -> 57677 (+1.07%); split: -0.03%, +1.10% Cycles: 1397871 -> 1436704 (+2.78%); split: -0.35%, +3.13% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23477>
2023-06-08 21:53:02 +03:00
case nir_intrinsic_load_global_constant_uniform_block_intel: {
const unsigned total_dwords = ALIGN(instr->num_components, REG_SIZE / 4);
unsigned loaded_dwords = 0;
const fs_builder ubld1 = bld.exec_all().group(1, 0);
const fs_builder ubld8 = bld.exec_all().group(8, 0);
const fs_builder ubld16 = bld.exec_all().group(16, 0);
const fs_reg packed_consts =
ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords);
fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[0]));
while (loaded_dwords < total_dwords) {
const unsigned block =
choose_oword_block_size_dwords(devinfo,
total_dwords - loaded_dwords);
const unsigned block_bytes = block * 4;
const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
fs_reg srcs[A64_LOGICAL_NUM_SRCS];
srcs[A64_LOGICAL_ADDRESS] = address;
srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */
srcs[A64_LOGICAL_ARG] = brw_imm_ud(block);
srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD),
srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes;
increment_a64_address(ubld1, address, block_bytes);
loaded_dwords += block;
}
for (unsigned c = 0; c < instr->num_components; c++)
bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD),
component(packed_consts, c));
break;
}
case nir_intrinsic_load_ssbo: {
assert(devinfo->ver >= 7);
const unsigned bit_size = instr->def.bit_size;
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
srcs[get_nir_src_bindless(instr->src[0]) ?
SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
SURFACE_LOGICAL_SRC_SURFACE] =
get_nir_buffer_intrinsic_index(bld, instr);
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
/* Make dest unsigned because that's what the temporary will be */
dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
/* Read the vector */
assert(bit_size <= 32);
assert(nir_intrinsic_align(instr) > 0);
if (bit_size == 32 &&
nir_intrinsic_align(instr) >= 4) {
assert(instr->def.num_components <= 4);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
fs_inst *inst =
bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
inst->size_written = instr->num_components * dispatch_width * 4;
} else {
assert(instr->def.num_components == 1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
bld.MOV(dest, subscript(read_result, dest.type, 0));
}
break;
}
case nir_intrinsic_store_ssbo: {
assert(devinfo->ver >= 7);
const unsigned bit_size = nir_src_bit_size(instr->src[0]);
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
srcs[get_nir_src_bindless(instr->src[1]) ?
SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
SURFACE_LOGICAL_SRC_SURFACE] =
get_nir_buffer_intrinsic_index(bld, instr);
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[2]);
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
fs_reg data = get_nir_src(instr->src[0]);
data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
assert(bit_size <= 32);
assert(nir_intrinsic_write_mask(instr) ==
(1u << instr->num_components) - 1);
assert(nir_intrinsic_align(instr) > 0);
if (bit_size == 32 &&
nir_intrinsic_align(instr) >= 4) {
assert(nir_src_num_components(instr->src[0]) <= 4);
srcs[SURFACE_LOGICAL_SRC_DATA] = data;
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
} else {
assert(nir_src_num_components(instr->src[0]) == 1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
}
break;
}
case nir_intrinsic_load_ssbo_uniform_block_intel:
case nir_intrinsic_load_shared_uniform_block_intel: {
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
const bool is_ssbo =
instr->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel;
if (is_ssbo) {
srcs[get_nir_src_bindless(instr->src[0]) ?
SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
SURFACE_LOGICAL_SRC_SURFACE] =
get_nir_buffer_intrinsic_index(bld, instr);
} else {
srcs[SURFACE_LOGICAL_SRC_SURFACE] = fs_reg(brw_imm_ud(GFX7_BTI_SLM));
}
const unsigned total_dwords = ALIGN(instr->num_components,
REG_SIZE * reg_unit(devinfo) / 4);
unsigned loaded_dwords = 0;
const fs_builder ubld1 = bld.exec_all().group(1, 0);
const fs_builder ubld8 = bld.exec_all().group(8, 0);
const fs_builder ubld16 = bld.exec_all().group(16, 0);
const fs_reg packed_consts =
ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords);
const nir_src load_offset = is_ssbo ? instr->src[1] : instr->src[0];
if (nir_src_is_const(load_offset)) {
fs_reg addr = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
ubld8.MOV(addr, brw_imm_ud(nir_src_as_uint(load_offset)));
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0);
} else {
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
bld.emit_uniformize(get_nir_src(load_offset));
}
while (loaded_dwords < total_dwords) {
const unsigned block =
choose_oword_block_size_dwords(devinfo,
total_dwords - loaded_dwords);
const unsigned block_bytes = block * 4;
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block);
const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD),
srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written =
align(block_bytes, REG_SIZE * reg_unit(devinfo));
loaded_dwords += block;
ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
srcs[SURFACE_LOGICAL_SRC_ADDRESS],
brw_imm_ud(block_bytes));
}
for (unsigned c = 0; c < instr->num_components; c++)
bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD),
component(packed_consts, c));
break;
}
case nir_intrinsic_store_output: {
assert(nir_src_bit_size(instr->src[0]) == 32);
fs_reg src = get_nir_src(instr->src[0]);
unsigned store_offset = nir_src_as_uint(instr->src[1]);
unsigned num_components = instr->num_components;
unsigned first_component = nir_intrinsic_component(instr);
fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
4 * store_offset), src.type);
for (unsigned j = 0; j < num_components; j++) {
bld.MOV(offset(new_dest, bld, j + first_component),
offset(src, bld, j));
}
break;
}
case nir_intrinsic_ssbo_atomic:
case nir_intrinsic_ssbo_atomic_swap:
nir_emit_surface_atomic(bld, instr,
get_nir_buffer_intrinsic_index(bld, instr),
get_nir_src_bindless(instr->src[0]));
break;
case nir_intrinsic_get_ssbo_size: {
assert(nir_src_num_components(instr->src[0]) == 1);
/* A resinfo's sampler message is used to get the buffer size. The
* SIMD8's writeback message consists of four registers and SIMD16's
* writeback message consists of 8 destination registers (two per each
* component). Because we are only interested on the first channel of
* the first returned component, where resinfo returns the buffer size
* for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
* the dispatch width.
*/
const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
/* Set LOD = 0 */
ubld.MOV(src_payload, brw_imm_d(0));
fs_reg srcs[GET_BUFFER_SIZE_SRCS];
srcs[get_nir_src_bindless(instr->src[0]) ?
GET_BUFFER_SIZE_SRC_SURFACE_HANDLE :
GET_BUFFER_SIZE_SRC_SURFACE] =
get_nir_buffer_intrinsic_index(bld, instr);
srcs[GET_BUFFER_SIZE_SRC_LOD] = src_payload;
fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
srcs, GET_BUFFER_SIZE_SRCS);
inst->header_size = 0;
inst->mlen = reg_unit(devinfo);
inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
isl/i965/fs: SSBO/UBO buffers need size padding if not multiple of 32-bit The surfaces that backup the GPU buffers have a boundary check that considers that access to partial dwords are considered out-of-bounds. For example, buffers with 1,3 16-bit elements has size 2 or 6 and the last two bytes would always be read as 0 or its writting ignored. The introduction of 16-bit types implies that we need to align the size to 4-bytew multiples so that partial dwords could be read/written. Adding an inconditional +2 size to buffers not being multiple of 2 solves this issue for the general cases of UBO or SSBO. But, when unsized arrays of 16-bit elements are used it is not possible to know if the size was padded or not. To solve this issue the implementation calculates the needed size of the buffer surfaces, as suggested by Jason: surface_size = isl_align(buffer_size, 4) + (isl_align(buffer_size, 4) - buffer_size) So when we calculate backwards the buffer_size in the backend we update the resinfo return value with: buffer_size = (surface_size & ~3) - (surface_size & 3) It is also exposed this buffer requirements when robust buffer access is enabled so these buffer sizes recommend being multiple of 4. v2: (Jason Ekstrand) Move padding logic fron anv to isl_surface_state. Move calculus of original size from spirv to driver backend. v3: (Jason Ekstrand) Rename some variables and use a similar expresion when calculating. padding than when obtaining the original buffer size. Avoid use of unnecesary component call at brw_fs_nir. v4: (Jason Ekstrand) Complete comment with buffer size calculus explanation in brw_fs_nir. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2018-01-30 09:59:34 +01:00
/* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
*
* "Out-of-bounds checking is always performed at a DWord granularity. If
* any part of the DWord is out-of-bounds then the whole DWord is
* considered out-of-bounds."
*
* This implies that types with size smaller than 4-bytes need to be
* padded if they don't complete the last dword of the buffer. But as we
* need to maintain the original size we need to reverse the padding
* calculation to return the correct size to know the number of elements
* of an unsized array. As we stored in the last two bits of the surface
* size the needed padding for the buffer, we calculate here the
* original buffer_size reversing the surface_size calculation:
*
* surface_size = isl_align(buffer_size, 4) +
* (isl_align(buffer_size) - buffer_size)
*
* buffer_size = surface_size & ~3 - surface_size & 3
*/
fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);
fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);
ubld.AND(size_padding, ret_payload, brw_imm_ud(3));
ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3));
ubld.ADD(buffer_size, size_aligned4, negate(size_padding));
bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
break;
}
case nir_intrinsic_load_scratch: {
assert(devinfo->ver >= 7);
assert(instr->def.num_components == 1);
const unsigned bit_size = instr->def.bit_size;
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
if (devinfo->verx10 >= 125) {
const fs_builder ubld = bld.exec_all().group(1, 0);
fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0);
ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
brw_imm_ud(INTEL_MASK(31, 10)));
srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX125_NON_BINDLESS);
srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;
} else if (devinfo->ver >= 8) {
srcs[SURFACE_LOGICAL_SRC_SURFACE] =
brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
} else {
srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);
}
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
const fs_reg nir_addr = get_nir_src(instr->src[0]);
/* Make dest unsigned because that's what the temporary will be */
dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
/* Read the vector */
assert(instr->def.num_components == 1);
assert(bit_size <= 32);
assert(nir_intrinsic_align(instr) > 0);
if (bit_size == 32 &&
nir_intrinsic_align(instr) >= 4) {
if (devinfo->verx10 >= 125) {
assert(bit_size == 32 &&
nir_intrinsic_align(instr) >= 4);
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
swizzle_nir_scratch_addr(bld, nir_addr, false);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1);
bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
} else {
/* The offset for a DWORD scattered message is in dwords. */
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
swizzle_nir_scratch_addr(bld, nir_addr, true);
bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
}
} else {
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
swizzle_nir_scratch_addr(bld, nir_addr, false);
fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
bld.MOV(dest, read_result);
}
shader_stats.fill_count += DIV_ROUND_UP(dispatch_width, 16);
break;
}
case nir_intrinsic_store_scratch: {
assert(devinfo->ver >= 7);
assert(nir_src_num_components(instr->src[0]) == 1);
const unsigned bit_size = nir_src_bit_size(instr->src[0]);
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
if (devinfo->verx10 >= 125) {
const fs_builder ubld = bld.exec_all().group(1, 0);
fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0);
ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
brw_imm_ud(INTEL_MASK(31, 10)));
srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX125_NON_BINDLESS);
srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;
} else if (devinfo->ver >= 8) {
srcs[SURFACE_LOGICAL_SRC_SURFACE] =
brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
} else {
srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);
}
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
/**
* While this instruction has side-effects, it should not be predicated
* on sample mask, because otherwise fs helper invocations would
* load undefined values from scratch memory. And scratch memory
* load-stores are produced from operations without side-effects, thus
* they should not have different behaviour in the helper invocations.
*/
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
const fs_reg nir_addr = get_nir_src(instr->src[1]);
fs_reg data = get_nir_src(instr->src[0]);
data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
assert(nir_src_num_components(instr->src[0]) == 1);
assert(bit_size <= 32);
assert(nir_intrinsic_write_mask(instr) == 1);
assert(nir_intrinsic_align(instr) > 0);
if (bit_size == 32 &&
nir_intrinsic_align(instr) >= 4) {
if (devinfo->verx10 >= 125) {
srcs[SURFACE_LOGICAL_SRC_DATA] = data;
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
swizzle_nir_scratch_addr(bld, nir_addr, false);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1);
bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
} else {
srcs[SURFACE_LOGICAL_SRC_DATA] = data;
/* The offset for a DWORD scattered message is in dwords. */
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
swizzle_nir_scratch_addr(bld, nir_addr, true);
bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
}
} else {
srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
swizzle_nir_scratch_addr(bld, nir_addr, false);
bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
}
shader_stats.spill_count += DIV_ROUND_UP(dispatch_width, 16);
break;
}
case nir_intrinsic_load_subgroup_size:
/* This should only happen for fragment shaders because every other case
* is lowered in NIR so we can optimize on it.
*/
assert(stage == MESA_SHADER_FRAGMENT);
bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(dispatch_width));
break;
case nir_intrinsic_load_subgroup_invocation:
bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
break;
case nir_intrinsic_load_subgroup_eq_mask:
case nir_intrinsic_load_subgroup_ge_mask:
case nir_intrinsic_load_subgroup_gt_mask:
case nir_intrinsic_load_subgroup_le_mask:
case nir_intrinsic_load_subgroup_lt_mask:
unreachable("not reached");
case nir_intrinsic_vote_any: {
const fs_builder ubld = bld.exec_all().group(1, 0);
/* The any/all predicates do not consider channel enables. To prevent
* dead channels from affecting the result, we initialize the flag with
* with the identity value for the logical operation.
*/
if (dispatch_width == 32) {
/* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
brw_imm_ud(0));
} else {
ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
}
bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
/* For some reason, the any/all predicates don't work properly with
* SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
* doesn't read the correct subset of the flag register and you end up
* getting garbage in the second half. Work around this by using a pair
* of 1-wide MOVs and scattering the result.
*/
fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
ubld.MOV(res1, brw_imm_d(0));
set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ANY8H :
dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
BRW_PREDICATE_ALIGN1_ANY32H,
ubld.MOV(res1, brw_imm_d(-1)));
bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
break;
}
case nir_intrinsic_vote_all: {
const fs_builder ubld = bld.exec_all().group(1, 0);
/* The any/all predicates do not consider channel enables. To prevent
* dead channels from affecting the result, we initialize the flag with
* with the identity value for the logical operation.
*/
if (dispatch_width == 32) {
/* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
brw_imm_ud(0xffffffff));
} else {
ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
}
bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
/* For some reason, the any/all predicates don't work properly with
* SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
* doesn't read the correct subset of the flag register and you end up
* getting garbage in the second half. Work around this by using a pair
* of 1-wide MOVs and scattering the result.
*/
fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
ubld.MOV(res1, brw_imm_d(0));
set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H :
dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
BRW_PREDICATE_ALIGN1_ALL32H,
ubld.MOV(res1, brw_imm_d(-1)));
bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
break;
}
case nir_intrinsic_vote_feq:
case nir_intrinsic_vote_ieq: {
fs_reg value = get_nir_src(instr->src[0]);
if (instr->intrinsic == nir_intrinsic_vote_feq) {
const unsigned bit_size = nir_src_bit_size(instr->src[0]);
value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B :
brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
}
fs_reg uniformized = bld.emit_uniformize(value);
const fs_builder ubld = bld.exec_all().group(1, 0);
/* The any/all predicates do not consider channel enables. To prevent
* dead channels from affecting the result, we initialize the flag with
* with the identity value for the logical operation.
*/
if (dispatch_width == 32) {
/* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
brw_imm_ud(0xffffffff));
} else {
ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
}
bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
/* For some reason, the any/all predicates don't work properly with
* SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
* doesn't read the correct subset of the flag register and you end up
* getting garbage in the second half. Work around this by using a pair
* of 1-wide MOVs and scattering the result.
*/
fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
ubld.MOV(res1, brw_imm_d(0));
set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H :
dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
BRW_PREDICATE_ALIGN1_ALL32H,
ubld.MOV(res1, brw_imm_d(-1)));
bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
break;
}
case nir_intrinsic_ballot: {
const fs_reg value = retype(get_nir_src(instr->src[0]),
BRW_REGISTER_TYPE_UD);
struct brw_reg flag = brw_flag_reg(0, 0);
/* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
* as f0.0. This is a problem for fragment programs as we currently use
* f0.1 for discards. Fortunately, we don't support SIMD32 fragment
* programs yet so this isn't a problem. When we do, something will
* have to change.
*/
if (dispatch_width == 32)
flag.type = BRW_REGISTER_TYPE_UD;
bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u));
bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
if (instr->def.bit_size > 32) {
dest.type = BRW_REGISTER_TYPE_UQ;
} else {
dest.type = BRW_REGISTER_TYPE_UD;
}
bld.MOV(dest, flag);
break;
}
case nir_intrinsic_read_invocation: {
const fs_reg value = get_nir_src(instr->src[0]);
const fs_reg invocation = get_nir_src(instr->src[1]);
fs_reg tmp = bld.vgrf(value.type);
/* When for some reason the subgroup_size picked by NIR is larger than
* the dispatch size picked by the backend (this could happen in RT,
* FS), bound the invocation to the dispatch size.
*/
fs_reg bound_invocation;
if (api_subgroup_size == 0 ||
bld.dispatch_width() < api_subgroup_size) {
bound_invocation = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.AND(bound_invocation, invocation, brw_imm_ud(dispatch_width - 1));
} else {
bound_invocation = invocation;
}
bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
bld.emit_uniformize(bound_invocation));
bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0)));
break;
}
case nir_intrinsic_read_first_invocation: {
const fs_reg value = get_nir_src(instr->src[0]);
bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
break;
}
case nir_intrinsic_shuffle: {
const fs_reg value = get_nir_src(instr->src[0]);
const fs_reg index = get_nir_src(instr->src[1]);
bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
break;
}
case nir_intrinsic_first_invocation: {
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
fs_reg(component(tmp, 0)));
break;
}
case nir_intrinsic_last_invocation: {
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.exec_all().emit(SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL, tmp);
bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
fs_reg(component(tmp, 0)));
break;
}
case nir_intrinsic_quad_broadcast: {
const fs_reg value = get_nir_src(instr->src[0]);
const unsigned index = nir_src_as_uint(instr->src[1]);
bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
value, brw_imm_ud(index), brw_imm_ud(4));
break;
}
case nir_intrinsic_quad_swap_horizontal: {
const fs_reg value = get_nir_src(instr->src[0]);
const fs_reg tmp = bld.vgrf(value.type);
if (devinfo->ver <= 7) {
/* The hardware doesn't seem to support these crazy regions with
* compressed instructions on gfx7 and earlier so we fall back to
* using quad swizzles. Fortunately, we don't support 64-bit
* anything in Vulkan on gfx7.
*/
assert(nir_src_bit_size(instr->src[0]) == 32);
const fs_builder ubld = bld.exec_all();
ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
brw_imm_ud(BRW_SWIZZLE4(1,0,3,2)));
bld.MOV(retype(dest, value.type), tmp);
} else {
const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);
const fs_reg src_left = horiz_stride(value, 2);
const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
const fs_reg tmp_left = horiz_stride(tmp, 2);
const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
ubld.MOV(tmp_left, src_right);
ubld.MOV(tmp_right, src_left);
}
bld.MOV(retype(dest, value.type), tmp);
break;
}
case nir_intrinsic_quad_swap_vertical: {
const fs_reg value = get_nir_src(instr->src[0]);
if (nir_src_bit_size(instr->src[0]) == 32) {
/* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
const fs_reg tmp = bld.vgrf(value.type);
const fs_builder ubld = bld.exec_all();
ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
brw_imm_ud(BRW_SWIZZLE4(2,3,0,1)));
bld.MOV(retype(dest, value.type), tmp);
} else {
/* For larger data types, we have to either emit dispatch_width many
* MOVs or else fall back to doing indirects.
*/
fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
brw_imm_w(0x2));
bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
}
break;
}
case nir_intrinsic_quad_swap_diagonal: {
const fs_reg value = get_nir_src(instr->src[0]);
if (nir_src_bit_size(instr->src[0]) == 32) {
/* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
const fs_reg tmp = bld.vgrf(value.type);
const fs_builder ubld = bld.exec_all();
ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
brw_imm_ud(BRW_SWIZZLE4(3,2,1,0)));
bld.MOV(retype(dest, value.type), tmp);
} else {
/* For larger data types, we have to either emit dispatch_width many
* MOVs or else fall back to doing indirects.
*/
fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
brw_imm_w(0x3));
bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
}
break;
}
case nir_intrinsic_reduce: {
fs_reg src = get_nir_src(instr->src[0]);
nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
unsigned cluster_size = nir_intrinsic_cluster_size(instr);
if (cluster_size == 0 || cluster_size > dispatch_width)
cluster_size = dispatch_width;
/* Figure out the source type */
src.type = brw_type_for_nir_type(devinfo,
(nir_alu_type)(nir_op_infos[redop].input_types[0] |
nir_src_bit_size(instr->src[0])));
fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
opcode brw_op = brw_op_for_nir_reduction_op(redop);
brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
/* Set up a register for all of our scratching around and initialize it
* to reduction operation's identity value.
*/
fs_reg scan = bld.vgrf(src.type);
bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
dest.type = src.type;
if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
/* In this case, CLUSTER_BROADCAST instruction isn't needed because
* the distance between clusters is at least 2 GRFs. In this case,
* we don't need the weird striding of the CLUSTER_BROADCAST
* instruction and can just do regular MOVs.
*/
assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
const unsigned groups =
(dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
const unsigned group_size = dispatch_width / groups;
for (unsigned i = 0; i < groups; i++) {
const unsigned cluster = (i * group_size) / cluster_size;
const unsigned comp = cluster * cluster_size + (cluster_size - 1);
bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
component(scan, comp));
}
} else {
bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
}
break;
}
case nir_intrinsic_inclusive_scan:
case nir_intrinsic_exclusive_scan: {
fs_reg src = get_nir_src(instr->src[0]);
nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
/* Figure out the source type */
src.type = brw_type_for_nir_type(devinfo,
(nir_alu_type)(nir_op_infos[redop].input_types[0] |
nir_src_bit_size(instr->src[0])));
fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
opcode brw_op = brw_op_for_nir_reduction_op(redop);
brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
/* Set up a register for all of our scratching around and initialize it
* to reduction operation's identity value.
*/
fs_reg scan = bld.vgrf(src.type);
const fs_builder allbld = bld.exec_all();
allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
/* Exclusive scan is a bit harder because we have to do an annoying
* shift of the contents before we can begin. To make things worse,
* we can't do this with a normal stride; we have to use indirects.
*/
fs_reg shifted = bld.vgrf(src.type);
fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
brw_imm_w(-1));
allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
allbld.group(1, 0).MOV(component(shifted, 0), identity);
scan = shifted;
}
bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);
bld.MOV(retype(dest, src.type), scan);
break;
}
case nir_intrinsic_load_global_block_intel: {
assert(instr->def.bit_size == 32);
fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[0]));
const fs_builder ubld1 = bld.exec_all().group(1, 0);
const fs_builder ubld8 = bld.exec_all().group(8, 0);
const fs_builder ubld16 = bld.exec_all().group(16, 0);
const unsigned total = instr->num_components * dispatch_width;
unsigned loaded = 0;
while (loaded < total) {
const unsigned block =
choose_oword_block_size_dwords(devinfo, total - loaded);
const unsigned block_bytes = block * 4;
const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
fs_reg srcs[A64_LOGICAL_NUM_SRCS];
srcs[A64_LOGICAL_ADDRESS] = address;
srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */
srcs[A64_LOGICAL_ARG] = brw_imm_ud(block);
srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(1);
ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD),
srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes;
increment_a64_address(ubld1, address, block_bytes);
loaded += block;
}
assert(loaded == total);
break;
}
case nir_intrinsic_store_global_block_intel: {
assert(nir_src_bit_size(instr->src[0]) == 32);
fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[1]));
fs_reg src = get_nir_src(instr->src[0]);
const fs_builder ubld1 = bld.exec_all().group(1, 0);
const fs_builder ubld8 = bld.exec_all().group(8, 0);
const fs_builder ubld16 = bld.exec_all().group(16, 0);
const unsigned total = instr->num_components * dispatch_width;
unsigned written = 0;
while (written < total) {
const unsigned block =
choose_oword_block_size_dwords(devinfo, total - written);
fs_reg srcs[A64_LOGICAL_NUM_SRCS];
srcs[A64_LOGICAL_ADDRESS] = address;
srcs[A64_LOGICAL_SRC] = retype(byte_offset(src, written * 4),
BRW_REGISTER_TYPE_UD);
srcs[A64_LOGICAL_ARG] = brw_imm_ud(block);
srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL, fs_reg(),
srcs, A64_LOGICAL_NUM_SRCS);
const unsigned block_bytes = block * 4;
increment_a64_address(ubld1, address, block_bytes);
written += block;
}
assert(written == total);
break;
}
case nir_intrinsic_load_shared_block_intel:
case nir_intrinsic_load_ssbo_block_intel: {
assert(instr->def.bit_size == 32);
const bool is_ssbo =
instr->intrinsic == nir_intrinsic_load_ssbo_block_intel;
fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[is_ssbo ? 1 : 0]));
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
get_nir_buffer_intrinsic_index(bld, instr) :
fs_reg(brw_imm_ud(GFX7_BTI_SLM));
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
const fs_builder ubld1 = bld.exec_all().group(1, 0);
const fs_builder ubld8 = bld.exec_all().group(8, 0);
const fs_builder ubld16 = bld.exec_all().group(16, 0);
const unsigned total = instr->num_components * dispatch_width;
unsigned loaded = 0;
while (loaded < total) {
const unsigned block =
choose_oword_block_size_dwords(devinfo, total - loaded);
const unsigned block_bytes = block * 4;
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block);
const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD),
srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = block_bytes;
ubld1.ADD(address, address, brw_imm_ud(block_bytes));
loaded += block;
}
assert(loaded == total);
break;
}
case nir_intrinsic_store_shared_block_intel:
case nir_intrinsic_store_ssbo_block_intel: {
assert(nir_src_bit_size(instr->src[0]) == 32);
const bool is_ssbo =
instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[is_ssbo ? 2 : 1]));
fs_reg src = get_nir_src(instr->src[0]);
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
get_nir_buffer_intrinsic_index(bld, instr) :
fs_reg(brw_imm_ud(GFX7_BTI_SLM));
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
const fs_builder ubld1 = bld.exec_all().group(1, 0);
const fs_builder ubld8 = bld.exec_all().group(8, 0);
const fs_builder ubld16 = bld.exec_all().group(16, 0);
const unsigned total = instr->num_components * dispatch_width;
unsigned written = 0;
while (written < total) {
const unsigned block =
choose_oword_block_size_dwords(devinfo, total - written);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block);
srcs[SURFACE_LOGICAL_SRC_DATA] =
retype(byte_offset(src, written * 4), BRW_REGISTER_TYPE_UD);
const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
ubld.emit(SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL,
fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
const unsigned block_bytes = block * 4;
ubld1.ADD(address, address, brw_imm_ud(block_bytes));
written += block;
}
assert(written == total);
break;
}
case nir_intrinsic_load_topology_id_intel: {
/* These move around basically every hardware generation, so don'
* do any >= checks and fail if the platform hasn't explicitly
* been enabled here.
*/
assert(devinfo->ver == 12);
/* Here is what the layout of SR0 looks like on Gfx12 :
* [13:11] : Slice ID.
* [10:9] : Dual-SubSlice ID
* [8] : SubSlice ID
* [7] : EUID[2] (aka EU Row ID)
* [6] : Reserved
* [5:4] : EUID[1:0]
* [2:0] : Thread ID
*/
fs_reg raw_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.emit(SHADER_OPCODE_READ_SR_REG, raw_id, brw_imm_ud(0));
switch (nir_intrinsic_base(instr)) {
case BRW_TOPOLOGY_ID_DSS:
bld.AND(raw_id, raw_id, brw_imm_ud(0x3fff));
/* Get rid of anything below dualsubslice */
bld.SHR(retype(dest, BRW_REGISTER_TYPE_UD), raw_id, brw_imm_ud(9));
break;
case BRW_TOPOLOGY_ID_EU_THREAD_SIMD: {
limit_dispatch_width(16, "Topology helper for Ray queries, "
"not supported in SIMD32 mode.");
fs_reg dst = retype(dest, BRW_REGISTER_TYPE_UD);
/* EU[3:0] << 7
*
* The 4bit EU[3:0] we need to build for ray query memory addresses
* computations is a bit odd :
*
* EU[1:0] = raw_id[5:4] (identified as EUID[1:0])
* EU[2] = raw_id[8] (identified as SubSlice ID)
* EU[3] = raw_id[7] (identified as EUID[2] or Row ID)
*/
{
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(7, 7)));
bld.SHL(dst, tmp, brw_imm_ud(3));
bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(8, 8)));
bld.SHL(tmp, tmp, brw_imm_ud(1));
bld.OR(dst, dst, tmp);
bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(5, 4)));
bld.SHL(tmp, tmp, brw_imm_ud(3));
bld.OR(dst, dst, tmp);
}
/* ThreadID[2:0] << 4 (ThreadID comes from raw_id[2:0]) */
{
bld.AND(raw_id, raw_id, brw_imm_ud(INTEL_MASK(2, 0)));
bld.SHL(raw_id, raw_id, brw_imm_ud(4));
bld.OR(dst, dst, raw_id);
}
/* LaneID[0:3] << 0 (Use nir SYSTEM_VALUE_SUBGROUP_INVOCATION) */
assert(bld.dispatch_width() <= 16); /* Limit to 4 bits */
bld.ADD(dst, dst,
nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
break;
}
default:
unreachable("Invalid topology id type");
}
break;
}
case nir_intrinsic_load_btd_stack_id_intel:
if (stage == MESA_SHADER_COMPUTE) {
assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids);
} else {
assert(brw_shader_stage_is_bindless(stage));
}
/* Stack IDs are always in R1 regardless of whether we're coming from a
* bindless shader or a regular compute shader.
*/
bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
retype(brw_vec8_grf(1 * reg_unit(devinfo), 0), BRW_REGISTER_TYPE_UW));
break;
case nir_intrinsic_btd_spawn_intel:
if (stage == MESA_SHADER_COMPUTE) {
assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids);
} else {
assert(brw_shader_stage_is_bindless(stage));
}
/* Make sure all the pointers to resume shaders have landed where other
* threads can see them.
*/
emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
bld.emit(SHADER_OPCODE_BTD_SPAWN_LOGICAL, bld.null_reg_ud(),
bld.emit_uniformize(get_nir_src(instr->src[0])),
get_nir_src(instr->src[1]));
break;
case nir_intrinsic_btd_retire_intel:
if (stage == MESA_SHADER_COMPUTE) {
assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids);
} else {
assert(brw_shader_stage_is_bindless(stage));
}
/* Make sure all the pointers to resume shaders have landed where other
* threads can see them.
*/
emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL);
break;
case nir_intrinsic_trace_ray_intel: {
const bool synchronous = nir_intrinsic_synchronous(instr);
assert(brw_shader_stage_is_bindless(stage) || synchronous);
/* Make sure all the previous RT structure writes are visible to the RT
* fixed function within the DSS, as well as stack pointers to resume
* shaders.
*/
emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
fs_reg srcs[RT_LOGICAL_NUM_SRCS];
fs_reg globals = get_nir_src(instr->src[0]);
srcs[RT_LOGICAL_SRC_GLOBALS] = bld.emit_uniformize(globals);
srcs[RT_LOGICAL_SRC_BVH_LEVEL] = get_nir_src(instr->src[1]);
srcs[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] = get_nir_src(instr->src[2]);
srcs[RT_LOGICAL_SRC_SYNCHRONOUS] = brw_imm_ud(synchronous);
bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL, bld.null_reg_ud(),
srcs, RT_LOGICAL_NUM_SRCS);
/* There is no actual value to use in the destination register of the
* synchronous trace instruction. All of the communication with the HW
* unit happens through memory reads/writes. So to ensure that the
* operation has completed before we go read the results in memory, we
* need a barrier followed by an invalidate before accessing memory.
*/
if (synchronous) {
bld.emit(BRW_OPCODE_SYNC, bld.null_reg_ud(), brw_imm_ud(TGL_SYNC_ALLWR));
emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_INVALIDATE);
}
break;
}
default:
#ifndef NDEBUG
assert(instr->intrinsic < nir_num_intrinsics);
fprintf(stderr, "intrinsic: %s\n", nir_intrinsic_infos[instr->intrinsic].name);
#endif
unreachable("unknown intrinsic");
}
}
static fs_reg
expand_to_32bit(const fs_builder &bld, const fs_reg &src)
{
if (type_sz(src.type) == 2) {
fs_reg src32 = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.MOV(src32, retype(src, BRW_REGISTER_TYPE_UW));
return src32;
} else {
return src;
}
}
void
fs_visitor::nir_emit_surface_atomic(const fs_builder &bld,
nir_intrinsic_instr *instr,
fs_reg surface,
bool bindless)
{
enum lsc_opcode op = lsc_aop_for_nir_intrinsic(instr);
int num_data = lsc_op_num_data_values(op);
bool shared = surface.file == IMM && surface.ud == GFX7_BTI_SLM;
/* The BTI untyped atomic messages only support 32-bit atomics. If you
* just look at the big table of messages in the Vol 7 of the SKL PRM, they
* appear to exist. However, if you look at Vol 2a, there are no message
* descriptors provided for Qword atomic ops except for A64 messages.
*
* 16-bit float atomics are supported, however.
*/
assert(instr->def.bit_size == 32 ||
(instr->def.bit_size == 64 && devinfo->has_lsc) ||
(instr->def.bit_size == 16 &&
(devinfo->has_lsc || lsc_opcode_is_atomic_float(op))));
fs_reg dest = get_nir_def(instr->def);
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
srcs[bindless ?
SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
SURFACE_LOGICAL_SRC_SURFACE] = surface;
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
if (shared) {
/* SLM - Get the offset */
if (nir_src_is_const(instr->src[0])) {
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
brw_imm_ud(nir_intrinsic_base(instr) +
nir_src_as_uint(instr->src[0]));
} else {
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
brw_imm_ud(nir_intrinsic_base(instr)));
}
} else {
/* SSBOs */
srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
}
fs_reg data;
if (num_data >= 1)
data = expand_to_32bit(bld, get_nir_src(instr->src[shared ? 1 : 2]));
if (num_data >= 2) {
fs_reg tmp = bld.vgrf(data.type, 2);
fs_reg sources[2] = {
data,
expand_to_32bit(bld, get_nir_src(instr->src[shared ? 2 : 3]))
};
bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
data = tmp;
}
srcs[SURFACE_LOGICAL_SRC_DATA] = data;
/* Emit the actual atomic operation */
switch (instr->def.bit_size) {
case 16: {
fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
retype(dest32, dest.type),
srcs, SURFACE_LOGICAL_NUM_SRCS);
bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW),
retype(dest32, BRW_REGISTER_TYPE_UD));
break;
}
case 32:
case 64:
bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
break;
default:
unreachable("Unsupported bit size");
}
}
void
fs_visitor::nir_emit_global_atomic(const fs_builder &bld,
nir_intrinsic_instr *instr)
{
int op = lsc_aop_for_nir_intrinsic(instr);
fs_reg dest = get_nir_def(instr->def);
fs_reg addr = get_nir_src(instr->src[0]);
fs_reg data;
if (op != LSC_OP_ATOMIC_INC && op != LSC_OP_ATOMIC_DEC)
data = expand_to_32bit(bld, get_nir_src(instr->src[1]));
if (op == LSC_OP_ATOMIC_CMPXCHG) {
fs_reg tmp = bld.vgrf(data.type, 2);
fs_reg sources[2] = {
data,
expand_to_32bit(bld, get_nir_src(instr->src[2]))
};
bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
data = tmp;
}
fs_reg srcs[A64_LOGICAL_NUM_SRCS];
srcs[A64_LOGICAL_ADDRESS] = addr;
srcs[A64_LOGICAL_SRC] = data;
srcs[A64_LOGICAL_ARG] = brw_imm_ud(op);
srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
switch (instr->def.bit_size) {
case 16: {
fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD);
bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
retype(dest32, dest.type),
srcs, A64_LOGICAL_NUM_SRCS);
bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), dest32);
break;
}
case 32:
case 64:
bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest,
srcs, A64_LOGICAL_NUM_SRCS);
break;
default:
unreachable("Unsupported bit size");
}
}
void
fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
{
fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
/* SKL PRMs: Volume 7: 3D-Media-GPGPU:
*
* "The Pixel Null Mask field, when enabled via the Pixel Null Mask
* Enable will be incorect for sample_c when applied to a surface with
* 64-bit per texel format such as R16G16BA16_UNORM. Pixel Null mask
* Enable may incorrectly report pixels as referencing a Null surface."
*
* We'll take care of this in NIR.
*/
assert(!instr->is_sparse || srcs[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE);
srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(instr->is_sparse);
int lod_components = 0;
/* The hardware requires a LOD for buffer textures */
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
uint32_t header_bits = 0;
for (unsigned i = 0; i < instr->num_srcs; i++) {
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
nir_src nir_src = instr->src[i].src;
fs_reg src = get_nir_src(nir_src);
switch (instr->src[i].src_type) {
case nir_tex_src_bias:
srcs[TEX_LOGICAL_SRC_LOD] =
retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
break;
case nir_tex_src_comparator:
srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
break;
case nir_tex_src_coord:
switch (instr->op) {
case nir_texop_txf:
case nir_texop_txf_ms:
case nir_texop_txf_ms_mcs_intel:
case nir_texop_samples_identical:
srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
break;
default:
srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
break;
}
break;
case nir_tex_src_ddx:
srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
lod_components = nir_tex_instr_src_size(instr, i);
break;
case nir_tex_src_ddy:
srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
break;
case nir_tex_src_lod:
switch (instr->op) {
case nir_texop_txs:
srcs[TEX_LOGICAL_SRC_LOD] =
retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
break;
case nir_texop_txf:
srcs[TEX_LOGICAL_SRC_LOD] =
retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
break;
default:
srcs[TEX_LOGICAL_SRC_LOD] =
retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
break;
}
break;
case nir_tex_src_min_lod:
srcs[TEX_LOGICAL_SRC_MIN_LOD] =
retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
break;
case nir_tex_src_ms_index:
srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
break;
case nir_tex_src_offset: {
uint32_t offset_bits = 0;
if (brw_texture_offset(instr, i, &offset_bits)) {
header_bits |= offset_bits;
} else {
/* On gfx12.5+, if the offsets are not both constant and in the
* {-8,7} range, nir_lower_tex() will have already lowered the
* source offset. So we should never reach this point.
*/
assert(devinfo->verx10 < 125);
srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
retype(src, BRW_REGISTER_TYPE_D);
}
break;
}
case nir_tex_src_projector:
unreachable("should be lowered");
case nir_tex_src_texture_offset: {
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
assert(srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE);
/* Emit code to evaluate the actual indexing expression */
if (instr->texture_index == 0 && is_resource_src(nir_src))
srcs[TEX_LOGICAL_SRC_SURFACE] = get_resource_nir_src(nir_src);
if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE) {
fs_reg tmp = vgrf(glsl_type::uint_type);
bld.ADD(tmp, src, brw_imm_ud(instr->texture_index));
srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
}
assert(srcs[TEX_LOGICAL_SRC_SURFACE].file != BAD_FILE);
break;
}
case nir_tex_src_sampler_offset: {
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
/* Emit code to evaluate the actual indexing expression */
if (instr->sampler_index == 0 && is_resource_src(nir_src))
srcs[TEX_LOGICAL_SRC_SAMPLER] = get_resource_nir_src(nir_src);
if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE) {
fs_reg tmp = vgrf(glsl_type::uint_type);
bld.ADD(tmp, src, brw_imm_ud(instr->sampler_index));
srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
}
break;
}
case nir_tex_src_texture_handle:
assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg();
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
if (is_resource_src(nir_src))
srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = get_resource_nir_src(nir_src);
if (srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE)
srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
break;
case nir_tex_src_sampler_handle:
assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg();
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
if (is_resource_src(nir_src))
srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = get_resource_nir_src(nir_src);
if (srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE)
srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
break;
case nir_tex_src_ms_mcs_intel:
assert(instr->op == nir_texop_txf_ms);
srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
break;
default:
unreachable("unknown texture source");
}
}
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
/* If the surface or sampler were not specified through sources, use the
* instruction index.
*/
if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE &&
srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE)
srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(instr->texture_index);
if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE &&
srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE)
srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(instr->sampler_index);
if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
(instr->op == nir_texop_txf_ms ||
instr->op == nir_texop_samples_identical)) {
if (devinfo->ver >= 7) {
srcs[TEX_LOGICAL_SRC_MCS] =
emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
instr->coord_components,
srcs[TEX_LOGICAL_SRC_SURFACE],
srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
} else {
srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
}
}
srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
enum opcode opcode;
switch (instr->op) {
case nir_texop_tex:
opcode = SHADER_OPCODE_TEX_LOGICAL;
break;
case nir_texop_txb:
opcode = FS_OPCODE_TXB_LOGICAL;
break;
case nir_texop_txl:
opcode = SHADER_OPCODE_TXL_LOGICAL;
break;
case nir_texop_txd:
opcode = SHADER_OPCODE_TXD_LOGICAL;
break;
case nir_texop_txf:
opcode = SHADER_OPCODE_TXF_LOGICAL;
break;
case nir_texop_txf_ms:
/* On Gfx12HP there is only CMS_W available. From the Bspec: Shared
* Functions - 3D Sampler - Messages - Message Format:
*
* ld2dms REMOVEDBY(GEN:HAS:1406788836)
*/
if (devinfo->verx10 >= 125)
opcode = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
else if (devinfo->ver >= 9)
opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
else
opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
break;
case nir_texop_txf_ms_mcs_intel:
opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
break;
case nir_texop_query_levels:
case nir_texop_txs:
opcode = SHADER_OPCODE_TXS_LOGICAL;
break;
case nir_texop_lod:
opcode = SHADER_OPCODE_LOD_LOGICAL;
break;
case nir_texop_tg4:
if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
else
opcode = SHADER_OPCODE_TG4_LOGICAL;
break;
case nir_texop_texture_samples:
opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
break;
case nir_texop_samples_identical: {
fs_reg dst = retype(get_nir_def(instr->def), BRW_REGISTER_TYPE_D);
/* If mcs is an immediate value, it means there is no MCS. In that case
* just return false.
*/
if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
bld.MOV(dst, brw_imm_ud(0u));
} else if (devinfo->ver >= 9) {
fs_reg tmp = vgrf(glsl_type::uint_type);
bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
} else {
bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
BRW_CONDITIONAL_EQ);
}
return;
}
default:
unreachable("unknown texture opcode");
}
if (instr->op == nir_texop_tg4) {
if (instr->component == 1 &&
intel/fs: try to rematerialize surface computation code This helps a lot with accessing surface handles in control flow. Our resource_intel intrinsic has a non_uniform flag, in which case we cannot apply this optimization. But in uniform cases, this is just a massive win. We drop all kind of pipeline stalls due to find_live_channel. We also reduce register pressure by doing the surface handle computation in a single GRF (instead of 2 or 4). There are some regressions in max dispatch width but those I think are only on SIMD32 and due to the current heuristic disabling it after throughput comparison with SIMD16. We know this heuristic is not perfect, it should probably be updated in another change. Here are some stats (all titles seem to have similar gains) : PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width red_dead_redemption2 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% --------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 4716 -37.29% -5.67% +0.95% +0.07% -81.26% -79.16% -70.62% -9.15% -8.47% --------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 5860 -36.80% -5.67% +0.77% +0.06% -81.26% -79.16% -70.62% -8.63% -6.93% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width rise_of_the_tomb_raider_g2 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- All affected 11732 -37.27% -22.14% +0.01% +0.00% -99.01% -99.14% -98.65% -7.67% -5.11% --------------------------------------------------------------------------------------------------------------------------------------------------------------------- Total 12010 -37.19% -22.12% +0.01% +0.00% -99.01% -99.14% -98.65% -7.62% -4.96% PERCENTAGE DELTAS Shaders Instrs Cycles Spill count Fill count Scratch Memory Size Max live registers Max dispatch width total_war_warhammer2 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% ----------------------------------------------------------------------------------------------------------------------------------- All affected 335 -28.31% -12.77% -82.35% -88.46% -66.67% -6.25% -7.24% ----------------------------------------------------------------------------------------------------------------------------------- Total 462 -27.45% -12.42% -82.35% -88.46% -66.67% -5.52% -5.62% PERCENTAGE DELTAS Shaders Instrs Cycles Subgroup size Send messages Spill count Fill count Scratch Memory Size Max live registers Max dispatch width witcher_3_dxvk_g2 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% ------------------------------------------------------------------------------------------------------------------------------------------------------------ All affected 693 -41.93% -58.45% +0.09% +0.01% -98.52% -97.29% -98.10% -10.25% -1.33% ------------------------------------------------------------------------------------------------------------------------------------------------------------ Total 1049 -36.94% -57.82% +0.06% +0.01% -98.52% -97.29% -98.10% -7.81% -1.00% Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21645>
2023-02-09 15:07:36 +02:00
key_tex->gather_channel_quirk_mask & (1 << instr->texture_index)) {
/* gather4 sampler is broken for green channel on RG32F --
* we must ask for blue instead.
*/
header_bits |= 2 << 16;
} else {
header_bits |= instr->component << 16;
}
}
fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4 + instr->is_sparse);
fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
inst->offset = header_bits;
i965/fs: Reduce the response length of sampler messages on Skylake. Often, we don't need a full 4 channels worth of data from the sampler. For example, depth comparisons and red textures only return one value. To handle this, the sampler message header contains a mask which can be used to disable channels, and reduce the message length (in SIMD16 mode on all hardware, and SIMD8 mode on Broadwell and later). We've never used it before, since it required setting up a message header. This meant trading a smaller response length for a larger message length and additional MOVs to set it up. However, Skylake introduces a terrific new feature: for headerless messages, you can simply reduce the response length, and it makes the implicit header contain an appropriate mask. So to read only RG, you would simply set the message length to 2 or 4 (SIMD8/16). This means we can finally take advantage of this at no cost. total instructions in shared programs: 9091831 -> 9073067 (-0.21%) instructions in affected programs: 191370 -> 172606 (-9.81%) helped: 2609 HURT: 0 total cycles in shared programs: 70868114 -> 68454752 (-3.41%) cycles in affected programs: 35841154 -> 33427792 (-6.73%) helped: 16357 HURT: 8188 total spills in shared programs: 3492 -> 1707 (-51.12%) spills in affected programs: 2749 -> 964 (-64.93%) helped: 74 HURT: 0 total fills in shared programs: 4266 -> 2647 (-37.95%) fills in affected programs: 3029 -> 1410 (-53.45%) helped: 74 HURT: 0 LOST: 1 GAINED: 143 Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2016-04-23 01:54:33 -07:00
const unsigned dest_size = nir_tex_instr_dest_size(instr);
if (devinfo->ver >= 9 &&
instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
unsigned write_mask = nir_def_components_read(&instr->def);
i965/fs: Reduce the response length of sampler messages on Skylake. Often, we don't need a full 4 channels worth of data from the sampler. For example, depth comparisons and red textures only return one value. To handle this, the sampler message header contains a mask which can be used to disable channels, and reduce the message length (in SIMD16 mode on all hardware, and SIMD8 mode on Broadwell and later). We've never used it before, since it required setting up a message header. This meant trading a smaller response length for a larger message length and additional MOVs to set it up. However, Skylake introduces a terrific new feature: for headerless messages, you can simply reduce the response length, and it makes the implicit header contain an appropriate mask. So to read only RG, you would simply set the message length to 2 or 4 (SIMD8/16). This means we can finally take advantage of this at no cost. total instructions in shared programs: 9091831 -> 9073067 (-0.21%) instructions in affected programs: 191370 -> 172606 (-9.81%) helped: 2609 HURT: 0 total cycles in shared programs: 70868114 -> 68454752 (-3.41%) cycles in affected programs: 35841154 -> 33427792 (-6.73%) helped: 16357 HURT: 8188 total spills in shared programs: 3492 -> 1707 (-51.12%) spills in affected programs: 2749 -> 964 (-64.93%) helped: 74 HURT: 0 total fills in shared programs: 4266 -> 2647 (-37.95%) fills in affected programs: 3029 -> 1410 (-53.45%) helped: 74 HURT: 0 LOST: 1 GAINED: 143 Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2016-04-23 01:54:33 -07:00
assert(write_mask != 0); /* dead code should have been eliminated */
if (instr->is_sparse) {
inst->size_written = (util_last_bit(write_mask) - 1) *
inst->dst.component_size(inst->exec_size) +
(reg_unit(devinfo) * REG_SIZE);
} else {
inst->size_written = util_last_bit(write_mask) *
inst->dst.component_size(inst->exec_size);
}
} else {
inst->size_written = 4 * inst->dst.component_size(inst->exec_size) +
(instr->is_sparse ? (reg_unit(devinfo) * REG_SIZE) : 0);
}
if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
inst->shadow_compare = true;
/* Wa_14012688258:
*
* Don't trim zeros at the end of payload for sample operations
* in cube and cube arrays.
*/
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
intel_needs_workaround(devinfo, 14012688258)) {
/* Compiler should send U,V,R parameters even if V,R are 0. */
if (srcs[TEX_LOGICAL_SRC_COORDINATE].file != BAD_FILE)
assert(instr->coord_components >= 3u);
/* See opt_zero_samples(). */
inst->keep_payload_trailing_zeros = true;
}
fs_reg nir_dest[5];
for (unsigned i = 0; i < dest_size; i++)
nir_dest[i] = offset(dst, bld, i);
i965/fs: Reduce the response length of sampler messages on Skylake. Often, we don't need a full 4 channels worth of data from the sampler. For example, depth comparisons and red textures only return one value. To handle this, the sampler message header contains a mask which can be used to disable channels, and reduce the message length (in SIMD16 mode on all hardware, and SIMD8 mode on Broadwell and later). We've never used it before, since it required setting up a message header. This meant trading a smaller response length for a larger message length and additional MOVs to set it up. However, Skylake introduces a terrific new feature: for headerless messages, you can simply reduce the response length, and it makes the implicit header contain an appropriate mask. So to read only RG, you would simply set the message length to 2 or 4 (SIMD8/16). This means we can finally take advantage of this at no cost. total instructions in shared programs: 9091831 -> 9073067 (-0.21%) instructions in affected programs: 191370 -> 172606 (-9.81%) helped: 2609 HURT: 0 total cycles in shared programs: 70868114 -> 68454752 (-3.41%) cycles in affected programs: 35841154 -> 33427792 (-6.73%) helped: 16357 HURT: 8188 total spills in shared programs: 3492 -> 1707 (-51.12%) spills in affected programs: 2749 -> 964 (-64.93%) helped: 74 HURT: 0 total fills in shared programs: 4266 -> 2647 (-37.95%) fills in affected programs: 3029 -> 1410 (-53.45%) helped: 74 HURT: 0 LOST: 1 GAINED: 143 Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2016-04-23 01:54:33 -07:00
if (instr->op == nir_texop_query_levels) {
/* # levels is in .w */
if (devinfo->ver <= 9) {
/**
* Wa_1940217:
*
* When a surface of type SURFTYPE_NULL is accessed by resinfo, the
* MIPCount returned is undefined instead of 0.
*/
fs_inst *mov = bld.MOV(bld.null_reg_d(), dst);
mov->conditional_mod = BRW_CONDITIONAL_NZ;
nir_dest[0] = bld.vgrf(BRW_REGISTER_TYPE_D);
fs_inst *sel = bld.SEL(nir_dest[0], offset(dst, bld, 3), brw_imm_d(0));
sel->predicate = BRW_PREDICATE_NORMAL;
} else {
nir_dest[0] = offset(dst, bld, 3);
}
} else if (instr->op == nir_texop_txs &&
dest_size >= 3 && devinfo->ver < 7) {
/* Gfx4-6 return 0 instead of 1 for single layer surfaces. */
fs_reg depth = offset(dst, bld, 2);
nir_dest[2] = vgrf(glsl_type::int_type);
bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
}
/* The residency bits are only in the first component. */
if (instr->is_sparse)
nir_dest[dest_size - 1] = component(offset(dst, bld, dest_size - 1), 0);
bld.LOAD_PAYLOAD(get_nir_def(instr->def), nir_dest, dest_size, 0);
}
void
fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
{
switch (instr->type) {
case nir_jump_break:
bld.emit(BRW_OPCODE_BREAK);
break;
case nir_jump_continue:
bld.emit(BRW_OPCODE_CONTINUE);
break;
case nir_jump_halt:
bld.emit(BRW_OPCODE_HALT);
break;
case nir_jump_return:
default:
unreachable("unknown jump");
}
}
/*
* This helper takes a source register and un/shuffles it into the destination
* register.
*
* If source type size is smaller than destination type size the operation
* needed is a component shuffle. The opposite case would be an unshuffle. If
* source/destination type size is equal a shuffle is done that would be
* equivalent to a simple MOV.
*
* For example, if source is a 16-bit type and destination is 32-bit. A 3
* components .xyz 16-bit vector on SIMD8 would be.
*
* |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
* |z1|z2|z3|z4|z5|z6|z7|z8| | | | | | | | |
*
* This helper will return the following 2 32-bit components with the 16-bit
* values shuffled:
*
* |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
* |z1 |z2 |z3 |z4 |z5 |z6 |z7 |z8 |
*
* For unshuffle, the example would be the opposite, a 64-bit type source
* and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8
* would be:
*
* | x1l x1h | x2l x2h | x3l x3h | x4l x4h |
* | x5l x5h | x6l x6h | x7l x7h | x8l x8h |
* | y1l y1h | y2l y2h | y3l y3h | y4l y4h |
* | y5l y5h | y6l y6h | y7l y7h | y8l y8h |
*
* The returned result would be the following 4 32-bit components unshuffled:
*
* | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
* | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
* | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
* | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
*
* - Source and destination register must not be overlapped.
* - components units are measured in terms of the smaller type between
* source and destination because we are un/shuffling the smaller
* components from/into the bigger ones.
* - first_component parameter allows skipping source components.
*/
void
shuffle_src_to_dst(const fs_builder &bld,
const fs_reg &dst,
const fs_reg &src,
uint32_t first_component,
uint32_t components)
{
if (type_sz(src.type) == type_sz(dst.type)) {
assert(!regions_overlap(dst,
type_sz(dst.type) * bld.dispatch_width() * components,
offset(src, bld, first_component),
type_sz(src.type) * bld.dispatch_width() * components));
for (unsigned i = 0; i < components; i++) {
bld.MOV(retype(offset(dst, bld, i), src.type),
offset(src, bld, i + first_component));
}
} else if (type_sz(src.type) < type_sz(dst.type)) {
/* Source is shuffled into destination */
unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
assert(!regions_overlap(dst,
type_sz(dst.type) * bld.dispatch_width() *
DIV_ROUND_UP(components, size_ratio),
offset(src, bld, first_component),
type_sz(src.type) * bld.dispatch_width() * components));
brw_reg_type shuffle_type =
brw_reg_type_from_bit_size(8 * type_sz(src.type),
BRW_REGISTER_TYPE_D);
for (unsigned i = 0; i < components; i++) {
fs_reg shuffle_component_i =
subscript(offset(dst, bld, i / size_ratio),
shuffle_type, i % size_ratio);
bld.MOV(shuffle_component_i,
retype(offset(src, bld, i + first_component), shuffle_type));
}
} else {
/* Source is unshuffled into destination */
unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
assert(!regions_overlap(dst,
type_sz(dst.type) * bld.dispatch_width() * components,
offset(src, bld, first_component / size_ratio),
type_sz(src.type) * bld.dispatch_width() *
DIV_ROUND_UP(components + (first_component % size_ratio),
size_ratio)));
brw_reg_type shuffle_type =
brw_reg_type_from_bit_size(8 * type_sz(dst.type),
BRW_REGISTER_TYPE_D);
for (unsigned i = 0; i < components; i++) {
fs_reg shuffle_component_i =
subscript(offset(src, bld, (first_component + i) / size_ratio),
shuffle_type, (first_component + i) % size_ratio);
bld.MOV(retype(offset(dst, bld, i), shuffle_type),
shuffle_component_i);
}
}
}
void
shuffle_from_32bit_read(const fs_builder &bld,
const fs_reg &dst,
const fs_reg &src,
uint32_t first_component,
uint32_t components)
{
assert(type_sz(src.type) == 4);
/* This function takes components in units of the destination type while
* shuffle_src_to_dst takes components in units of the smallest type
*/
if (type_sz(dst.type) > 4) {
assert(type_sz(dst.type) == 8);
first_component *= 2;
components *= 2;
}
shuffle_src_to_dst(bld, dst, src, first_component, components);
}
fs_reg
setup_imm_df(const fs_builder &bld, double v)
{
const struct intel_device_info *devinfo = bld.shader->devinfo;
assert(devinfo->ver >= 7);
if (devinfo->ver >= 8)
return brw_imm_df(v);
/* gfx7.5 does not support DF immediates straightforward but the DIM
* instruction allows to set the 64-bit immediate value.
*/
if (devinfo->platform == INTEL_PLATFORM_HSW) {
const fs_builder ubld = bld.exec_all().group(1, 0);
fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
ubld.DIM(dst, brw_imm_df(v));
return component(dst, 0);
}
/* gfx7 does not support DF immediates, so we generate a 64-bit constant by
* writing the low 32-bit of the constant to suboffset 0 of a VGRF and
* the high 32-bit to suboffset 4 and then applying a stride of 0.
*
* Alternatively, we could also produce a normal VGRF (without stride 0)
* by writing to all the channels in the VGRF, however, that would hit the
* gfx7 bug where we have to split writes that span more than 1 register
* into instructions with a width of 4 (otherwise the write to the second
* register written runs into an execmask hardware bug) which isn't very
* nice.
*/
union {
double d;
struct {
uint32_t i1;
uint32_t i2;
};
} di;
di.d = v;
const fs_builder ubld = bld.exec_all().group(1, 0);
const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
ubld.MOV(tmp, brw_imm_ud(di.i1));
ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));
return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
}
fs_reg
setup_imm_b(const fs_builder &bld, int8_t v)
{
const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B);
bld.MOV(tmp, brw_imm_w(v));
return tmp;
}
fs_reg
setup_imm_ub(const fs_builder &bld, uint8_t v)
{
const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB);
bld.MOV(tmp, brw_imm_uw(v));
return tmp;
}