2015-02-06 01:11:18 +02:00
|
|
|
/* -*- c++ -*- */
|
|
|
|
|
/*
|
2024-12-06 13:05:43 -08:00
|
|
|
* Copyright © 2010-2016 Intel Corporation
|
2015-02-06 01:11:18 +02:00
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
|
*/
|
|
|
|
|
|
2024-12-06 14:25:29 -08:00
|
|
|
#pragma once
|
2015-02-06 01:11:18 +02:00
|
|
|
|
2024-12-06 13:05:43 -08:00
|
|
|
#include <assert.h>
|
|
|
|
|
#include "brw_reg.h"
|
2025-07-28 16:07:44 -04:00
|
|
|
#include "brw_list.h"
|
2024-12-06 13:05:43 -08:00
|
|
|
|
|
|
|
|
#define MAX_SAMPLER_MESSAGE_SIZE 11

/* Upper bound on the size of a single VGRF.
 *
 * The sampler can return a vec5 when sampling with sparse residency.  In
 * SIMD32, each component takes up 4 GRFs, so we need to allow up to size-20
 * VGRFs to hold the result.
 *
 * NOTE(review): the limit is 40 rather than 20 when devinfo->ver >= 20; the
 * reason is not stated in this header -- confirm and document it.
 */
#define MAX_VGRF_SIZE(devinfo) ((devinfo)->ver >= 20 ? 40 : 20)
|
|
|
|
|
|
|
|
|
|
/* Forward declaration: this header only stores a pointer to a bblock_t. */
struct bblock_t;
|
2015-02-06 01:11:18 +02:00
|
|
|
|
2025-07-28 16:07:44 -04:00
|
|
|
struct brw_inst : public brw_exec_node {
|
2024-02-20 21:15:18 -08:00
|
|
|
private:
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst &operator=(const brw_inst &);
|
2015-02-06 01:11:18 +02:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
void init(enum opcode opcode, uint8_t exec_width, const brw_reg &dst,
|
|
|
|
|
const brw_reg *src, unsigned sources);
|
2015-02-06 01:11:18 +02:00
|
|
|
|
|
|
|
|
public:
|
2024-12-07 00:23:07 -08:00
|
|
|
DECLARE_RALLOC_CXX_OPERATORS(brw_inst)
|
2015-02-06 01:11:18 +02:00
|
|
|
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg src[], unsigned sources);
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst(const brw_inst &that);
|
2015-02-06 01:11:18 +02:00
|
|
|
|
|
|
|
|
void resize_sources(uint8_t num_sources);
|
|
|
|
|
|
2025-01-31 02:47:53 -08:00
|
|
|
bool is_send() const;
|
2018-11-09 14:13:37 -08:00
|
|
|
bool is_payload(unsigned arg) const;
|
2025-02-20 15:23:04 +02:00
|
|
|
bool is_partial_write(unsigned grf_size = REG_SIZE) const;
|
2015-07-21 17:28:39 +03:00
|
|
|
unsigned components_read(unsigned i) const;
|
2024-06-19 10:50:51 -07:00
|
|
|
unsigned size_read(const struct intel_device_info *devinfo, int arg) const;
|
2021-04-05 13:19:39 -07:00
|
|
|
bool can_do_source_mods(const struct intel_device_info *devinfo) const;
|
2024-02-20 21:12:17 -08:00
|
|
|
bool can_do_cmod() const;
|
2015-10-14 02:12:09 -07:00
|
|
|
bool can_change_types() const;
|
i965: Add src/dst interference for certain instructions with hazards.
When working on tessellation shaders, I created some vec4 virtual
opcodes for creating message headers through a sequence like:
mov(8) g7<1>UD 0x00000000UD { align1 WE_all 1Q compacted };
mov(1) g7.5<1>UD 0x00000100UD { align1 WE_all };
mov(1) g7<1>UD g0<0,1,0>UD { align1 WE_all compacted };
mov(1) g7.3<1>UD g8<0,1,0>UD { align1 WE_all };
This is done in the generator since the vec4 backend can't handle align1
regioning. From the visitor's point of view, this is a single opcode:
hs_set_output_urb_offsets vgrf7.0:UD, 1U, vgrf8.xxxx:UD
Normally, there's no hazard between sources and destinations - an
instruction (naturally) reads its sources, then writes the result to the
destination. However, when the virtual instruction generates multiple
hardware instructions, we can get into trouble.
In the above example, if the register allocator assigned vgrf7 and vgrf8
to the same hardware register, then we'd clobber the source with 0 in
the first instruction, and read back the wrong value in the last one.
It occured to me that this is exactly the same problem we have with
SIMD16 instructions that use W/UW or B/UB types with 0 stride. The
hardware implicitly decodes them as two SIMD8 instructions, and with
the overlapping regions, the first would clobber the second.
Previously, we handled that by incrementing the live range end IP by 1,
which works, but is excessive: the next instruction doesn't actually
care about that. It might also be the end of control flow. This might
keep values alive too long. What we really want is to say "my source
and destinations interfere".
This patch creates new infrastructure for doing just that, and teaches
the register allocator to add interference when there's a hazard. For
my vec4 case, we can determine this by switching on opcodes. For the
SIMD16 case, we just move the existing code there.
I audited our existing virtual opcodes that generate multiple
instructions; I believe FS_OPCODE_PACK_HALF_2x16_SPLIT needs this
treatment as well, but no others.
v2: Rebased by mattst88.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2015-11-19 16:00:18 -08:00
|
|
|
bool has_source_and_destination_hazard() const;
|
2015-02-06 01:11:18 +02:00
|
|
|
|
2024-02-20 21:12:17 -08:00
|
|
|
bool is_3src(const struct brw_compiler *compiler) const;
|
|
|
|
|
bool is_math() const;
|
|
|
|
|
bool is_control_flow_begin() const;
|
|
|
|
|
bool is_control_flow_end() const;
|
|
|
|
|
bool is_control_flow() const;
|
|
|
|
|
bool is_commutative() const;
|
intel/brw: Copy prop from raw integer moves with mismatched types
The specific pattern from the unit test was observed in ray tracing
trampoline shaders.
v2: Refactor the is_raw_move tests out to a utility function. Suggested
by Ken.
v3: Fix a regression caused by being too picky about source
modifiers. This was introduced somewhere between when I did initial
shader-db runs an v2.
v4: Fix typo in comment. Noticed by Caio.
shader-db:
All Intel platforms had similar results. (Meteor Lake shown)
total instructions in shared programs: 19734086 -> 19733997 (<.01%)
instructions in affected programs: 135388 -> 135299 (-0.07%)
helped: 76 / HURT: 2
total cycles in shared programs: 916290451 -> 916264968 (<.01%)
cycles in affected programs: 41046002 -> 41020519 (-0.06%)
helped: 32 / HURT: 29
fossil-db:
Meteor Lake, DG2, and Skylake had similar results. (Meteor Lake shown)
Totals:
Instrs: 151531355 -> 151513669 (-0.01%); split: -0.01%, +0.00%
Cycle count: 17209372399 -> 17208178205 (-0.01%); split: -0.01%, +0.00%
Max live registers: 32016490 -> 32016493 (+0.00%)
Totals from 17361 (2.75% of 630198) affected shaders:
Instrs: 2642048 -> 2624362 (-0.67%); split: -0.67%, +0.00%
Cycle count: 79803066 -> 78608872 (-1.50%); split: -1.75%, +0.25%
Max live registers: 421668 -> 421671 (+0.00%)
Tiger Lake and Ice Lake had similar results. (Tiger Lake shown)
Totals:
Instrs: 149995644 -> 149977326 (-0.01%); split: -0.01%, +0.00%
Cycle count: 15567293770 -> 15566524840 (-0.00%); split: -0.02%, +0.01%
Spill count: 61241 -> 61238 (-0.00%)
Fill count: 107304 -> 107301 (-0.00%)
Max live registers: 31993109 -> 31993112 (+0.00%)
Totals from 17813 (2.83% of 629912) affected shaders:
Instrs: 3738236 -> 3719918 (-0.49%); split: -0.49%, +0.00%
Cycle count: 4251157049 -> 4250388119 (-0.02%); split: -0.06%, +0.04%
Spill count: 28268 -> 28265 (-0.01%)
Fill count: 50377 -> 50374 (-0.01%)
Max live registers: 470648 -> 470651 (+0.00%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30251>
2024-07-16 16:04:38 -07:00
|
|
|
bool is_raw_move() const;
|
2024-02-20 21:12:17 -08:00
|
|
|
bool can_do_saturate() const;
|
|
|
|
|
bool reads_accumulator_implicitly() const;
|
|
|
|
|
bool writes_accumulator_implicitly(const struct intel_device_info *devinfo) const;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Instructions that use indirect addressing have additional register
|
|
|
|
|
* regioning restrictions.
|
|
|
|
|
*/
|
|
|
|
|
bool uses_indirect_addressing() const;
|
|
|
|
|
|
2025-03-11 13:20:09 -07:00
|
|
|
void remove();
|
2024-02-20 21:12:17 -08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* True if the instruction has side effects other than writing to
|
|
|
|
|
* its destination registers. You are expected not to reorder or
|
|
|
|
|
* optimize these out unless you know what you are doing.
|
|
|
|
|
*/
|
|
|
|
|
bool has_side_effects() const;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* True if the instruction might be affected by side effects of other
|
|
|
|
|
* instructions.
|
|
|
|
|
*/
|
|
|
|
|
bool is_volatile() const;
|
|
|
|
|
|
2019-01-16 18:30:08 -08:00
|
|
|
/**
|
|
|
|
|
* Return whether \p arg is a control source of a virtual instruction which
|
|
|
|
|
* shouldn't contribute to the execution type and usual regioning
|
|
|
|
|
* restriction calculations of arithmetic instructions.
|
|
|
|
|
*/
|
|
|
|
|
bool is_control_source(unsigned arg) const;
|
|
|
|
|
|
2016-05-18 21:54:35 -07:00
|
|
|
/**
|
|
|
|
|
* Return the subset of flag registers read by the instruction as a bitset
|
|
|
|
|
* with byte granularity.
|
|
|
|
|
*/
|
2021-04-05 13:19:39 -07:00
|
|
|
unsigned flags_read(const intel_device_info *devinfo) const;
|
2016-05-18 21:54:35 -07:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Return the subset of flag registers updated by the instruction (either
|
|
|
|
|
* partially or fully) as a bitset with byte granularity.
|
|
|
|
|
*/
|
intel/fs: sel.cond writes the flags on Gfx4 and Gfx5
On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
using a separte cmpn and sel instruction. This lowering occurs in
fs_vistor::lower_minmax which is called very, very late... a long, long
time after the first calls to opt_cmod_propagation. As a result,
conditional modifiers can be incorrectly propagated across sel.cond on
those platforms.
No tests were affected by this change, and I find that quite shocking.
After just changing flags_written(), all of the atan tests started
failing on ILK. That required the change in cmod_propagatin (and the
addition of the prop_across_into_sel_gfx5 unit test).
Shader-db results for ILK and GM45 are below. I looked at a couple
before and after shaders... and every case that I looked at had
experienced incorrect cmod propagation. This affected a LOT of apps!
Euro Truck Simulator 2, The Talos Principle, Serious Sam 3, Sanctum 2,
Gang Beasts, and on and on... :(
I discovered this bug while working on a couple new optimization
passes. One of the passes attempts to remove condition modifiers that
are never used. The pass made no progress except on ILK and GM45.
After investigating a couple of the affected shaders, I noticed that
the code in those shaders looked wrong... investigation led to this
cause.
v2: Trivial changes in the unit tests.
v3: Fix type in comment in unit tests. Noticed by Jason and Priit.
v4: Tweak handling of BRW_OPCODE_SEL special case. Suggested by Jason.
Fixes: df1aec763eb ("i965/fs: Define methods to calculate the flag subset read or written by an fs_inst.")
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Tested-by: Dave Airlie <airlied@redhat.com>
Iron Lake
total instructions in shared programs: 8180493 -> 8181781 (0.02%)
instructions in affected programs: 541796 -> 543084 (0.24%)
helped: 28
HURT: 1158
helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
helped stats (rel) min: 0.35% max: 0.86% x̄: 0.53% x̃: 0.50%
HURT stats (abs) min: 1 max: 3 x̄: 1.14 x̃: 1
HURT stats (rel) min: 0.12% max: 4.00% x̄: 0.37% x̃: 0.23%
95% mean confidence interval for instructions value: 1.06 1.11
95% mean confidence interval for instructions %-change: 0.31% 0.38%
Instructions are HURT.
total cycles in shared programs: 239420470 -> 239421690 (<.01%)
cycles in affected programs: 2925992 -> 2927212 (0.04%)
helped: 49
HURT: 157
helped stats (abs) min: 2 max: 284 x̄: 62.69 x̃: 70
helped stats (rel) min: 0.04% max: 6.20% x̄: 1.68% x̃: 1.96%
HURT stats (abs) min: 2 max: 48 x̄: 27.34 x̃: 24
HURT stats (rel) min: 0.02% max: 2.91% x̄: 0.31% x̃: 0.20%
95% mean confidence interval for cycles value: -0.80 12.64
95% mean confidence interval for cycles %-change: -0.31% <.01%
Inconclusive result (value mean confidence interval includes 0).
GM45
total instructions in shared programs: 4985517 -> 4986207 (0.01%)
instructions in affected programs: 306935 -> 307625 (0.22%)
helped: 14
HURT: 625
helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
helped stats (rel) min: 0.35% max: 0.82% x̄: 0.52% x̃: 0.49%
HURT stats (abs) min: 1 max: 3 x̄: 1.13 x̃: 1
HURT stats (rel) min: 0.12% max: 3.90% x̄: 0.34% x̃: 0.22%
95% mean confidence interval for instructions value: 1.04 1.12
95% mean confidence interval for instructions %-change: 0.29% 0.36%
Instructions are HURT.
total cycles in shared programs: 153827268 -> 153828052 (<.01%)
cycles in affected programs: 1669290 -> 1670074 (0.05%)
helped: 24
HURT: 84
helped stats (abs) min: 2 max: 232 x̄: 64.33 x̃: 67
helped stats (rel) min: 0.04% max: 4.62% x̄: 1.60% x̃: 1.94%
HURT stats (abs) min: 2 max: 48 x̄: 27.71 x̃: 24
HURT stats (rel) min: 0.02% max: 2.66% x̄: 0.34% x̃: 0.14%
95% mean confidence interval for cycles value: -1.94 16.46
95% mean confidence interval for cycles %-change: -0.29% 0.11%
Inconclusive result (value mean confidence interval includes 0).
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12191>
2021-08-02 21:33:17 -07:00
|
|
|
unsigned flags_written(const intel_device_info *devinfo) const;
|
2016-05-18 21:54:35 -07:00
|
|
|
|
2023-05-23 13:11:02 +03:00
|
|
|
/**
|
|
|
|
|
* Return true if this instruction is a sampler message gathering residency
|
|
|
|
|
* data.
|
|
|
|
|
*/
|
|
|
|
|
bool has_sampler_residency() const;
|
|
|
|
|
|
2024-03-13 11:01:16 +02:00
|
|
|
/**
|
|
|
|
|
* Return true if this instruction is using the address register
|
|
|
|
|
* implicitly.
|
|
|
|
|
*/
|
|
|
|
|
bool uses_address_register_implicitly() const;
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
uint8_t sources; /**< Number of brw_reg sources. */
|
2024-02-29 02:06:41 -08:00
|
|
|
|
2024-02-20 21:15:18 -08:00
|
|
|
/**
|
|
|
|
|
* Execution size of the instruction. This is used by the generator to
|
|
|
|
|
* generate the correct binary for the given instruction. Current valid
|
|
|
|
|
* values are 1, 4, 8, 16, 32.
|
|
|
|
|
*/
|
|
|
|
|
uint8_t exec_size;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Channel group from the hardware execution and predication mask that
|
|
|
|
|
* should be applied to the instruction. The subset of channel enable
|
|
|
|
|
* signals (calculated from the EU control flow and predication state)
|
|
|
|
|
* given by [group, group + exec_size) will be used to mask GRF writes and
|
|
|
|
|
* any other side effects of the instruction.
|
|
|
|
|
*/
|
|
|
|
|
uint8_t group;
|
|
|
|
|
|
|
|
|
|
uint8_t mlen; /**< SEND message length */
|
|
|
|
|
uint8_t ex_mlen; /**< SENDS extended message length */
|
|
|
|
|
uint8_t sfid; /**< SFID for SEND instructions */
|
2024-02-29 02:06:41 -08:00
|
|
|
/** The number of hardware registers used for a message header. */
|
|
|
|
|
uint8_t header_size;
|
2024-02-20 21:15:18 -08:00
|
|
|
uint32_t desc; /**< SEND[S] message descriptor immediate */
|
|
|
|
|
uint32_t ex_desc; /**< SEND[S] extended message descriptor immediate */
|
2024-02-29 02:06:41 -08:00
|
|
|
|
|
|
|
|
uint32_t offset; /**< spill/unspill offset or texture offset bitfield */
|
2025-08-21 23:48:28 -07:00
|
|
|
uint16_t size_written; /**< Data written to the destination register in bytes. */
|
2024-02-20 21:15:18 -08:00
|
|
|
|
|
|
|
|
enum opcode opcode; /* BRW_OPCODE_* or FS_OPCODE_* */
|
|
|
|
|
enum brw_conditional_mod conditional_mod; /**< BRW_CONDITIONAL_* */
|
|
|
|
|
enum brw_predicate predicate;
|
|
|
|
|
|
2024-02-29 02:06:41 -08:00
|
|
|
tgl_swsb sched; /**< Scheduling info. */
|
2024-02-20 21:15:18 -08:00
|
|
|
|
2024-02-29 02:06:41 -08:00
|
|
|
union {
|
|
|
|
|
struct {
|
|
|
|
|
/* Chooses which flag subregister (f0.0 to f3.1) is used for
|
|
|
|
|
* conditional mod and predication.
|
|
|
|
|
*/
|
|
|
|
|
unsigned flag_subreg:3;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Systolic depth used by DPAS instruction.
|
|
|
|
|
*/
|
|
|
|
|
unsigned sdepth:4;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Repeat count used by DPAS instruction.
|
|
|
|
|
*/
|
|
|
|
|
unsigned rcount:4;
|
|
|
|
|
|
2024-10-22 17:00:13 +02:00
|
|
|
unsigned pad:4;
|
2024-02-29 02:06:41 -08:00
|
|
|
|
|
|
|
|
bool predicate_inverse:1;
|
|
|
|
|
bool writes_accumulator:1; /**< instruction implicitly writes accumulator */
|
|
|
|
|
bool force_writemask_all:1;
|
|
|
|
|
bool saturate:1;
|
|
|
|
|
bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */
|
|
|
|
|
bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */
|
|
|
|
|
bool send_is_volatile:1; /**< Only valid for SHADER_OPCODE_SEND */
|
|
|
|
|
bool send_ex_bso:1; /**< Only for SHADER_OPCODE_SEND, use extended
|
|
|
|
|
* bindless surface offset (26bits instead of
|
|
|
|
|
* 20bits)
|
|
|
|
|
*/
|
2024-10-22 17:00:13 +02:00
|
|
|
/**
|
|
|
|
|
* Only for SHADER_OPCODE_SEND, @offset field contains an immediate
|
|
|
|
|
* part of the extended descriptor that must be encoded in the
|
|
|
|
|
* instruction.
|
|
|
|
|
*/
|
|
|
|
|
bool send_ex_desc_imm:1;
|
2024-02-29 02:06:41 -08:00
|
|
|
/**
|
|
|
|
|
* The predication mask applied to this instruction is guaranteed to
|
|
|
|
|
* be uniform and a superset of the execution mask of the present block.
|
|
|
|
|
* No currently enabled channel will be disabled by the predicate.
|
|
|
|
|
*/
|
|
|
|
|
bool predicate_trivial:1;
|
|
|
|
|
bool eot:1;
|
|
|
|
|
bool keep_payload_trailing_zeros:1;
|
2024-06-07 18:50:04 +03:00
|
|
|
/**
|
|
|
|
|
* Whether the parameters of the SEND instructions are build with
|
|
|
|
|
* NoMask (for A32 messages this covers only the surface handle, for
|
|
|
|
|
* A64 messages this covers the load address).
|
2025-05-12 08:44:28 +03:00
|
|
|
*
|
|
|
|
|
* Also used to signal a dummy render target SEND message that is
|
|
|
|
|
* never executed.
|
2024-06-07 18:50:04 +03:00
|
|
|
*/
|
|
|
|
|
bool has_no_mask_send_params:1;
|
2024-02-29 02:06:41 -08:00
|
|
|
};
|
|
|
|
|
uint32_t bits;
|
|
|
|
|
};
|
2024-02-20 21:15:18 -08:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg dst;
|
|
|
|
|
brw_reg *src;
|
2024-08-23 10:46:13 -07:00
|
|
|
|
|
|
|
|
#ifndef NDEBUG
|
|
|
|
|
/** @{
|
|
|
|
|
* Annotation for the generated IR.
|
|
|
|
|
*/
|
|
|
|
|
const char *annotation;
|
|
|
|
|
/** @} */
|
|
|
|
|
#endif
|
2025-02-27 09:28:48 -08:00
|
|
|
|
|
|
|
|
bblock_t *block;
|
2015-02-06 01:11:18 +02:00
|
|
|
};
|
|
|
|
|
|
2015-06-03 21:23:46 +03:00
|
|
|
/**
|
|
|
|
|
* Make the execution of \p inst dependent on the evaluation of a possibly
|
|
|
|
|
* inverted predicate.
|
|
|
|
|
*/
|
2024-12-07 00:23:07 -08:00
|
|
|
static inline brw_inst *
|
2015-06-03 21:23:46 +03:00
|
|
|
set_predicate_inv(enum brw_predicate pred, bool inverse,
|
2024-12-07 00:23:07 -08:00
|
|
|
brw_inst *inst)
|
2015-06-03 21:23:46 +03:00
|
|
|
{
|
|
|
|
|
inst->predicate = pred;
|
|
|
|
|
inst->predicate_inverse = inverse;
|
|
|
|
|
return inst;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Make the execution of \p inst dependent on the evaluation of a predicate.
|
|
|
|
|
*/
|
2024-12-07 00:23:07 -08:00
|
|
|
static inline brw_inst *
|
|
|
|
|
set_predicate(enum brw_predicate pred, brw_inst *inst)
|
2015-06-03 21:23:46 +03:00
|
|
|
{
|
|
|
|
|
return set_predicate_inv(pred, false, inst);
|
|
|
|
|
}
|
|
|
|
|
|
2015-06-03 21:24:18 +03:00
|
|
|
/**
|
|
|
|
|
* Write the result of evaluating the condition given by \p mod to a flag
|
|
|
|
|
* register.
|
|
|
|
|
*/
|
2024-12-07 00:23:07 -08:00
|
|
|
static inline brw_inst *
|
|
|
|
|
set_condmod(enum brw_conditional_mod mod, brw_inst *inst)
|
2015-06-03 21:24:18 +03:00
|
|
|
{
|
|
|
|
|
inst->conditional_mod = mod;
|
|
|
|
|
return inst;
|
|
|
|
|
}
|
|
|
|
|
|
2015-06-03 21:24:50 +03:00
|
|
|
/**
|
|
|
|
|
* Clamp the result of \p inst to the saturation range of its destination
|
|
|
|
|
* datatype.
|
|
|
|
|
*/
|
2024-12-07 00:23:07 -08:00
|
|
|
static inline brw_inst *
|
|
|
|
|
set_saturate(bool saturate, brw_inst *inst)
|
2015-06-03 21:24:50 +03:00
|
|
|
{
|
|
|
|
|
inst->saturate = saturate;
|
|
|
|
|
return inst;
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-07 16:59:35 -07:00
|
|
|
/**
|
|
|
|
|
* Return the number of dataflow registers written by the instruction (either
|
|
|
|
|
* fully or partially) counted from 'floor(reg_offset(inst->dst) /
|
|
|
|
|
* register_size)'. The somewhat arbitrary register size unit is 4B for the
|
|
|
|
|
* UNIFORM and IMM files and 32B for all other files.
|
|
|
|
|
*/
|
|
|
|
|
inline unsigned
|
2024-12-07 00:23:07 -08:00
|
|
|
regs_written(const brw_inst *inst)
|
2016-09-07 16:59:35 -07:00
|
|
|
{
|
2016-09-07 13:38:20 -07:00
|
|
|
assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
|
2016-09-07 14:36:32 -07:00
|
|
|
return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE +
|
|
|
|
|
inst->size_written -
|
2016-09-07 14:33:55 -07:00
|
|
|
MIN2(inst->size_written, reg_padding(inst->dst)),
|
|
|
|
|
REG_SIZE);
|
2016-09-07 16:59:35 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Return the number of dataflow registers read by the instruction (either
|
|
|
|
|
* fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
|
|
|
|
|
* register_size)'. The somewhat arbitrary register size unit is 4B for the
|
2020-10-26 20:53:37 -05:00
|
|
|
* UNIFORM files and 32B for all other files.
|
2016-09-07 16:59:35 -07:00
|
|
|
*/
|
|
|
|
|
inline unsigned
|
2024-12-07 00:23:07 -08:00
|
|
|
regs_read(const struct intel_device_info *devinfo, const brw_inst *inst, unsigned i)
|
2016-09-07 16:59:35 -07:00
|
|
|
{
|
2020-10-26 20:53:37 -05:00
|
|
|
if (inst->src[i].file == IMM)
|
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
|
|
const unsigned reg_size = inst->src[i].file == UNIFORM ? 4 : REG_SIZE;
|
2016-09-07 14:36:32 -07:00
|
|
|
return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size +
|
2024-06-19 10:50:51 -07:00
|
|
|
inst->size_read(devinfo, i) -
|
|
|
|
|
MIN2(inst->size_read(devinfo, i), reg_padding(inst->src[i])),
|
2016-09-07 14:33:55 -07:00
|
|
|
reg_size);
|
2016-09-07 16:59:35 -07:00
|
|
|
}
|
|
|
|
|
|
2025-01-29 23:26:47 -08:00
|
|
|
/* Declared here, defined elsewhere.  Yields a brw_reg_type for \p inst; the
 * helpers in this header use it as the instruction's execution type.
 */
enum brw_reg_type get_exec_type(const brw_inst *inst);
|
2016-07-18 07:17:39 +00:00
|
|
|
|
|
|
|
|
static inline unsigned
|
2024-12-07 00:23:07 -08:00
|
|
|
get_exec_type_size(const brw_inst *inst)
|
2016-07-18 07:17:39 +00:00
|
|
|
{
|
2024-04-21 00:57:59 -07:00
|
|
|
return brw_type_size_bytes(get_exec_type(inst));
|
2016-07-18 07:17:39 +00:00
|
|
|
}
|
|
|
|
|
|
2018-12-29 04:00:13 -08:00
|
|
|
/**
|
|
|
|
|
* Return whether the instruction isn't an ALU instruction and cannot be
|
|
|
|
|
* assumed to complete in-order.
|
|
|
|
|
*/
|
|
|
|
|
static inline bool
|
2024-12-07 00:23:07 -08:00
|
|
|
is_unordered(const intel_device_info *devinfo, const brw_inst *inst)
|
2018-12-29 04:00:13 -08:00
|
|
|
{
|
2025-01-31 02:47:53 -08:00
|
|
|
return inst->is_send() || (devinfo->ver < 20 && inst->is_math()) ||
|
2023-10-16 14:22:51 -07:00
|
|
|
inst->opcode == BRW_OPCODE_DPAS ||
|
2022-12-02 10:55:48 -08:00
|
|
|
(devinfo->has_64bit_float_via_math_pipe &&
|
2024-04-20 17:08:02 -07:00
|
|
|
(get_exec_type(inst) == BRW_TYPE_DF ||
|
|
|
|
|
inst->dst.type == BRW_TYPE_DF));
|
2018-12-29 04:00:13 -08:00
|
|
|
}
|
|
|
|
|
|
2025-03-19 10:06:26 -07:00
|
|
|
static inline bool
|
|
|
|
|
has_bfloat_operands(const brw_inst *inst)
|
|
|
|
|
{
|
|
|
|
|
if (brw_type_is_bfloat(inst->dst.type))
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
|
|
|
|
if (brw_type_is_bfloat(inst->src[i].type))
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2025-01-29 23:26:47 -08:00
|
|
|
bool has_dst_aligned_region_restriction(const intel_device_info *devinfo,
|
|
|
|
|
const brw_inst *inst,
|
|
|
|
|
brw_reg_type dst_type);
|
2018-12-07 14:26:23 -08:00
|
|
|
|
intel/fs/copy_prop: check stride constraints with actual final type
In some cases we will change the type of the destination register of
an instruction. This is the type we should use to verify that we're
allowed to do the replacement.
Otherwise we can hit restrictions on CHV and upcoming Xe-Hp for
instance where the copy propagation transforms this :
send(16) (mlen: 2) vgrf10:UD, 0u, 0u, vgrf35:D, null:UD
mov(16) vgrf11:UW, vgrf10<2>:UW
mov(16) vgrf12:UW, vgrf10+0.2<2>:UW
mov(16) vgrf15:HF, |vgrf11|:HF
mov(16) vgrf16:HF, |vgrf12|:HF
mov(8) vgrf41<2>:UW, vgrf15+0.0:UW group0
mov(8) vgrf42<2>:UW, vgrf15+0.16:UW group8
mov(8) vgrf45<2>:UW, vgrf16+0.0:UW group0
mov(8) vgrf46<2>:UW, vgrf16+0.16:UW group8
into this :
send(16) (mlen: 2) vgrf10:UD, 0u, 0u, vgrf35:D, null:UD
mov(8) vgrf41<2>:HF, |vgrf10+0.0|<2>:HF group0
mov(8) vgrf42<2>:HF, |vgrf10+1.0|<2>:HF group8
mov(8) vgrf45<2>:HF, |vgrf10+0.2|<2>:HF group0
mov(8) vgrf46<2>:HF, |vgrf10+1.2|<2>:HF group8
Because of the floating point use, stride and offsets should be the
same.
v2: Fix final destination type selection (Curro)
v3: constify (Curro)
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9832>
2021-03-24 09:56:42 +02:00
|
|
|
static inline bool
|
2021-04-05 13:19:39 -07:00
|
|
|
has_dst_aligned_region_restriction(const intel_device_info *devinfo,
|
2024-12-07 00:23:07 -08:00
|
|
|
const brw_inst *inst)
|
intel/fs/copy_prop: check stride constraints with actual final type
In some cases we will change the type of the destination register of
an instruction. This is the type we should use to verify that we're
allow to do the replacement.
Otherwise we can hit restrictions on CHV and upcoming Xe-Hp for
instance where the copy propagation transforms this :
send(16) (mlen: 2) vgrf10:UD, 0u, 0u, vgrf35:D, null:UD
mov(16) vgrf11:UW, vgrf10<2>:UW
mov(16) vgrf12:UW, vgrf10+0.2<2>:UW
mov(16) vgrf15:HF, |vgrf11|:HF
mov(16) vgrf16:HF, |vgrf12|:HF
mov(8) vgrf41<2>:UW, vgrf15+0.0:UW group0
mov(8) vgrf42<2>:UW, vgrf15+0.16:UW group8
mov(8) vgrf45<2>:UW, vgrf16+0.0:UW group0
mov(8) vgrf46<2>:UW, vgrf16+0.16:UW group8
into this :
send(16) (mlen: 2) vgrf10:UD, 0u, 0u, vgrf35:D, null:UD
mov(8) vgrf41<2>:HF, |vgrf10+0.0|<2>:HF group0
mov(8) vgrf42<2>:HF, |vgrf10+1.0|<2>:HF group8
mov(8) vgrf45<2>:HF, |vgrf10+0.2|<2>:HF group0
mov(8) vgrf46<2>:HF, |vgrf10+1.2|<2>:HF group8
Because of the floating point use, stride and offets should be the
same.
v2: Fix final destination type selection (Curro)
v3: constify (Curro)
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: <mesa-stable@lists.freedesktop.org>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9832>
2021-03-24 09:56:42 +02:00
|
|
|
{
|
|
|
|
|
return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type);
|
|
|
|
|
}
|
|
|
|
|
|
2025-01-29 23:26:47 -08:00
|
|
|
bool has_subdword_integer_region_restriction(const intel_device_info *devinfo,
|
|
|
|
|
const brw_inst *inst,
|
|
|
|
|
const brw_reg *srcs, unsigned num_srcs);
|
2024-03-06 16:16:45 -08:00
|
|
|
|
2024-04-10 15:23:45 -07:00
|
|
|
static inline bool
|
|
|
|
|
has_subdword_integer_region_restriction(const intel_device_info *devinfo,
|
2024-12-07 00:23:07 -08:00
|
|
|
const brw_inst *inst)
|
2024-04-10 15:23:45 -07:00
|
|
|
{
|
|
|
|
|
return has_subdword_integer_region_restriction(devinfo, inst,
|
|
|
|
|
inst->src, inst->sources);
|
|
|
|
|
}
|
|
|
|
|
|
2025-01-29 23:26:47 -08:00
|
|
|
bool is_identity_payload(const struct intel_device_info *devinfo,
|
|
|
|
|
brw_reg_file file, const brw_inst *inst);
|
2019-12-31 00:10:28 -08:00
|
|
|
|
2025-01-29 23:26:47 -08:00
|
|
|
bool is_multi_copy_payload(const struct intel_device_info *devinfo,
|
|
|
|
|
const brw_inst *inst);
|
2019-12-31 00:10:28 -08:00
|
|
|
|
2024-12-07 10:25:45 -08:00
|
|
|
/* Declared here, defined elsewhere. */
bool is_coalescing_payload(const struct brw_shader &s, const brw_inst *inst);
|
2019-12-31 00:10:28 -08:00
|
|
|
|
2025-01-29 23:26:47 -08:00
|
|
|
/* Declared here, defined elsewhere. */
bool has_bank_conflict(const struct brw_isa_info *isa, const brw_inst *inst);
|
2020-04-02 16:20:34 -07:00
|
|
|
|
2024-01-04 22:29:54 -08:00
|
|
|
/* Return the subset of flag registers that an instruction could
|
|
|
|
|
* potentially read or write based on the execution controls and flag
|
|
|
|
|
* subregister number of the instruction.
|
|
|
|
|
*/
|
|
|
|
|
static inline unsigned
|
2025-02-10 08:55:26 -08:00
|
|
|
brw_flag_mask(const brw_inst *inst, unsigned width)
|
2024-01-04 22:29:54 -08:00
|
|
|
{
|
|
|
|
|
assert(util_is_power_of_two_nonzero(width));
|
|
|
|
|
const unsigned start = (inst->flag_subreg * 16 + inst->group) &
|
|
|
|
|
~(width - 1);
|
2025-01-29 23:26:47 -08:00
|
|
|
const unsigned end = start + ALIGN(inst->exec_size, width);
|
2024-01-04 22:29:54 -08:00
|
|
|
return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Return a mask with the \p n least significant bits set.
 *
 * When \p n is at least the number of bits in the return type, every bit is
 * set; this avoids shifting by the full width of the type.
 */
static inline unsigned
brw_bit_mask(unsigned n)
{
   return (n >= CHAR_BIT * sizeof(brw_bit_mask(n)) ? ~0u : (1u << n) - 1);
}
|
|
|
|
|
|
|
|
|
|
static inline unsigned
|
2025-02-10 08:55:26 -08:00
|
|
|
brw_flag_mask(const brw_reg &r, unsigned sz)
|
2024-01-04 22:29:54 -08:00
|
|
|
{
|
|
|
|
|
if (r.file == ARF) {
|
|
|
|
|
const unsigned start = (r.nr - BRW_ARF_FLAG) * 4 + r.subnr;
|
|
|
|
|
const unsigned end = start + sz;
|
2025-02-10 08:55:26 -08:00
|
|
|
return brw_bit_mask(end) & ~brw_bit_mask(start);
|
2024-01-04 22:29:54 -08:00
|
|
|
} else {
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
}
|