/* mesa/src/intel/compiler/brw_inst.h */
/* -*- c++ -*- */
/*
* Copyright © 2010-2016 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#pragma once
#include <assert.h>
#include "brw_reg.h"
#include "brw_list.h"
#define MAX_SAMPLER_MESSAGE_SIZE 11
/* The sampler can return a vec5 when sampling with sparse residency. In
* SIMD32, each component takes up 4 GRFs, so we need to allow up to size-20
* VGRFs to hold the result.
*/
#define MAX_VGRF_SIZE(devinfo) ((devinfo)->ver >= 20 ? 40 : 20)
struct bblock_t;
struct brw_shader;
/**
 * Discriminates the concrete subclass of brw_inst (see the as_*() downcast
 * helpers in brw_inst and the brw_*_inst structs below).
 */
enum ENUM_PACKED brw_inst_kind {
   BRW_KIND_BASE,          /**< Plain brw_inst with no subclass data. */
   BRW_KIND_SEND,          /**< brw_send_inst */
   BRW_KIND_LOGICAL,       /**< No dedicated subclass visible in this file. */
   BRW_KIND_TEX,           /**< brw_tex_inst */
   BRW_KIND_MEM,           /**< brw_mem_inst */
   BRW_KIND_DPAS,          /**< brw_dpas_inst */
   BRW_KIND_LOAD_PAYLOAD,  /**< brw_load_payload_inst */
   BRW_KIND_URB,           /**< brw_urb_inst */
   BRW_KIND_FB_WRITE,      /**< brw_fb_write_inst */
};
brw_inst_kind brw_inst_kind_for_opcode(enum opcode opcode);
struct brw_inst : brw_exec_node {
brw_inst() = delete;
brw_inst(const brw_inst&) = delete;
/* Enable usage of placement new. */
static void* operator new(size_t size, void *ptr) { return ptr; }
static void operator delete(void *p) {}
/* Prefer macro here instead of templates to get nicer
* helper names.
*/
#define KIND_HELPERS(HELPER_NAME, TYPE_NAME, ENUM_NAME) \
struct TYPE_NAME *HELPER_NAME() { \
return kind == ENUM_NAME ? (struct TYPE_NAME *)this \
: nullptr; \
} \
const struct TYPE_NAME *HELPER_NAME() const { \
return kind == ENUM_NAME ? (const struct TYPE_NAME *)this \
: nullptr; \
}
KIND_HELPERS(as_send, brw_send_inst, BRW_KIND_SEND);
KIND_HELPERS(as_tex, brw_tex_inst, BRW_KIND_TEX);
KIND_HELPERS(as_mem, brw_mem_inst, BRW_KIND_MEM);
KIND_HELPERS(as_dpas, brw_dpas_inst, BRW_KIND_DPAS);
KIND_HELPERS(as_load_payload, brw_load_payload_inst, BRW_KIND_LOAD_PAYLOAD);
KIND_HELPERS(as_urb, brw_urb_inst, BRW_KIND_URB);
KIND_HELPERS(as_fb_write, brw_fb_write_inst, BRW_KIND_FB_WRITE);
#undef KIND_HELPERS
bool is_send() const;
bool is_payload(unsigned arg) const;
bool is_partial_write(unsigned grf_size = REG_SIZE) const;
unsigned components_read(unsigned i) const;
unsigned size_read(const struct intel_device_info *devinfo, int arg) const;
bool can_do_source_mods(const struct intel_device_info *devinfo) const;
bool can_do_cmod() const;
bool can_change_types() const;
i965: Add src/dst interference for certain instructions with hazards. When working on tessellation shaders, I created some vec4 virtual opcodes for creating message headers through a sequence like: mov(8) g7<1>UD 0x00000000UD { align1 WE_all 1Q compacted }; mov(1) g7.5<1>UD 0x00000100UD { align1 WE_all }; mov(1) g7<1>UD g0<0,1,0>UD { align1 WE_all compacted }; mov(1) g7.3<1>UD g8<0,1,0>UD { align1 WE_all }; This is done in the generator since the vec4 backend can't handle align1 regioning. From the visitor's point of view, this is a single opcode: hs_set_output_urb_offsets vgrf7.0:UD, 1U, vgrf8.xxxx:UD Normally, there's no hazard between sources and destinations - an instruction (naturally) reads its sources, then writes the result to the destination. However, when the virtual instruction generates multiple hardware instructions, we can get into trouble. In the above example, if the register allocator assigned vgrf7 and vgrf8 to the same hardware register, then we'd clobber the source with 0 in the first instruction, and read back the wrong value in the last one. It occured to me that this is exactly the same problem we have with SIMD16 instructions that use W/UW or B/UB types with 0 stride. The hardware implicitly decodes them as two SIMD8 instructions, and with the overlapping regions, the first would clobber the second. Previously, we handled that by incrementing the live range end IP by 1, which works, but is excessive: the next instruction doesn't actually care about that. It might also be the end of control flow. This might keep values alive too long. What we really want is to say "my source and destinations interfere". This patch creates new infrastructure for doing just that, and teaches the register allocator to add interference when there's a hazard. For my vec4 case, we can determine this by switching on opcodes. For the SIMD16 case, we just move the existing code there. 
I audited our existing virtual opcodes that generate multiple instructions; I believe FS_OPCODE_PACK_HALF_2x16_SPLIT needs this treatment as well, but no others. v2: Rebased by mattst88. Signed-off-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Matt Turner <mattst88@gmail.com>
2015-11-19 16:00:18 -08:00
bool has_source_and_destination_hazard() const;
bool is_3src(const struct brw_compiler *compiler) const;
bool is_math() const;
bool is_control_flow_begin() const;
bool is_control_flow_end() const;
bool is_control_flow() const;
bool is_commutative() const;
intel/brw: Copy prop from raw integer moves with mismatched types The specific pattern from the unit test was observed in ray tracing trampoline shaders. v2: Refactor the is_raw_move tests out to a utility function. Suggested by Ken. v3: Fix a regression caused by being too picky about source modifiers. This was introduced somewhere between when I did initial shader-db runs an v2. v4: Fix typo in comment. Noticed by Caio. shader-db: All Intel platforms had similar results. (Meteor Lake shown) total instructions in shared programs: 19734086 -> 19733997 (<.01%) instructions in affected programs: 135388 -> 135299 (-0.07%) helped: 76 / HURT: 2 total cycles in shared programs: 916290451 -> 916264968 (<.01%) cycles in affected programs: 41046002 -> 41020519 (-0.06%) helped: 32 / HURT: 29 fossil-db: Meteor Lake, DG2, and Skylake had similar results. (Meteor Lake shown) Totals: Instrs: 151531355 -> 151513669 (-0.01%); split: -0.01%, +0.00% Cycle count: 17209372399 -> 17208178205 (-0.01%); split: -0.01%, +0.00% Max live registers: 32016490 -> 32016493 (+0.00%) Totals from 17361 (2.75% of 630198) affected shaders: Instrs: 2642048 -> 2624362 (-0.67%); split: -0.67%, +0.00% Cycle count: 79803066 -> 78608872 (-1.50%); split: -1.75%, +0.25% Max live registers: 421668 -> 421671 (+0.00%) Tiger Lake and Ice Lake had similar results. 
(Tiger Lake shown) Totals: Instrs: 149995644 -> 149977326 (-0.01%); split: -0.01%, +0.00% Cycle count: 15567293770 -> 15566524840 (-0.00%); split: -0.02%, +0.01% Spill count: 61241 -> 61238 (-0.00%) Fill count: 107304 -> 107301 (-0.00%) Max live registers: 31993109 -> 31993112 (+0.00%) Totals from 17813 (2.83% of 629912) affected shaders: Instrs: 3738236 -> 3719918 (-0.49%); split: -0.49%, +0.00% Cycle count: 4251157049 -> 4250388119 (-0.02%); split: -0.06%, +0.04% Spill count: 28268 -> 28265 (-0.01%) Fill count: 50377 -> 50374 (-0.01%) Max live registers: 470648 -> 470651 (+0.00%) Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Reviewed-by: Caio Oliveira <caio.oliveira@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30251>
2024-07-16 16:04:38 -07:00
bool is_raw_move() const;
bool can_do_saturate() const;
bool reads_accumulator_implicitly() const;
bool writes_accumulator_implicitly(const struct intel_device_info *devinfo) const;
/**
* Instructions that use indirect addressing have additional register
* regioning restrictions.
*/
bool uses_indirect_addressing() const;
void remove();
/**
* True if the instruction has side effects other than writing to
* its destination registers. You are expected not to reorder or
* optimize these out unless you know what you are doing.
*/
bool has_side_effects() const;
/**
* True if the instruction might be affected by side effects of other
* instructions.
*/
bool is_volatile() const;
/**
* Return whether \p arg is a control source of a virtual instruction which
* shouldn't contribute to the execution type and usual regioning
* restriction calculations of arithmetic instructions.
*/
bool is_control_source(unsigned arg) const;
/**
* Return the subset of flag registers read by the instruction as a bitset
* with byte granularity.
*/
unsigned flags_read(const intel_device_info *devinfo) const;
/**
* Return the subset of flag registers updated by the instruction (either
* partially or fully) as a bitset with byte granularity.
*/
intel/fs: sel.cond writes the flags on Gfx4 and Gfx5 On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented using a separte cmpn and sel instruction. This lowering occurs in fs_vistor::lower_minmax which is called very, very late... a long, long time after the first calls to opt_cmod_propagation. As a result, conditional modifiers can be incorrectly propagated across sel.cond on those platforms. No tests were affected by this change, and I find that quite shocking. After just changing flags_written(), all of the atan tests started failing on ILK. That required the change in cmod_propagatin (and the addition of the prop_across_into_sel_gfx5 unit test). Shader-db results for ILK and GM45 are below. I looked at a couple before and after shaders... and every case that I looked at had experienced incorrect cmod propagation. This affected a LOT of apps! Euro Truck Simulator 2, The Talos Principle, Serious Sam 3, Sanctum 2, Gang Beasts, and on and on... :( I discovered this bug while working on a couple new optimization passes. One of the passes attempts to remove condition modifiers that are never used. The pass made no progress except on ILK and GM45. After investigating a couple of the affected shaders, I noticed that the code in those shaders looked wrong... investigation led to this cause. v2: Trivial changes in the unit tests. v3: Fix type in comment in unit tests. Noticed by Jason and Priit. v4: Tweak handling of BRW_OPCODE_SEL special case. Suggested by Jason. 
Fixes: df1aec763eb ("i965/fs: Define methods to calculate the flag subset read or written by an fs_inst.") Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> Tested-by: Dave Airlie <airlied@redhat.com> Iron Lake total instructions in shared programs: 8180493 -> 8181781 (0.02%) instructions in affected programs: 541796 -> 543084 (0.24%) helped: 28 HURT: 1158 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.35% max: 0.86% x̄: 0.53% x̃: 0.50% HURT stats (abs) min: 1 max: 3 x̄: 1.14 x̃: 1 HURT stats (rel) min: 0.12% max: 4.00% x̄: 0.37% x̃: 0.23% 95% mean confidence interval for instructions value: 1.06 1.11 95% mean confidence interval for instructions %-change: 0.31% 0.38% Instructions are HURT. total cycles in shared programs: 239420470 -> 239421690 (<.01%) cycles in affected programs: 2925992 -> 2927212 (0.04%) helped: 49 HURT: 157 helped stats (abs) min: 2 max: 284 x̄: 62.69 x̃: 70 helped stats (rel) min: 0.04% max: 6.20% x̄: 1.68% x̃: 1.96% HURT stats (abs) min: 2 max: 48 x̄: 27.34 x̃: 24 HURT stats (rel) min: 0.02% max: 2.91% x̄: 0.31% x̃: 0.20% 95% mean confidence interval for cycles value: -0.80 12.64 95% mean confidence interval for cycles %-change: -0.31% <.01% Inconclusive result (value mean confidence interval includes 0). GM45 total instructions in shared programs: 4985517 -> 4986207 (0.01%) instructions in affected programs: 306935 -> 307625 (0.22%) helped: 14 HURT: 625 helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 helped stats (rel) min: 0.35% max: 0.82% x̄: 0.52% x̃: 0.49% HURT stats (abs) min: 1 max: 3 x̄: 1.13 x̃: 1 HURT stats (rel) min: 0.12% max: 3.90% x̄: 0.34% x̃: 0.22% 95% mean confidence interval for instructions value: 1.04 1.12 95% mean confidence interval for instructions %-change: 0.29% 0.36% Instructions are HURT. 
total cycles in shared programs: 153827268 -> 153828052 (<.01%) cycles in affected programs: 1669290 -> 1670074 (0.05%) helped: 24 HURT: 84 helped stats (abs) min: 2 max: 232 x̄: 64.33 x̃: 67 helped stats (rel) min: 0.04% max: 4.62% x̄: 1.60% x̃: 1.94% HURT stats (abs) min: 2 max: 48 x̄: 27.71 x̃: 24 HURT stats (rel) min: 0.02% max: 2.66% x̄: 0.34% x̃: 0.14% 95% mean confidence interval for cycles value: -1.94 16.46 95% mean confidence interval for cycles %-change: -0.29% 0.11% Inconclusive result (value mean confidence interval includes 0). Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12191>
2021-08-02 21:33:17 -07:00
unsigned flags_written(const intel_device_info *devinfo) const;
/**
* Return true if this instruction is a sampler message gathering residency
* data.
*/
bool has_sampler_residency() const;
/**
* Return true if this instruction is using the address register
* implicitly.
*/
bool uses_address_register_implicitly() const;
uint8_t sources; /**< Number of brw_reg sources. */
/**
* Execution size of the instruction. This is used by the generator to
* generate the correct binary for the given instruction. Current valid
* values are 1, 4, 8, 16, 32.
*/
uint8_t exec_size;
/**
* Channel group from the hardware execution and predication mask that
* should be applied to the instruction. The subset of channel enable
* signals (calculated from the EU control flow and predication state)
* given by [group, group + exec_size) will be used to mask GRF writes and
* any other side effects of the instruction.
*/
uint8_t group;
uint16_t size_written; /**< Data written to the destination register in bytes. */
enum opcode opcode; /* BRW_OPCODE_* or FS_OPCODE_* */
enum brw_conditional_mod conditional_mod; /**< BRW_CONDITIONAL_* */
enum brw_predicate predicate;
brw_inst_kind kind;
tgl_swsb sched; /**< Scheduling info. */
union {
struct {
/* Chooses which flag subregister (f0.0 to f3.1) is used for
* conditional mod and predication.
*/
uint8_t flag_subreg:3;
bool predicate_inverse:1;
bool writes_accumulator:1; /**< instruction implicitly writes accumulator */
bool force_writemask_all:1;
bool saturate:1;
/**
* The predication mask applied to this instruction is guaranteed to
* be uniform and a superset of the execution mask of the present block.
* No currently enabled channel will be disabled by the predicate.
*/
bool predicate_trivial:1;
bool eot:1;
bool keep_payload_trailing_zeros:1;
/**
* Whether the parameters of the SEND instructions are build with
* NoMask (for A32 messages this covers only the surface handle, for
* A64 messages this covers the load address).
*
* Also used to signal a dummy render target SEND message that is
* never executed.
*/
bool has_no_mask_send_params:1;
uint8_t pad:5;
};
uint16_t bits;
};
brw_reg dst;
brw_reg *src;
#ifndef NDEBUG
/** @{
* Annotation for the generated IR.
*/
const char *annotation;
/** @} */
#endif
bblock_t *block;
};
/** State specific to SEND/SENDC message instructions (BRW_KIND_SEND). */
struct brw_send_inst : brw_inst {
   uint32_t desc;    /**< SEND message descriptor. */
   uint32_t ex_desc; /**< SEND extended message descriptor. */
   uint32_t offset;  /**< Message offset; with @ex_desc_imm set, carries an
                      *   immediate part of the extended descriptor. */
   uint8_t mlen;     /**< Message payload length (NOTE(review): presumably in
                      *   registers, matching header_size below — confirm). */
   uint8_t ex_mlen;  /**< Extended (second) payload length. */
   uint8_t sfid;     /**< Shared function ID the message is directed to. */

   /** The number of hardware registers used for a message header. */
   uint8_t header_size;

   union {
      struct {
         /**
          * Turns it into a SENDC.
          */
         bool check_tdr:1;

         bool has_side_effects:1;
         bool is_volatile:1;

         /**
          * Use extended bindless surface offset (26bits instead of 20bits)
          */
         bool ex_bso:1;

         /**
          * Only for SHADER_OPCODE_SEND, @offset field contains an immediate
          * part of the extended descriptor that must be encoded in the
          * instruction.
          */
         bool ex_desc_imm:1;

         uint8_t pad:3;
      };
      uint8_t send_bits; /**< Raw access to all of the bitfields above. */
   };
};
/** State specific to texture sampling instructions (BRW_KIND_TEX). */
struct brw_tex_inst : brw_inst {
   uint32_t offset;          /**< Message offset (NOTE(review): confirm exact
                              *   semantics against the generator). */
   uint8_t coord_components; /**< Number of coordinate components. */
   uint8_t grad_components;  /**< Number of derivative/gradient components. */
   bool residency;           /**< Whether sparse-residency data is gathered
                              *   (see MAX_SAMPLER_MESSAGE_SIZE note above). */
};
/** State specific to memory load/store/atomic instructions (BRW_KIND_MEM). */
struct brw_mem_inst : brw_inst {
   enum lsc_opcode lsc_op;                  /**< LSC operation performed. */
   enum memory_logical_mode mode;           /**< Logical memory mode. */
   enum lsc_addr_surface_type binding_type; /**< LSC address surface type. */
   enum lsc_data_size data_size;            /**< LSC data element size. */
   uint8_t coord_components;                /**< Number of address components. */
   uint8_t components;                      /**< Number of data components. */
   uint8_t flags;
   /** Required alignment of address in bytes; 0 for natural alignment */
   uint32_t alignment;
   int32_t address_offset;                  /**< Signed offset applied to the
                                             *   address (NOTE(review): confirm
                                             *   units — presumably bytes). */
};
/** State specific to DPAS (systolic matrix) instructions (BRW_KIND_DPAS). */
struct brw_dpas_inst : brw_inst {
   /** Systolic depth. */
   uint8_t sdepth;
   /** Repeat count. */
   uint8_t rcount;
};
/** State specific to LOAD_PAYLOAD instructions (BRW_KIND_LOAD_PAYLOAD). */
struct brw_load_payload_inst : brw_inst {
   /** The number of hardware registers used for a message header. */
   uint8_t header_size;
};
/** State specific to URB access instructions (BRW_KIND_URB). */
struct brw_urb_inst : brw_inst {
   uint32_t offset;    /**< URB offset (NOTE(review): units not visible here
                        *   — confirm against the URB lowering code). */
   uint8_t components; /**< Number of components accessed. */
};
/** State specific to framebuffer-write instructions (BRW_KIND_FB_WRITE). */
struct brw_fb_write_inst : brw_inst {
   uint8_t components; /**< Number of color components written. */
   uint8_t target;     /**< Render target index. */
   bool null_rt;       /**< Write targets the null render target. */
   bool last_rt;       /**< This is the last render-target write. */
};
/**
 * Predicate the execution of \p inst on \p pred, inverting the sense of
 * the predicate when \p inverse is set. Returns \p inst for chaining.
 */
static inline brw_inst *
set_predicate_inv(enum brw_predicate pred, bool inverse,
                  brw_inst *inst)
{
   /* The two fields are independent; order doesn't matter. */
   inst->predicate_inverse = inverse;
   inst->predicate = pred;
   return inst;
}
/**
 * Predicate the execution of \p inst on \p pred (non-inverted). Returns
 * \p inst for chaining.
 */
static inline brw_inst *
set_predicate(enum brw_predicate pred, brw_inst *inst)
{
   /* Equivalent to set_predicate_inv(pred, false, inst), inlined. */
   inst->predicate = pred;
   inst->predicate_inverse = false;
   return inst;
}
/**
 * Attach conditional modifier \p mod to \p inst, making it write the
 * result of evaluating the condition to a flag register. Returns \p inst
 * for chaining.
 */
static inline brw_inst *
set_condmod(enum brw_conditional_mod mod, brw_inst *inst)
{
   inst->conditional_mod = mod;
   return inst;
}
/**
 * Set \p inst's saturation flag so its result is clamped to the
 * representable range of the destination datatype. Returns \p inst for
 * chaining.
 */
static inline brw_inst *
set_saturate(bool saturate, brw_inst *inst)
{
   inst->saturate = saturate;
   return inst;
}
/**
* Return the number of dataflow registers written by the instruction (either
* fully or partially) counted from 'floor(reg_offset(inst->dst) /
* register_size)'. The somewhat arbitrary register size unit is 4B for the
* UNIFORM and IMM files and 32B for all other files.
*/
inline unsigned
regs_written(const brw_inst *inst)
{
assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE +
inst->size_written -
MIN2(inst->size_written, reg_padding(inst->dst)),
REG_SIZE);
}
/**
* Return the number of dataflow registers read by the instruction (either
* fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
* register_size)'. The somewhat arbitrary register size unit is 4B for the
* UNIFORM files and 32B for all other files.
*/
inline unsigned
regs_read(const struct intel_device_info *devinfo, const brw_inst *inst, unsigned i)
{
if (inst->src[i].file == IMM)
return 1;
const unsigned reg_size = inst->src[i].file == UNIFORM ? 4 : REG_SIZE;
return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size +
inst->size_read(devinfo, i) -
MIN2(inst->size_read(devinfo, i), reg_padding(inst->src[i])),
reg_size);
}
enum brw_reg_type get_exec_type(const brw_inst *inst);
static inline unsigned
get_exec_type_size(const brw_inst *inst)
{
return brw_type_size_bytes(get_exec_type(inst));
}
/**
 * Return whether the instruction isn't an ALU instruction and cannot be
 * assumed to complete in-order.
 */
static inline bool
is_unordered(const intel_device_info *devinfo, const brw_inst *inst)
{
   if (inst->is_send())
      return true;

   /* Math instructions only count as unordered before Gfx20. */
   if (devinfo->ver < 20 && inst->is_math())
      return true;

   if (inst->opcode == BRW_OPCODE_DPAS)
      return true;

   /* On platforms routing 64-bit floats through the math pipe, DF
    * operations are unordered too.
    */
   return devinfo->has_64bit_float_via_math_pipe &&
          (get_exec_type(inst) == BRW_TYPE_DF ||
           inst->dst.type == BRW_TYPE_DF);
}
/**
 * Return whether any operand of \p inst (destination or any source) has a
 * bfloat type.
 */
static inline bool
has_bfloat_operands(const brw_inst *inst)
{
   if (brw_type_is_bfloat(inst->dst.type))
      return true;

   for (unsigned s = 0; s < inst->sources; s++) {
      if (brw_type_is_bfloat(inst->src[s].type))
         return true;
   }

   return false;
}
bool has_dst_aligned_region_restriction(const intel_device_info *devinfo,
const brw_inst *inst,
brw_reg_type dst_type);
static inline bool
has_dst_aligned_region_restriction(const intel_device_info *devinfo,
const brw_inst *inst)
{
return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type);
}
bool has_subdword_integer_region_restriction(const intel_device_info *devinfo,
const brw_inst *inst,
const brw_reg *srcs, unsigned num_srcs);
/* Gfx20+ (Xe2) imposes lengthy regioning restrictions on sub-dword integer
 * datatypes (BSpec 56640), constraining a source's regioning parameters
 * based on the source and destination types and the destination alignment.
 * Unsupported regions are lowered in brw_fs_lower_regioning.cpp by emitting
 * extra copies that shuffle the data into a supported layout.
 */
/**
 * Convenience overload of has_subdword_integer_region_restriction()
 * checking all of the instruction's sources.
 */
static inline bool
has_subdword_integer_region_restriction(const intel_device_info *devinfo,
                                        const brw_inst *inst)
{
   return has_subdword_integer_region_restriction(devinfo, inst, inst->src,
                                                  inst->sources);
}
bool is_identity_payload(const struct intel_device_info *devinfo,
brw_reg_file file, const brw_inst *inst);
bool is_multi_copy_payload(const struct intel_device_info *devinfo,
const brw_inst *inst);
bool is_coalescing_payload(const struct brw_shader &s, const brw_inst *inst);
bool has_bank_conflict(const struct brw_isa_info *isa, const brw_inst *inst);
/* Return the subset of flag registers that an instruction could
 * potentially read or write based on the execution controls and flag
 * subregister number of the instruction.
 */
static inline unsigned
brw_flag_mask(const brw_inst *inst, unsigned width)
{
   assert(util_is_power_of_two_nonzero(width));
   /* First channel bit the instruction may touch, rounded down to a
    * multiple of \p width; each flag subregister covers 16 channels.
    */
   const unsigned start = (inst->flag_subreg * 16 + inst->group) &
                          ~(width - 1);
   /* One past the last channel bit, with exec_size rounded up to \p width. */
   const unsigned end = start + ALIGN(inst->exec_size, width);
   /* Convert the [start, end) channel-bit range into a byte-granular
    * bitset of the flag registers.
    */
   return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
}
/* Return a mask of the low \p n bits, saturating to all-ones when \p n is
 * at least the bit width of unsigned (where the shift alone would be
 * undefined behavior).
 */
static inline unsigned
brw_bit_mask(unsigned n)
{
   if (n >= CHAR_BIT * sizeof(unsigned))
      return ~0u;

   return (1u << n) - 1;
}
/* Return the subset of flag registers covered by the region starting at
 * flag ARF register \p r and spanning \p sz bits, as a byte-granular
 * bitset, or 0 when \p r isn't an ARF register.
 */
static inline unsigned
brw_flag_mask(const brw_reg &r, unsigned sz)
{
   if (r.file != ARF)
      return 0;

   const unsigned first = (r.nr - BRW_ARF_FLAG) * 4 + r.subnr;
   return brw_bit_mask(first + sz) & ~brw_bit_mask(first);
}