This fixes a corner case of the LNL sub-dword integer restrictions
that wasn't being detected by has_subdword_integer_region_restriction(),
specifically:
> if (Src.Type==Byte && Dst.Type==Byte && Dst.Stride==1 && W!=2) {
>    // ...
>    if ((Src.Stride == 2) && (Src.UniformStride) && (Dst.SubReg%32 == Src.SubReg/2)) { Allowed }
>    // ...
> }
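For example (illustrative numbers, not taken from the BSpec): with
Src.Stride == 2, Src.UniformStride and Src.SubReg == 4, a packed byte
destination is only allowed when Dst.SubReg % 32 == 4 / 2 == 2,
e.g. Dst.SubReg == 2.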
All the other restrictions that require agreement between the SubReg
numbers of the source and destination only affect sources with a stride
greater than a dword, which is why
has_subdword_integer_region_restriction() was returning false unless
"byte_stride(srcs[i]) >= 4" evaluated to true. However, as implied by
the pseudocode above, in the particular case of a packed byte
destination the restriction applies to source strides as narrow as 2B.
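In code form, the corner case boils down to an extra clause in the
per-source loop of the check, along these lines (a sketch reusing the
byte_stride()/brw_type_size_bytes() helpers; the real version lives in
has_subdword_integer_region_restriction() below):

   /* Packed byte destination: even a 2B source stride requires the
    * source and destination SubReg numbers to agree.
    */
   if (MAX2(byte_stride(inst->dst), brw_type_size_bytes(inst->dst.type)) == 1 &&
       brw_type_size_bytes(srcs[i].type) == 1 &&
       byte_stride(srcs[i]) >= 2)
      return true;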
The form of the equation that relates the subreg numbers is consistent
with the existing calculations in brw_fs_lower_regioning (see
required_src_byte_offset()); we just need to enable lowering for this
corner case and change lower_dst_region() to call lower_instruction()
recursively, since some of the cases where we break this restriction
are copy instructions introduced by brw_fs_lower_regioning() itself
while trying to lower other instructions with byte destinations.
This fixes some Vulkan CTS test-cases that were hitting these
restrictions with byte data types.
Fixes: 217d412360 ("intel/fs/gfx20+: Implement sub-dword integer regioning restrictions.")
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30630>
603 lines
20 KiB
C++
/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_IR_FS_H
#define BRW_IR_FS_H

#include "brw_ir.h"
#include "brw_ir_allocator.h"

struct fs_inst : public exec_node {
private:
   fs_inst &operator=(const fs_inst &);

   void init(enum opcode opcode, uint8_t exec_width, const brw_reg &dst,
             const brw_reg *src, unsigned sources);

public:
   DECLARE_RALLOC_CXX_OPERATORS(fs_inst)

   fs_inst();
   fs_inst(enum opcode opcode, uint8_t exec_size);
   fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst);
   fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
           const brw_reg &src0);
   fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
           const brw_reg &src0, const brw_reg &src1);
   fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
           const brw_reg &src0, const brw_reg &src1, const brw_reg &src2);
   fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
           const brw_reg src[], unsigned sources);
   fs_inst(const fs_inst &that);
   ~fs_inst();

   void resize_sources(uint8_t num_sources);

   bool is_send_from_grf() const;
   bool is_payload(unsigned arg) const;
   bool is_partial_write() const;
   unsigned components_read(unsigned i) const;
   unsigned size_read(int arg) const;
   bool can_do_source_mods(const struct intel_device_info *devinfo) const;
   bool can_do_cmod() const;
   bool can_change_types() const;
   bool has_source_and_destination_hazard() const;

   bool is_3src(const struct brw_compiler *compiler) const;
   bool is_math() const;
   bool is_control_flow_begin() const;
   bool is_control_flow_end() const;
   bool is_control_flow() const;
   bool is_commutative() const;
   bool is_raw_move() const;
   bool can_do_saturate() const;
   bool reads_accumulator_implicitly() const;
   bool writes_accumulator_implicitly(const struct intel_device_info *devinfo) const;

   /**
    * Instructions that use indirect addressing have additional register
    * regioning restrictions.
    */
   bool uses_indirect_addressing() const;

   void remove(bblock_t *block, bool defer_later_block_ip_updates = false);
   void insert_after(bblock_t *block, fs_inst *inst);
   void insert_before(bblock_t *block, fs_inst *inst);

   /**
    * True if the instruction has side effects other than writing to
    * its destination registers. You are expected not to reorder or
    * optimize these out unless you know what you are doing.
    */
   bool has_side_effects() const;

   /**
    * True if the instruction might be affected by side effects of other
    * instructions.
    */
   bool is_volatile() const;

   /**
    * Return whether \p arg is a control source of a virtual instruction which
    * shouldn't contribute to the execution type and usual regioning
    * restriction calculations of arithmetic instructions.
    */
   bool is_control_source(unsigned arg) const;

   /**
    * Return the subset of flag registers read by the instruction as a bitset
    * with byte granularity.
    */
   unsigned flags_read(const intel_device_info *devinfo) const;

   /**
    * Return the subset of flag registers updated by the instruction (either
    * partially or fully) as a bitset with byte granularity.
    */
   unsigned flags_written(const intel_device_info *devinfo) const;

   /**
    * Return true if this instruction is a sampler message gathering residency
    * data.
    */
   bool has_sampler_residency() const;

   uint8_t sources; /**< Number of brw_reg sources. */

   /**
    * Execution size of the instruction. This is used by the generator to
    * generate the correct binary for the given instruction. Current valid
    * values are 1, 4, 8, 16, 32.
    */
   uint8_t exec_size;

   /**
    * Channel group from the hardware execution and predication mask that
    * should be applied to the instruction. The subset of channel enable
    * signals (calculated from the EU control flow and predication state)
    * given by [group, group + exec_size) will be used to mask GRF writes and
    * any other side effects of the instruction.
    */
   uint8_t group;

   uint8_t mlen; /**< SEND message length */
   uint8_t ex_mlen; /**< SENDS extended message length */
   uint8_t sfid; /**< SFID for SEND instructions */
   /** The number of hardware registers used for a message header. */
   uint8_t header_size;
   uint8_t target; /**< MRT target. */
   uint32_t desc; /**< SEND[S] message descriptor immediate */
   uint32_t ex_desc; /**< SEND[S] extended message descriptor immediate */

   uint32_t offset; /**< spill/unspill offset or texture offset bitfield */
   unsigned size_written; /**< Data written to the destination register in bytes. */

   enum opcode opcode; /* BRW_OPCODE_* or FS_OPCODE_* */
   enum brw_conditional_mod conditional_mod; /**< BRW_CONDITIONAL_* */
   enum brw_predicate predicate;

   tgl_swsb sched; /**< Scheduling info. */

   union {
      struct {
         /* Chooses which flag subregister (f0.0 to f3.1) is used for
          * conditional mod and predication.
          */
         unsigned flag_subreg:3;

         /**
          * Systolic depth used by DPAS instruction.
          */
         unsigned sdepth:4;

         /**
          * Repeat count used by DPAS instruction.
          */
         unsigned rcount:4;

         unsigned pad:2;

         bool predicate_inverse:1;
         bool writes_accumulator:1; /**< instruction implicitly writes accumulator */
         bool force_writemask_all:1;
         bool no_dd_clear:1;
         bool no_dd_check:1;
         bool saturate:1;
         bool shadow_compare:1;
         bool check_tdr:1; /**< Only valid for SEND; turns it into a SENDC */
         bool send_has_side_effects:1; /**< Only valid for SHADER_OPCODE_SEND */
         bool send_is_volatile:1; /**< Only valid for SHADER_OPCODE_SEND */
         bool send_ex_desc_scratch:1; /**< Only valid for SHADER_OPCODE_SEND, use
                                       *   the scratch surface offset to build
                                       *   extended descriptor
                                       */
         bool send_ex_bso:1; /**< Only for SHADER_OPCODE_SEND, use extended
                              *   bindless surface offset (26bits instead of
                              *   20bits)
                              */
         /**
          * The predication mask applied to this instruction is guaranteed to
          * be uniform and a superset of the execution mask of the present block.
          * No currently enabled channel will be disabled by the predicate.
          */
         bool predicate_trivial:1;
         bool eot:1;
         bool last_rt:1;
         bool pi_noperspective:1; /**< Pixel interpolator noperspective flag */
         bool keep_payload_trailing_zeros:1;
         /**
          * Whether the parameters of the SEND instructions are built with
          * NoMask (for A32 messages this covers only the surface handle, for
          * A64 messages this covers the load address).
          */
         bool has_no_mask_send_params:1;
      };
      uint32_t bits;
   };

   brw_reg dst;
   brw_reg *src;
   brw_reg builtin_src[4];

#ifndef NDEBUG
   /** @{
    * Annotation for the generated IR.
    */
   const char *annotation;
   /** @} */
#endif
};

/**
 * Make the execution of \p inst dependent on the evaluation of a possibly
 * inverted predicate.
 */
static inline fs_inst *
set_predicate_inv(enum brw_predicate pred, bool inverse,
                  fs_inst *inst)
{
   inst->predicate = pred;
   inst->predicate_inverse = inverse;
   return inst;
}

/**
 * Make the execution of \p inst dependent on the evaluation of a predicate.
 */
static inline fs_inst *
set_predicate(enum brw_predicate pred, fs_inst *inst)
{
   return set_predicate_inv(pred, false, inst);
}

/**
 * Write the result of evaluating the condition given by \p mod to a flag
 * register.
 */
static inline fs_inst *
set_condmod(enum brw_conditional_mod mod, fs_inst *inst)
{
   inst->conditional_mod = mod;
   return inst;
}

/**
 * Clamp the result of \p inst to the saturation range of its destination
 * datatype.
 */
static inline fs_inst *
set_saturate(bool saturate, fs_inst *inst)
{
   inst->saturate = saturate;
   return inst;
}

/**
 * Return the number of dataflow registers written by the instruction (either
 * fully or partially) counted from 'floor(reg_offset(inst->dst) /
 * register_size)'. The somewhat arbitrary register size unit is 4B for the
 * UNIFORM and IMM files and 32B for all other files.
 */
inline unsigned
regs_written(const fs_inst *inst)
{
   assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
   return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE +
                       inst->size_written -
                       MIN2(inst->size_written, reg_padding(inst->dst)),
                       REG_SIZE);
}

/**
 * Return the number of dataflow registers read by the instruction (either
 * fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
 * register_size)'. The somewhat arbitrary register size unit is 4B for the
 * UNIFORM files and 32B for all other files.
 */
inline unsigned
regs_read(const fs_inst *inst, unsigned i)
{
   if (inst->src[i].file == IMM)
      return 1;

   const unsigned reg_size = inst->src[i].file == UNIFORM ? 4 : REG_SIZE;
   return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size +
                       inst->size_read(i) -
                       MIN2(inst->size_read(i), reg_padding(inst->src[i])),
                       reg_size);
}

static inline enum brw_reg_type
get_exec_type(const fs_inst *inst)
{
   brw_reg_type exec_type = BRW_TYPE_B;

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].file != BAD_FILE &&
          !inst->is_control_source(i)) {
         const brw_reg_type t = get_exec_type(inst->src[i].type);
         if (brw_type_size_bytes(t) > brw_type_size_bytes(exec_type))
            exec_type = t;
         else if (brw_type_size_bytes(t) == brw_type_size_bytes(exec_type) &&
                  brw_type_is_float(t))
            exec_type = t;
      }
   }

   if (exec_type == BRW_TYPE_B)
      exec_type = inst->dst.type;

   assert(exec_type != BRW_TYPE_B);

   /* Promotion of the execution type to 32-bit for conversions from or to
    * half-float seems to be consistent with the following text from the
    * Cherryview PRM Vol. 7, "Execution Data Type":
    *
    * "When single precision and half precision floats are mixed between
    * source operands or between source and destination operand [..] single
    * precision float is the execution datatype."
    *
    * and from "Register Region Restrictions":
    *
    * "Conversion between Integer and HF (Half Float) must be DWord aligned
    * and strided by a DWord on the destination."
    */
   if (brw_type_size_bytes(exec_type) == 2 &&
       inst->dst.type != exec_type) {
      if (exec_type == BRW_TYPE_HF)
         exec_type = BRW_TYPE_F;
      else if (inst->dst.type == BRW_TYPE_HF)
         exec_type = BRW_TYPE_D;
   }

   return exec_type;
}

static inline unsigned
get_exec_type_size(const fs_inst *inst)
{
   return brw_type_size_bytes(get_exec_type(inst));
}

static inline bool
is_send(const fs_inst *inst)
{
   return inst->mlen || inst->is_send_from_grf();
}

/**
 * Return whether the instruction isn't an ALU instruction and cannot be
 * assumed to complete in-order.
 */
static inline bool
is_unordered(const intel_device_info *devinfo, const fs_inst *inst)
{
   return is_send(inst) || (devinfo->ver < 20 && inst->is_math()) ||
          inst->opcode == BRW_OPCODE_DPAS ||
          (devinfo->has_64bit_float_via_math_pipe &&
           (get_exec_type(inst) == BRW_TYPE_DF ||
            inst->dst.type == BRW_TYPE_DF));
}

/**
 * Return whether the following regioning restriction applies to the specified
 * instruction. From the Cherryview PRM Vol 7. "Register Region
 * Restrictions":
 *
 * "When source or destination datatype is 64b or operation is integer DWord
 * multiply, regioning in Align1 must follow these rules:
 *
 * 1. Source and Destination horizontal stride must be aligned to the same qword.
 * 2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
 * 3. Source and Destination offset must be the same, except the case of
 *    scalar source."
 */
static inline bool
has_dst_aligned_region_restriction(const intel_device_info *devinfo,
                                   const fs_inst *inst,
                                   brw_reg_type dst_type)
{
   const brw_reg_type exec_type = get_exec_type(inst);
   /* Even though the hardware spec claims that "integer DWord multiply"
    * operations are restricted, empirical evidence and the behavior of the
    * simulator suggest that only 32x32-bit integer multiplication is
    * restricted.
    */
   const bool is_dword_multiply = !brw_type_is_float(exec_type) &&
      ((inst->opcode == BRW_OPCODE_MUL &&
        MIN2(brw_type_size_bytes(inst->src[0].type), brw_type_size_bytes(inst->src[1].type)) >= 4) ||
       (inst->opcode == BRW_OPCODE_MAD &&
        MIN2(brw_type_size_bytes(inst->src[1].type), brw_type_size_bytes(inst->src[2].type)) >= 4));

   if (brw_type_size_bytes(dst_type) > 4 || brw_type_size_bytes(exec_type) > 4 ||
       (brw_type_size_bytes(exec_type) == 4 && is_dword_multiply))
      return intel_device_info_is_9lp(devinfo) || devinfo->verx10 >= 125;

   else if (brw_type_is_float(dst_type))
      return devinfo->verx10 >= 125;

   else
      return false;
}

static inline bool
has_dst_aligned_region_restriction(const intel_device_info *devinfo,
                                   const fs_inst *inst)
{
   return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type);
}

/**
 * Return true if the instruction can be potentially affected by the Xe2+
 * regioning restrictions that apply to integer types smaller than a dword.
 * The restriction isn't quoted here due to its length, see BSpec #56640 for
 * details.
 */
static inline bool
has_subdword_integer_region_restriction(const intel_device_info *devinfo,
                                        const fs_inst *inst,
                                        const brw_reg *srcs, unsigned num_srcs)
{
   if (devinfo->ver >= 20 &&
       brw_type_is_int(inst->dst.type) &&
       MAX2(byte_stride(inst->dst),
            brw_type_size_bytes(inst->dst.type)) < 4) {
      for (unsigned i = 0; i < num_srcs; i++) {
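         /* Two kinds of source regions trip the restriction: a sub-dword
          * integer source strided by a dword or more, and the corner case of
          * a packed byte destination fed by a byte source with a stride of
          * 2B or more.
          */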
         if (brw_type_is_int(srcs[i].type) &&
             ((brw_type_size_bytes(srcs[i].type) < 4 &&
               byte_stride(srcs[i]) >= 4) ||
              (MAX2(byte_stride(inst->dst),
                    brw_type_size_bytes(inst->dst.type)) == 1 &&
               brw_type_size_bytes(srcs[i].type) == 1 &&
               byte_stride(srcs[i]) >= 2)))
            return true;
      }
   }

   return false;
}

static inline bool
has_subdword_integer_region_restriction(const intel_device_info *devinfo,
                                        const fs_inst *inst)
{
   return has_subdword_integer_region_restriction(devinfo, inst,
                                                  inst->src, inst->sources);
}

/**
 * Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from
 * the specified register file into a VGRF.
 *
 * This implies identity register regions without any source-destination
 * overlap, but otherwise has no implications on the location of sources and
 * destination in the register file: Gathering any number of portions from
 * multiple virtual registers in any order is allowed.
 */
inline bool
is_copy_payload(brw_reg_file file, const fs_inst *inst)
{
   if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD ||
       inst->is_partial_write() || inst->saturate ||
       inst->dst.file != VGRF)
      return false;

   for (unsigned i = 0; i < inst->sources; i++) {
      if (inst->src[i].file != file ||
          inst->src[i].abs || inst->src[i].negate)
         return false;

      if (!inst->src[i].is_contiguous())
         return false;

      if (regions_overlap(inst->dst, inst->size_written,
                          inst->src[i], inst->size_read(i)))
         return false;
   }

   return true;
}

/**
 * Like is_copy_payload(), but the instruction is required to copy a single
 * contiguous block of registers from the given register file into the
 * destination without any reordering.
 */
inline bool
is_identity_payload(brw_reg_file file, const fs_inst *inst) {
   if (is_copy_payload(file, inst)) {
      brw_reg reg = inst->src[0];

      for (unsigned i = 0; i < inst->sources; i++) {
         reg.type = inst->src[i].type;
         if (!inst->src[i].equals(reg))
            return false;

         reg = byte_offset(reg, inst->size_read(i));
      }

      return true;
   } else {
      return false;
   }
}

/**
 * Like is_copy_payload(), but the instruction is required to source data from
 * at least two disjoint VGRFs.
 *
 * This doesn't necessarily rule out the elimination of this instruction
 * through register coalescing, but due to limitations of the register
 * coalesce pass it might be impossible to do so directly until a later stage,
 * when the LOAD_PAYLOAD instruction is unrolled into a sequence of MOV
 * instructions.
 */
inline bool
is_multi_copy_payload(const fs_inst *inst) {
   if (is_copy_payload(VGRF, inst)) {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].nr != inst->src[0].nr)
            return true;
      }
   }

   return false;
}

/**
 * Like is_identity_payload(), but the instruction is required to copy the
 * whole contents of a single VGRF into the destination.
 *
 * This means that there is a good chance that the instruction will be
 * eliminated through register coalescing, but it's neither a necessary nor a
 * sufficient condition for that to happen -- E.g. consider the case where
 * source and destination registers diverge due to other instructions in the
 * program overwriting part of their contents, which isn't something we can
 * predict up front based on a cheap strictly local test of the copy
 * instruction.
 */
inline bool
is_coalescing_payload(const brw::simple_allocator &alloc, const fs_inst *inst)
{
   return is_identity_payload(VGRF, inst) &&
          inst->src[0].offset == 0 &&
          alloc.sizes[inst->src[0].nr] * REG_SIZE == inst->size_written;
}

bool
has_bank_conflict(const struct brw_isa_info *isa, const fs_inst *inst);

/* Return the subset of flag registers that an instruction could
 * potentially read or write based on the execution controls and flag
 * subregister number of the instruction.
 */
static inline unsigned
brw_fs_flag_mask(const fs_inst *inst, unsigned width)
{
   assert(util_is_power_of_two_nonzero(width));
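   /* start and end are bit positions within the flag register file (each
    * flag subregister holds 16 channel bits); the returned mask has one bit
    * per flag byte.
    */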
   const unsigned start = (inst->flag_subreg * 16 + inst->group) &
                          ~(width - 1);
   const unsigned end = start + ALIGN(inst->exec_size, width);
   return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
}

static inline unsigned
brw_fs_bit_mask(unsigned n)
{
   return (n >= CHAR_BIT * sizeof(brw_fs_bit_mask(n)) ? ~0u : (1u << n) - 1);
}

static inline unsigned
brw_fs_flag_mask(const brw_reg &r, unsigned sz)
{
   if (r.file == ARF) {
      const unsigned start = (r.nr - BRW_ARF_FLAG) * 4 + r.subnr;
      const unsigned end = start + sz;
      return brw_fs_bit_mask(end) & ~brw_fs_bit_mask(start);
   } else {
      return 0;
   }
}

#endif