mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 02:38:04 +02:00
vc4: Implement live intervals using a CFG.
Right now our CFG is always a trivial single basic block, but that will change when we enable loops.
This commit is contained in:
parent
f2eb8e3052
commit
89918c1e74
6 changed files with 393 additions and 39 deletions
|
|
@ -31,6 +31,7 @@ C_SOURCES := \
|
|||
vc4_opt_vpm.c \
|
||||
vc4_program.c \
|
||||
vc4_qir.c \
|
||||
vc4_qir_live_variables.c \
|
||||
vc4_qir_lower_uniforms.c \
|
||||
vc4_qir_schedule.c \
|
||||
vc4_qir_validate.c \
|
||||
|
|
|
|||
|
|
@ -224,6 +224,53 @@ qir_writes_r4(struct qinst *inst)
|
|||
}
|
||||
}
|
||||
|
||||
uint8_t
|
||||
qir_channels_written(struct qinst *inst)
|
||||
{
|
||||
if (qir_is_mul(inst)) {
|
||||
switch (inst->dst.pack) {
|
||||
case QPU_PACK_MUL_NOP:
|
||||
case QPU_PACK_MUL_8888:
|
||||
return 0xf;
|
||||
case QPU_PACK_MUL_8A:
|
||||
return 0x1;
|
||||
case QPU_PACK_MUL_8B:
|
||||
return 0x2;
|
||||
case QPU_PACK_MUL_8C:
|
||||
return 0x4;
|
||||
case QPU_PACK_MUL_8D:
|
||||
return 0x8;
|
||||
}
|
||||
} else {
|
||||
switch (inst->dst.pack) {
|
||||
case QPU_PACK_A_NOP:
|
||||
case QPU_PACK_A_8888:
|
||||
case QPU_PACK_A_8888_SAT:
|
||||
case QPU_PACK_A_32_SAT:
|
||||
return 0xf;
|
||||
case QPU_PACK_A_8A:
|
||||
case QPU_PACK_A_8A_SAT:
|
||||
return 0x1;
|
||||
case QPU_PACK_A_8B:
|
||||
case QPU_PACK_A_8B_SAT:
|
||||
return 0x2;
|
||||
case QPU_PACK_A_8C:
|
||||
case QPU_PACK_A_8C_SAT:
|
||||
return 0x4;
|
||||
case QPU_PACK_A_8D:
|
||||
case QPU_PACK_A_8D_SAT:
|
||||
return 0x8;
|
||||
case QPU_PACK_A_16A:
|
||||
case QPU_PACK_A_16A_SAT:
|
||||
return 0x3;
|
||||
case QPU_PACK_A_16B:
|
||||
case QPU_PACK_A_16B_SAT:
|
||||
return 0xc;
|
||||
}
|
||||
}
|
||||
unreachable("Bad pack field");
|
||||
}
|
||||
|
||||
static void
|
||||
qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -38,6 +38,7 @@
|
|||
|
||||
#include "vc4_screen.h"
|
||||
#include "vc4_qpu_defines.h"
|
||||
#include "vc4_qpu.h"
|
||||
#include "kernel/vc4_packet.h"
|
||||
#include "pipe/p_state.h"
|
||||
|
||||
|
|
@ -353,6 +354,14 @@ struct qblock {
|
|||
struct qblock *successors[2];
|
||||
|
||||
int index;
|
||||
|
||||
/** @{ used by vc4_qir_live_variables.c */
|
||||
BITSET_WORD *def;
|
||||
BITSET_WORD *use;
|
||||
BITSET_WORD *live_in;
|
||||
BITSET_WORD *live_out;
|
||||
int start_ip, end_ip;
|
||||
/** @} */
|
||||
};
|
||||
|
||||
struct vc4_compile {
|
||||
|
|
@ -422,6 +431,9 @@ struct vc4_compile {
|
|||
struct vc4_fs_key *fs_key;
|
||||
struct vc4_vs_key *vs_key;
|
||||
|
||||
/* Live ranges of temps. */
|
||||
int *temp_start, *temp_end;
|
||||
|
||||
uint32_t *uniform_data;
|
||||
enum quniform_contents *uniform_contents;
|
||||
uint32_t uniform_array_size;
|
||||
|
|
@ -488,6 +500,7 @@ struct qreg qir_emit_def(struct vc4_compile *c, struct qinst *inst);
|
|||
struct qinst *qir_emit_nondef(struct vc4_compile *c, struct qinst *inst);
|
||||
|
||||
struct qreg qir_get_temp(struct vc4_compile *c);
|
||||
void qir_calculate_live_intervals(struct vc4_compile *c);
|
||||
int qir_get_op_nsrc(enum qop qop);
|
||||
bool qir_reg_equals(struct qreg a, struct qreg b);
|
||||
bool qir_has_side_effects(struct vc4_compile *c, struct qinst *inst);
|
||||
|
|
@ -499,6 +512,7 @@ bool qir_is_float_input(struct qinst *inst);
|
|||
bool qir_depends_on_flags(struct qinst *inst);
|
||||
bool qir_writes_r4(struct qinst *inst);
|
||||
struct qreg qir_follow_movs(struct vc4_compile *c, struct qreg reg);
|
||||
uint8_t qir_channels_written(struct qinst *inst);
|
||||
|
||||
void qir_dump(struct vc4_compile *c);
|
||||
void qir_dump_inst(struct vc4_compile *c, struct qinst *inst);
|
||||
|
|
@ -667,7 +681,7 @@ qir_SEL(struct vc4_compile *c, uint8_t cond, struct qreg src0, struct qreg src1)
|
|||
struct qinst *a = qir_MOV_dest(c, t, src0);
|
||||
struct qinst *b = qir_MOV_dest(c, t, src1);
|
||||
a->cond = cond;
|
||||
b->cond = cond ^ 1;
|
||||
b->cond = qpu_cond_complement(cond);
|
||||
return t;
|
||||
}
|
||||
|
||||
|
|
|
|||
316
src/gallium/drivers/vc4/vc4_qir_live_variables.c
Normal file
316
src/gallium/drivers/vc4/vc4_qir_live_variables.c
Normal file
|
|
@ -0,0 +1,316 @@
|
|||
/*
|
||||
* Copyright © 2012 Intel Corporation
|
||||
* Copyright © 2016 Broadcom
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#define MAX_INSTRUCTION (1 << 30)
|
||||
|
||||
#include "util/ralloc.h"
|
||||
#include "util/register_allocate.h"
|
||||
#include "vc4_context.h"
|
||||
#include "vc4_qir.h"
|
||||
|
||||
/**
 * Per-temp bookkeeping for destinations that are written piecewise (by
 * channel and/or under a condition) within a block.
 */
struct partial_update_state {
        /* For each of the 4 channels, the conditional instruction that last
         * wrote it while waiting for a write under the complementary
         * condition (NULL if none is recorded).
         */
        struct qinst *insts[4];

        /* Bitmask of channels known to be completely written so far. */
        uint8_t channels;
};
|
||||
|
||||
/* Hash callback for tables keyed by a pointer to an int: hashes the
 * sizeof(int) bytes that the key points at.
 */
static uint32_t
int_hash(const void *key)
{
        uint32_t hash = _mesa_hash_data(key, sizeof(int));

        return hash;
}
|
||||
|
||||
/* Key-equality callback for tables keyed by a pointer to an int: two keys
 * match when the ints they point at are equal.
 */
static bool
int_compare(const void *key1, const void *key2)
{
        const int *lhs = key1;
        const int *rhs = key2;

        return *lhs == *rhs;
}
|
||||
|
||||
static int
|
||||
qir_reg_to_var(struct qreg reg)
|
||||
{
|
||||
if (reg.file == QFILE_TEMP)
|
||||
return reg.index;
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
static void
|
||||
qir_setup_use(struct vc4_compile *c, struct qblock *block, int ip,
|
||||
struct qreg src)
|
||||
{
|
||||
int var = qir_reg_to_var(src);
|
||||
if (var == -1)
|
||||
return;
|
||||
|
||||
c->temp_start[var] = MIN2(c->temp_start[var], ip);
|
||||
c->temp_end[var] = MAX2(c->temp_end[var], ip);
|
||||
|
||||
/* The use[] bitset marks when the block makes
|
||||
* use of a variable without having completely
|
||||
* defined that variable within the block.
|
||||
*/
|
||||
if (!BITSET_TEST(block->def, var))
|
||||
BITSET_SET(block->use, var);
|
||||
}
|
||||
|
||||
static struct partial_update_state *
|
||||
get_partial_update_state(struct hash_table *partial_update_ht,
|
||||
struct qinst *inst)
|
||||
{
|
||||
struct hash_entry *entry =
|
||||
_mesa_hash_table_search(partial_update_ht,
|
||||
&inst->dst.index);
|
||||
if (entry)
|
||||
return entry->data;
|
||||
|
||||
struct partial_update_state *state =
|
||||
rzalloc(partial_update_ht, struct partial_update_state);
|
||||
|
||||
_mesa_hash_table_insert(partial_update_ht, &inst->dst.index, state);
|
||||
|
||||
return state;
|
||||
}
|
||||
|
||||
static void
|
||||
qir_setup_def(struct vc4_compile *c, struct qblock *block, int ip,
|
||||
struct hash_table *partial_update_ht, struct qinst *inst)
|
||||
{
|
||||
/* The def[] bitset marks when an initialization in a
|
||||
* block completely screens off previous updates of
|
||||
* that variable.
|
||||
*/
|
||||
int var = qir_reg_to_var(inst->dst);
|
||||
if (var == -1)
|
||||
return;
|
||||
|
||||
c->temp_start[var] = MIN2(c->temp_start[var], ip);
|
||||
c->temp_end[var] = MAX2(c->temp_end[var], ip);
|
||||
|
||||
/* If we've already tracked this as a def, or already used it within
|
||||
* the block, there's nothing to do.
|
||||
*/
|
||||
if (BITSET_TEST(block->use, var) || BITSET_TEST(block->def, var))
|
||||
return;
|
||||
|
||||
/* Easy, common case: unconditional full register update. */
|
||||
if (inst->cond == QPU_COND_ALWAYS && !inst->dst.pack) {
|
||||
BITSET_SET(block->def, var);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Finally, look at the condition code and packing and mark it as a
|
||||
* def. We need to make sure that we understand sequences
|
||||
* instructions like:
|
||||
*
|
||||
* mov.zs t0, t1
|
||||
* mov.zc t0, t2
|
||||
*
|
||||
* or:
|
||||
*
|
||||
* mmov t0.8a, t1
|
||||
* mmov t0.8b, t2
|
||||
* mmov t0.8c, t3
|
||||
* mmov t0.8d, t4
|
||||
*
|
||||
* as defining the temp within the block, because otherwise dst's live
|
||||
* range will get extended up the control flow to the top of the
|
||||
* program.
|
||||
*/
|
||||
struct partial_update_state *state =
|
||||
get_partial_update_state(partial_update_ht, inst);
|
||||
uint8_t mask = qir_channels_written(inst);
|
||||
|
||||
if (inst->cond == QPU_COND_ALWAYS) {
|
||||
state->channels |= mask;
|
||||
} else {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (!(mask & (1 << i)))
|
||||
continue;
|
||||
|
||||
if (state->insts[i] &&
|
||||
state->insts[i]->cond ==
|
||||
qpu_cond_complement(inst->cond))
|
||||
state->channels |= 1 << i;
|
||||
else
|
||||
state->insts[i] = inst;
|
||||
}
|
||||
}
|
||||
|
||||
if (state->channels == 0xf)
|
||||
BITSET_SET(block->def, var);
|
||||
}
|
||||
|
||||
static void
|
||||
sf_state_clear(struct hash_table *partial_update_ht)
|
||||
{
|
||||
struct hash_entry *entry;
|
||||
|
||||
hash_table_foreach(partial_update_ht, entry) {
|
||||
struct partial_update_state *state = entry->data;
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (state->insts[i] && state->insts[i]->cond)
|
||||
state->insts[i] = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Sets up the def/use arrays for when variables are used-before-defined or
|
||||
* defined-before-used in the block.
|
||||
*
|
||||
* Also initializes the temp_start/temp_end to cover just the instruction IPs
|
||||
* where the variable is used, which will be extended later in
|
||||
* qir_compute_start_end().
|
||||
*/
|
||||
static void
|
||||
qir_setup_def_use(struct vc4_compile *c)
|
||||
{
|
||||
struct hash_table *partial_update_ht =
|
||||
_mesa_hash_table_create(c, int_hash, int_compare);
|
||||
int ip = 0;
|
||||
|
||||
qir_for_each_block(block, c) {
|
||||
block->start_ip = ip;
|
||||
|
||||
_mesa_hash_table_clear(partial_update_ht, NULL);
|
||||
|
||||
qir_for_each_inst(inst, block) {
|
||||
for (int i = 0; i < qir_get_op_nsrc(inst->op); i++)
|
||||
qir_setup_use(c, block, ip, inst->src[i]);
|
||||
|
||||
qir_setup_def(c, block, ip, partial_update_ht, inst);
|
||||
|
||||
if (inst->sf)
|
||||
sf_state_clear(partial_update_ht);
|
||||
|
||||
switch (inst->op) {
|
||||
case QOP_FRAG_Z:
|
||||
case QOP_FRAG_W:
|
||||
/* The payload registers have values
|
||||
* implicitly loaded at the start of the
|
||||
* program.
|
||||
*/
|
||||
if (inst->dst.file == QFILE_TEMP)
|
||||
c->temp_start[inst->dst.index] = 0;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
ip++;
|
||||
}
|
||||
block->end_ip = ip;
|
||||
}
|
||||
|
||||
_mesa_hash_table_destroy(partial_update_ht, NULL);
|
||||
}
|
||||
|
||||
static bool
|
||||
qir_live_variables_dataflow(struct vc4_compile *c, int bitset_words)
|
||||
{
|
||||
bool cont = false;
|
||||
|
||||
qir_for_each_block_rev(block, c) {
|
||||
/* Update live_out: Any successor using the variable
|
||||
* on entrance needs us to have the variable live on
|
||||
* exit.
|
||||
*/
|
||||
qir_for_each_successor(succ, block) {
|
||||
for (int i = 0; i < bitset_words; i++) {
|
||||
BITSET_WORD new_live_out = (succ->live_in[i] &
|
||||
~block->live_out[i]);
|
||||
if (new_live_out) {
|
||||
block->live_out[i] |= new_live_out;
|
||||
cont = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Update live_in */
|
||||
for (int i = 0; i < bitset_words; i++) {
|
||||
BITSET_WORD new_live_in = (block->use[i] |
|
||||
(block->live_out[i] &
|
||||
~block->def[i]));
|
||||
if (new_live_in & ~block->live_in[i]) {
|
||||
block->live_in[i] |= new_live_in;
|
||||
cont = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return cont;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extend the start/end ranges for each variable to account for the
|
||||
* new information calculated from control flow.
|
||||
*/
|
||||
static void
|
||||
qir_compute_start_end(struct vc4_compile *c, int num_vars)
|
||||
{
|
||||
qir_for_each_block(block, c) {
|
||||
for (int i = 0; i < num_vars; i++) {
|
||||
if (BITSET_TEST(block->live_in, i)) {
|
||||
c->temp_start[i] = MIN2(c->temp_start[i],
|
||||
block->start_ip);
|
||||
c->temp_end[i] = MAX2(c->temp_end[i],
|
||||
block->start_ip);
|
||||
}
|
||||
|
||||
if (BITSET_TEST(block->live_out, i)) {
|
||||
c->temp_start[i] = MIN2(c->temp_start[i],
|
||||
block->end_ip);
|
||||
c->temp_end[i] = MAX2(c->temp_end[i],
|
||||
block->end_ip);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
qir_calculate_live_intervals(struct vc4_compile *c)
|
||||
{
|
||||
int bitset_words = BITSET_WORDS(c->num_temps);
|
||||
|
||||
c->temp_start = reralloc(c, c->temp_start, int, c->num_temps);
|
||||
c->temp_end = reralloc(c, c->temp_end, int, c->num_temps);
|
||||
|
||||
for (int i = 0; i < c->num_temps; i++) {
|
||||
c->temp_start[i] = MAX_INSTRUCTION;
|
||||
c->temp_end[i] = -1;
|
||||
}
|
||||
|
||||
qir_for_each_block(block, c) {
|
||||
block->def = reralloc(c, block->def, BITSET_WORD, bitset_words);
|
||||
block->use = reralloc(c, block->use, BITSET_WORD, bitset_words);
|
||||
block->live_in = reralloc(c, block->live_in, BITSET_WORD, bitset_words);
|
||||
block->live_out = reralloc(c, block->live_out, BITSET_WORD, bitset_words);
|
||||
}
|
||||
|
||||
qir_setup_def_use(c);
|
||||
|
||||
while (qir_live_variables_dataflow(c, bitset_words))
|
||||
;
|
||||
|
||||
qir_compute_start_end(c, c->num_temps);
|
||||
}
|
||||
|
|
@ -153,6 +153,12 @@ bool qpu_inst_is_tlb(uint64_t inst) ATTRIBUTE_CONST;
|
|||
int qpu_num_sf_accesses(uint64_t inst) ATTRIBUTE_CONST;
|
||||
void qpu_serialize_one_inst(struct vc4_compile *c, uint64_t inst);
|
||||
|
||||
static inline enum qpu_cond
|
||||
qpu_cond_complement(enum qpu_cond cond)
|
||||
{
|
||||
return cond ^ 1;
|
||||
}
|
||||
|
||||
static inline uint64_t
|
||||
qpu_load_imm_f(struct qpu_reg dst, float val)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -175,14 +175,9 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
|
|||
{
|
||||
struct node_to_temp_map map[c->num_temps];
|
||||
uint32_t temp_to_node[c->num_temps];
|
||||
uint32_t def[c->num_temps];
|
||||
uint32_t use[c->num_temps];
|
||||
uint8_t class_bits[c->num_temps];
|
||||
struct qpu_reg *temp_registers = calloc(c->num_temps,
|
||||
sizeof(*temp_registers));
|
||||
for (int i = 0; i < ARRAY_SIZE(def); i++)
|
||||
def[i] = ~0;
|
||||
memset(use, 0, sizeof(use));
|
||||
|
||||
/* If things aren't ever written (undefined values), just read from
|
||||
* r0.
|
||||
|
|
@ -195,38 +190,12 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
|
|||
struct ra_graph *g = ra_alloc_interference_graph(vc4->regs,
|
||||
c->num_temps);
|
||||
|
||||
/* Compute the live ranges so we can figure out interference.
|
||||
*/
|
||||
uint32_t ip = 0;
|
||||
qir_for_each_inst_inorder(inst, c) {
|
||||
if (inst->dst.file == QFILE_TEMP) {
|
||||
def[inst->dst.index] = MIN2(ip, def[inst->dst.index]);
|
||||
use[inst->dst.index] = ip;
|
||||
}
|
||||
|
||||
for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
|
||||
if (inst->src[i].file == QFILE_TEMP)
|
||||
use[inst->src[i].index] = ip;
|
||||
}
|
||||
|
||||
switch (inst->op) {
|
||||
case QOP_FRAG_Z:
|
||||
case QOP_FRAG_W:
|
||||
/* The payload registers have values implicitly loaded
|
||||
* at the start of the program.
|
||||
*/
|
||||
def[inst->dst.index] = 0;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
ip++;
|
||||
}
|
||||
/* Compute the live ranges so we can figure out interference. */
|
||||
qir_calculate_live_intervals(c);
|
||||
|
||||
for (uint32_t i = 0; i < c->num_temps; i++) {
|
||||
map[i].temp = i;
|
||||
map[i].priority = use[i] - def[i];
|
||||
map[i].priority = c->temp_end[i] - c->temp_start[i];
|
||||
}
|
||||
qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
|
||||
for (uint32_t i = 0; i < c->num_temps; i++) {
|
||||
|
|
@ -241,7 +210,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
|
|||
CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4,
|
||||
sizeof(class_bits));
|
||||
|
||||
ip = 0;
|
||||
int ip = 0;
|
||||
qir_for_each_inst_inorder(inst, c) {
|
||||
if (qir_writes_r4(inst)) {
|
||||
/* This instruction writes r4 (and optionally moves
|
||||
|
|
@ -249,7 +218,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
|
|||
* stored in r4 across it.
|
||||
*/
|
||||
for (int i = 0; i < c->num_temps; i++) {
|
||||
if (def[i] < ip && use[i] > ip)
|
||||
if (c->temp_start[i] < ip && c->temp_end[i] > ip)
|
||||
class_bits[i] &= ~CLASS_BIT_R4;
|
||||
}
|
||||
} else {
|
||||
|
|
@ -328,7 +297,8 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
|
|||
|
||||
for (uint32_t i = 0; i < c->num_temps; i++) {
|
||||
for (uint32_t j = i + 1; j < c->num_temps; j++) {
|
||||
if (!(def[i] >= use[j] || def[j] >= use[i])) {
|
||||
if (!(c->temp_start[i] >= c->temp_end[j] ||
|
||||
c->temp_start[j] >= c->temp_end[i])) {
|
||||
ra_add_node_interference(g,
|
||||
temp_to_node[i],
|
||||
temp_to_node[j]);
|
||||
|
|
@ -349,7 +319,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
|
|||
/* If the value's never used, just write to the NOP register
|
||||
* for clarity in debug output.
|
||||
*/
|
||||
if (def[i] == use[i])
|
||||
if (c->temp_start[i] == c->temp_end[i])
|
||||
temp_registers[i] = qpu_ra(QPU_W_NOP);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue