lima/ppir: implement gl_FragDepth support

Mali4x0 supports writing depth and stencil from the fragment shader,
and we've been using it for quite a while for depth/stencil buffer reload.

The missing part was specifying the output register for depth/stencil.
To figure it out, I changed the reload shader to use register $4 as its
output and poked RSW bits (or rather consecutive 4-bit groups) until the
tests that rely on reload started to pass again.

It turns out that the register number for gl_FragDepth/gl_FragStencil is
in rsw->depth_test, and the register number for gl_FragColor is in
rsw->multi_sample, where it is repeated 4 times for some reason (likely
for MSAA?).

With this knowledge we can now modify the ppir compiler to support
multiple store_output intrinsics.

To do that, just add the destination SSA for store_output to the register
list for regalloc and mark it explicitly as an output. Since it's never
read in the shader, we have to take care of it in liveness analysis -
basically just mark it alive from the time it's written to the end
of the block. If it's live only in the last instruction, mark it as
live_internal, so regalloc doesn't clobber it.

Then just let regalloc do its job, and then copy register number to the
shader state and program it in RSW.

The tricky part is gl_FragStencil, since it resides in the same register
as gl_FragDepth and with the current design of the compiler it's hard to
merge them. However gl_FragStencil doesn't seem to be part of GL2
or GLES2, so we can just leave it not implemented.

We also need to take care of the stop bit for instructions - now we can't
just set it in every instruction that stores an output, since there may be
several outputs. So if there are any store_output instructions in the
block, just mark that the block has a stop, and set the stop bit in the
last instruction of the block. The only exception is discard - we always
need to set the stop bit in a discard instruction.

Reviewed-by: Andreas Baierl <ichgeh@imkreisrum.de>
Reviewed-by: Erico Nunes <nunes.erico@gmail.com>
Signed-off-by: Vasily Khoruzhick <anarsoul@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13830>
This commit is contained in:
Vasily Khoruzhick 2021-11-16 22:43:52 -08:00 committed by Marge Bot
parent 98a7c4c6f8
commit 3b15fb3575
11 changed files with 144 additions and 37 deletions

View file

@ -773,7 +773,7 @@ static int encode_instr(ppir_instr *instr, void *code, void *last_code)
size = align_to_word(size) + 1;
ctrl->count = size;
if (instr->is_end)
if (instr->stop)
ctrl->stop = true;
if (last_code) {
@ -818,6 +818,11 @@ bool ppir_codegen_prog(ppir_compiler *comp)
instr->encode_size = get_instr_encode_size(instr);
size += instr->encode_size;
}
/* Set stop flag for the last instruction if block has stop flag */
if (block->stop) {
ppir_instr *instr = list_last_entry(&block->instr_list, ppir_instr, list);
instr->stop = true;
}
}
uint32_t *prog = rzalloc_size(comp->prog, size * sizeof(uint32_t));

View file

@ -284,7 +284,7 @@ void ppir_instr_print_list(ppir_compiler *comp)
list_for_each_entry(ppir_block, block, &comp->block_list, list) {
printf("-------block %3d-------\n", block->index);
list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {
printf("%c%03d: ", instr->is_end ? '*' : ' ', instr->index);
printf("%c%03d: ", instr->stop ? '*' : ' ', instr->index);
for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) {
ppir_node *node = instr->slots[i];
if (node)

View file

@ -121,7 +121,7 @@ ppir_liveness_instr_srcs(ppir_compiler *comp, ppir_instr *instr)
/* Update the liveness information of the instruction by removing its
* dests from the live_in set. */
static void
ppir_liveness_instr_dest(ppir_compiler *comp, ppir_instr *instr)
ppir_liveness_instr_dest(ppir_compiler *comp, ppir_instr *instr, ppir_instr *last)
{
for (int i = PPIR_INSTR_SLOT_NUM-1; i >= 0; i--) {
ppir_node *node = instr->slots[i];
@ -146,9 +146,18 @@ ppir_liveness_instr_dest(ppir_compiler *comp, ppir_instr *instr)
unsigned int index = reg->regalloc_index;
bool live = BITSET_TEST(instr->live_set, index);
/* If it's an out reg, it's alive till the end of the block, so add it
* to live_set of the last instruction */
if (!live && reg->out_reg && (instr != last)) {
BITSET_SET(last->live_set, index);
BITSET_CLEAR(instr->live_set, index);
continue;
}
/* If a register is written but wasn't read in a later instruction, it is
* either dead code or a bug. For now, assign an interference to it to
* ensure it doesn't get assigned a live register and overwrites it. */
* either an output register in last instruction, dead code or a bug.
* For now, assign an interference to it to ensure it doesn't get assigned
* a live register and overwrites it. */
if (!live) {
BITSET_SET(instr->live_internal, index);
continue;
@ -230,7 +239,7 @@ ppir_liveness_compute_live_sets(ppir_compiler *comp)
instr->live_mask, next_instr->live_mask);
}
ppir_liveness_instr_dest(comp, instr);
ppir_liveness_instr_dest(comp, instr, last);
ppir_liveness_instr_srcs(comp, instr);
cont |= !ppir_liveness_set_equal(comp,

View file

@ -345,6 +345,18 @@ static bool ppir_emit_intrinsic(ppir_block *block, nir_instr *ni)
* back to inserting a mov at the end.
* If the source node will only be able to output to pipeline
* registers, fall back to the mov as well. */
assert(nir_src_is_const(instr->src[1]) &&
"lima doesn't support indirect outputs");
nir_io_semantics io = nir_intrinsic_io_semantics(instr);
unsigned offset = nir_src_as_uint(instr->src[1]);
unsigned slot = io.location + offset;
ppir_output_type out_type = ppir_nir_output_to_ppir(slot);
if (out_type == ppir_output_invalid) {
ppir_debug("Unsupported output type: %d\n", slot);
return false;
}
if (!block->comp->uses_discard && instr->src->is_ssa) {
node = block->comp->var_nodes[instr->src->ssa->index];
switch (node->op) {
@ -352,9 +364,12 @@ static bool ppir_emit_intrinsic(ppir_block *block, nir_instr *ni)
case ppir_op_load_texture:
case ppir_op_const:
break;
default:
node->is_end = 1;
default: {
ppir_dest *dest = ppir_node_get_dest(node);
dest->ssa.out_type = out_type;
node->is_out = 1;
return true;
}
}
}
@ -367,6 +382,7 @@ static bool ppir_emit_intrinsic(ppir_block *block, nir_instr *ni)
dest->ssa.num_components = instr->num_components;
dest->ssa.index = 0;
dest->write_mask = u_bit_consecutive(0, instr->num_components);
dest->ssa.out_type = out_type;
alu_node->num_src = 1;
@ -376,7 +392,7 @@ static bool ppir_emit_intrinsic(ppir_block *block, nir_instr *ni)
ppir_node_add_src(block->comp, &alu_node->node, alu_node->src, instr->src,
u_bit_consecutive(0, instr->num_components));
alu_node->node.is_end = 1;
alu_node->node.is_out = 1;
list_addtail(&alu_node->node.list, &block->node_list);
return true;
@ -798,6 +814,7 @@ static ppir_compiler *ppir_compiler_create(void *prog, unsigned num_reg, unsigne
comp->var_nodes = (ppir_node **)(comp + 1);
comp->reg_base = num_ssa;
comp->prog = prog;
return comp;
}
@ -833,7 +850,7 @@ static void ppir_add_ordering_deps(ppir_compiler *comp)
if (prev_node && ppir_node_is_root(node) && node->op != ppir_op_const) {
ppir_node_add_dep(prev_node, node, ppir_dep_sequence);
}
if (node->is_end ||
if (node->is_out ||
node->op == ppir_op_discard ||
node->op == ppir_op_store_temp ||
node->op == ppir_op_branch) {
@ -930,18 +947,11 @@ bool ppir_compile_nir(struct lima_fs_compiled_shader *prog, struct nir_shader *n
}
}
/* Validate outputs, we support only gl_FragColor */
nir_foreach_shader_out_variable(var, nir) {
switch (var->data.location) {
case FRAG_RESULT_COLOR:
case FRAG_RESULT_DATA0:
break;
default:
ppir_error("unsupported output type\n");
goto err_out0;
break;
}
}
comp->out_type_to_reg = rzalloc_size(comp, sizeof(int) * ppir_output_num);
/* -1 means reg is not written by the shader */
for (int i = 0; i < ppir_output_num; i++)
comp->out_type_to_reg[i] = -1;
foreach_list_typed(nir_register, reg, node, &func->registers) {
ppir_reg *r = rzalloc(comp, ppir_reg);

View file

@ -618,9 +618,9 @@ static ppir_node *ppir_node_insert_mov_local(ppir_node *node)
ppir_node_add_dep(move, node, ppir_dep_src);
list_addtail(&move->list, &node->list);
if (node->is_end) {
node->is_end = false;
move->is_end = true;
if (node->is_out) {
node->is_out = false;
move->is_out = true;
}
return move;

View file

@ -203,7 +203,7 @@ static bool ppir_do_one_node_to_instr(ppir_block *block, ppir_node *node)
case ppir_node_type_discard:
if (!create_new_instr(block, node))
return false;
node->instr->is_end = true;
block->stop = true;
break;
case ppir_node_type_branch:
if (!create_new_instr(block, node))
@ -276,8 +276,13 @@ static bool ppir_do_node_to_instr(ppir_block *block, ppir_node *root)
if (!ppir_do_one_node_to_instr(block, node))
return false;
if (node->is_end)
node->instr->is_end = true;
/* The node writes output register. We can't stop at this exact
* instruction because there may be another node that writes another
* output, so set stop flag for the block. We will set stop flag on
* the last instruction of the block during codegen
*/
if (node->is_out)
block->stop = true;
ppir_node_foreach_pred(node, dep) {
ppir_node *pred = dep->pred;

View file

@ -161,7 +161,7 @@ typedef struct ppir_node {
struct ppir_instr *instr;
int instr_pos;
struct ppir_block *block;
bool is_end;
bool is_out;
bool succ_different_block;
/* for scheduler */
@ -179,9 +179,42 @@ typedef enum {
ppir_pipeline_reg_discard, /* varying load */
} ppir_pipeline;
/* Kinds of fragment shader output known to ppir.
 *
 * The valid kinds are numbered consecutively from 0 so they can index a
 * per-output table; ppir_output_num is the number of valid kinds and
 * ppir_output_invalid is a negative sentinel outside that range.
 */
typedef enum {
   ppir_output_invalid = -1,
   ppir_output_color = 0,
   ppir_output_depth = 1,
   ppir_output_num = 2,
} ppir_output_type;
/* Return a printable name for an output type.
 *
 * Any value other than ppir_output_color or ppir_output_depth yields
 * "INVALID". The returned pointer refers to a string literal.
 */
static inline const char *ppir_output_type_to_str(ppir_output_type type)
{
   if (type == ppir_output_color)
      return "OUTPUT_COLOR";

   if (type == ppir_output_depth)
      return "OUTPUT_DEPTH";

   return "INVALID";
}
/* Translate a NIR fragment result slot into the matching ppir output type.
 *
 * FRAG_RESULT_COLOR and FRAG_RESULT_DATA0 both map to the color output,
 * FRAG_RESULT_DEPTH maps to the depth output, and every other slot is
 * reported as ppir_output_invalid.
 */
static inline ppir_output_type ppir_nir_output_to_ppir(gl_frag_result res)
{
   if (res == FRAG_RESULT_COLOR || res == FRAG_RESULT_DATA0)
      return ppir_output_color;

   if (res == FRAG_RESULT_DEPTH)
      return ppir_output_depth;

   return ppir_output_invalid;
}
typedef struct ppir_reg {
struct list_head list;
int index;
ppir_output_type out_type;
int regalloc_index;
int num_components;
@ -191,6 +224,7 @@ typedef struct ppir_reg {
bool is_head;
bool spilled;
bool undef;
bool out_reg;
} ppir_reg;
typedef enum {
@ -316,7 +350,7 @@ typedef struct ppir_instr {
ppir_node *slots[PPIR_INSTR_SLOT_NUM];
ppir_const constant[2];
bool is_end;
bool stop;
/* for scheduler */
struct list_head succ_list;
@ -340,6 +374,7 @@ typedef struct ppir_block {
struct list_head list;
struct list_head node_list;
struct list_head instr_list;
bool stop;
struct ppir_block *successors[2];
@ -370,6 +405,7 @@ typedef struct ppir_compiler {
struct hash_table_u64 *blocks;
int cur_index;
int cur_instr_index;
int *out_type_to_reg;
struct list_head reg_list;
int reg_num;

View file

@ -82,9 +82,6 @@ static void ppir_regalloc_update_reglist_ssa(ppir_compiler *comp)
{
list_for_each_entry(ppir_block, block, &comp->block_list, list) {
list_for_each_entry(ppir_node, node, &block->node_list, list) {
if (node->is_end)
continue;
if (!node->instr || node->op == ppir_op_const)
continue;
@ -94,6 +91,8 @@ static void ppir_regalloc_update_reglist_ssa(ppir_compiler *comp)
if (dest->type == ppir_target_ssa) {
reg = &dest->ssa;
if (node->is_out)
reg->out_reg = true;
list_addtail(&reg->list, &comp->reg_list);
comp->reg_num++;
}
@ -133,6 +132,14 @@ static void ppir_regalloc_print_result(ppir_compiler *comp)
}
}
printf("--------------------------\n");
printf("======ppir output regs======\n");
for (int i = 0; i < ppir_output_num; i++) {
if (comp->out_type_to_reg[i] != -1)
printf("%s: $%d\n", ppir_output_type_to_str(i),
(int)comp->out_type_to_reg[i]);
}
printf("--------------------------\n");
}
static bool create_new_instr_after(ppir_block *block, ppir_instr *ref,
@ -578,6 +585,11 @@ static bool ppir_regalloc_prog_try(ppir_compiler *comp, bool *spilled)
n = 0;
list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
reg->index = ra_get_node_reg(g, n++);
if (reg->out_reg) {
/* We need actual reg number, we don't have swizzle for output regs */
assert(!(reg->index & 0x3) && "ppir: output regs don't have swizzle");
comp->out_type_to_reg[reg->out_type] = reg->index / 4;
}
}
ralloc_free(g);
@ -604,8 +616,11 @@ bool ppir_regalloc_prog(ppir_compiler *comp)
ppir_regalloc_update_reglist_ssa(comp);
/* No registers? Probably shader consists of discard instruction */
if (list_is_empty(&comp->reg_list))
if (list_is_empty(&comp->reg_list)) {
comp->prog->state.frag_color_reg = 0;
comp->prog->state.frag_depth_reg = -1;
return true;
}
/* this will most likely succeed in the first
* try, except for very complicated shaders */
@ -613,5 +628,10 @@ bool ppir_regalloc_prog(ppir_compiler *comp)
if (!spilled)
return false;
comp->prog->state.frag_color_reg =
comp->out_type_to_reg[ppir_output_color];
comp->prog->state.frag_depth_reg =
comp->out_type_to_reg[ppir_output_depth];
return true;
}

View file

@ -49,6 +49,8 @@ struct lima_fs_compiled_shader {
struct {
int shader_size;
int stack_size;
int frag_color_reg;
int frag_depth_reg;
bool uses_discard;
} state;
};

View file

@ -677,6 +677,12 @@ lima_pack_render_state(struct lima_context *ctx, const struct pipe_draw_info *in
if (!rst->depth_clip_far || ctx->viewport.far == 1.0f)
render->depth_test |= 0x20; /* don't clip depth far */
if (fs->state.frag_depth_reg != -1) {
render->depth_test |= (fs->state.frag_depth_reg << 6);
/* Shader writes depth */
render->depth_test |= 0x801;
}
ushort far, near;
near = float_to_ushort(ctx->viewport.near);
@ -729,6 +735,12 @@ lima_pack_render_state(struct lima_context *ctx, const struct pipe_draw_info *in
if (ctx->framebuffer.base.samples)
render->multi_sample |= 0x68;
/* Set gl_FragColor register, need to specify it 4 times */
render->multi_sample |= (fs->state.frag_color_reg << 28) |
(fs->state.frag_color_reg << 24) |
(fs->state.frag_color_reg << 20) |
(fs->state.frag_color_reg << 16);
/* alpha test */
if (ctx->zsa->base.alpha_enabled) {
render->multi_sample |= ctx->zsa->base.alpha_func;
@ -755,7 +767,8 @@ lima_pack_render_state(struct lima_context *ctx, const struct pipe_draw_info *in
render->aux1 |= 0x00002000;
if (fs->state.uses_discard ||
ctx->zsa->base.alpha_enabled) {
ctx->zsa->base.alpha_enabled ||
fs->state.frag_depth_reg != -1) {
early_z = false;
pixel_kill = false;
}

View file

@ -525,7 +525,7 @@ parse_rsw(FILE *fp, uint32_t *value, int i, uint32_t *helper)
fprintf(fp, ": ignore depth clip near");
if ((*value & 0x00000020) == 0x00000020)
fprintf(fp, ", ignore depth clip far");
fprintf(fp, ", unknown bits 6-9: 0x%08x", *value & 0x000003c0);
fprintf(fp, ", register for gl_FragDepth: $%d", (*value & 0x000003c0) >> 6);
fprintf(fp, ", unknown bits 13-15: 0x%08x */\n", *value & 0x00000e000);
break;
case 4: /* DEPTH RANGE */
@ -594,7 +594,14 @@ parse_rsw(FILE *fp, uint32_t *value, int i, uint32_t *helper)
fprintf(fp, " */\n");
else
fprintf(fp, ", UNKNOWN\n");
fprintf(fp, "\t\t\t\t\t\t/* %s(2)", render_state_infos[i].info);
fprintf(fp, "\t\t\t\t\t\t/* %s(3)", render_state_infos[i].info);
fprintf(fp, ", register for gl_FragColor: $%d $%d $%d $%d */\n",
(*value & 0xf0000000) >> 28,
(*value & 0x0f000000) >> 24,
(*value & 0x00f00000) >> 20,
(*value & 0x000f0000) >> 16);
fprintf(fp, "\t\t\t\t\t\t/* %s(3)", render_state_infos[i].info);
fprintf(fp, ": alpha_test_func: %d (%s) */\n",
(*value & 0x00000007),
lima_get_compare_func_string((*value & 0x00000007))); /* alpha_test_func */