CELL: changes to generate SPU code for stenciling

This set of code changes is for stencil code generation
support.  Both one-sided and two-sided stenciling are supported.
In addition to the raw code generation changes, these changes had
to be made elsewhere in the system:

- Added new "register set" feature to the SPE assembly generation.
  A "register set" is a way to allocate multiple registers and free
  them all at the same time, delegating register allocation management
  to the spe_function unit.  It's quite useful in complex register
  allocation schemes (like stenciling).

- Added and improved SPE macro calculations.
  These are operations between registers and unsigned integer
  immediates.  In many cases, the calculation can be performed
  with a single instruction; the macros will generate the
  single instruction if possible, or generate a register load
  and register-to-register operation if not.  These macro
  functions are: spe_load_uint() (which has new ways to
  load a value in a single instruction), spe_and_uint(),
  spe_xor_uint(), spe_compare_equal_uint(), and spe_compare_greater_uint().

- Added facing to fragment generation.  While rendering, the rasterizer
  needs to be able to determine front- and back-facing fragments, in order
  to correctly apply two-sided stencil.  That requires these changes:
  - Added front_winding field to the cell_command_render block, so that
    the state tracker could communicate to the rasterizer what it
    considered to be the front-facing direction.
  - Added fragment facing as an input to the fragment function.
  - Calculated facing is passed during emit_quad().
This commit is contained in:
Robert Ellison 2008-10-03 18:00:43 -06:00
parent 22eb067c88
commit afaa53040b
12 changed files with 1092 additions and 147 deletions

View file

@ -359,14 +359,21 @@ void _name (struct spe_function *p, int imm) \
*/
void spe_init_func(struct spe_function *p, unsigned code_size)
{
register unsigned int i;
p->store = align_malloc(code_size, 16);
p->num_inst = 0;
p->max_inst = code_size / SPE_INST_SIZE;
p->set_count = 0;
memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
/* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
*/
p->regs[0] = ~7;
p->regs[1] = (1U << (80 - 64)) - 1;
p->regs[0] = p->regs[1] = p->regs[2] = 1;
for (i = 80; i <= 127; i++) {
p->regs[i] = 1;
}
p->print = false;
p->indent = 0;
@ -398,12 +405,8 @@ int spe_allocate_available_register(struct spe_function *p)
{
unsigned i;
for (i = 0; i < SPE_NUM_REGS; i++) {
const uint64_t mask = (1ULL << (i % 64));
const unsigned idx = i / 64;
assert(idx < 2);
if ((p->regs[idx] & mask) != 0) {
p->regs[idx] &= ~mask;
if (p->regs[i] == 0) {
p->regs[i] = 1;
return i;
}
}
@ -417,31 +420,68 @@ int spe_allocate_available_register(struct spe_function *p)
*/
int spe_allocate_register(struct spe_function *p, int reg)
{
   /* The register must be in range and currently available.  A zero
    * count in p->regs[] means "available"; any nonzero count means the
    * register is held by some active register set.
    */
   assert(reg < SPE_NUM_REGS);
   assert(p->regs[reg] == 0);
   /* Mark it as allocated in the current register set. */
   p->regs[reg] = 1;
   return reg;
}
/**
 * Mark the given SPE register as "unallocated".  Note that this should
 * only be used on registers allocated in the current register set; an
 * assertion will fail if an attempt is made to deallocate a register
 * allocated in an earlier register set.
 */
void spe_release_register(struct spe_function *p, int reg)
{
   assert(reg < SPE_NUM_REGS);
   /* A count of exactly 1 means "allocated in the current register
    * set"; 0 means the register was never allocated, and a larger
    * count means it belongs to an enclosing set and must be freed by
    * spe_release_register_set() instead.
    */
   assert(p->regs[reg] == 1);
   p->regs[reg] = 0;
}
/**
 * Begin a new register set.  Use this when it would be difficult to
 * track exactly which registers a code-generation sequence allocates,
 * and you simply want to release them all at once when finished.
 */
void spe_allocate_register_set(struct spe_function *p)
{
   unsigned int reg;

   /* Bump the nesting depth.  Wrapping around to 0 would corrupt the
    * per-register counts below, so trap that case.
    */
   p->set_count++;
   assert(p->set_count > 0);

   /* Age every register that is currently allocated by one set.  After
    * this, only registers allocated inside the new set will have a
    * count of exactly 1, which is what spe_release_register_set()
    * relies on to free just those registers.
    */
   for (reg = 0; reg < SPE_NUM_REGS; reg++) {
      if (p->regs[reg] != 0) {
         p->regs[reg]++;
      }
   }
}
void spe_release_register_set(struct spe_function *p)
{
   unsigned int reg;

   /* An unmatched release would underflow the (unsigned) set count. */
   assert(p->set_count > 0);
   p->set_count--;

   /* De-age every allocated register.  Registers allocated during the
    * set being released drop to a count of 0 and become available
    * again; registers from enclosing sets stay allocated.
    */
   for (reg = 0; reg < SPE_NUM_REGS; reg++) {
      if (p->regs[reg] != 0) {
         p->regs[reg]--;
      }
   }
}
void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
{
   /* If the whole value is in the lower 18 bits, use ila, which
    * doesn't sign-extend.  Otherwise, if the two halfwords of
    * the constant are identical, use ilh.  Otherwise, if every byte of
    * the desired value is 0x00 or 0xff, we can use Form Select Mask for
    * Bytes Immediate (fsmbi) to load the value in a single instruction.
    * Otherwise, in the general case, we have to use ilhu followed by iohl.
    */
   if ((ui & 0x0003ffff) == ui) {
      /* Fixed: the guard previously tested (ui & 0xfffc0000) == ui,
       * which kept only values whose LOW 18 bits were zero -- the
       * opposite of the comment's intent and of ila's 18-bit range.
       */
      spe_ila(p, rT, ui);
   }
   else if ((ui >> 16) == (ui & 0xffff)) {
      spe_ilh(p, rT, ui & 0xffff);
   }
   else if (
      ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) &&
      ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) &&
      ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) &&
      ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000)
   ) {
      unsigned int mask = 0;
      /* fsmbi duplicates each bit in the given mask eight times,
       * using a 16-bit value to initialize a 16-byte quadword.
       * Each 4-bit nybble of the mask corresponds to a full word
       * of the result; look at the value and figure out the mask
       * (replicated for each word in the quadword), and then
       * form the "select mask" to get the value.
       */
      if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111;
      if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222;
      if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444;
      if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888;
      spe_fsmbi(p, rT, mask);
   }
   else {
      /* The general case: this usually uses two instructions, but
       * may use only one if the low-order 16 bits of each word are 0.
       */
      spe_ilhu(p, rT, ui >> 16);
      if (ui & 0xffff)
         spe_iohl(p, rT, ui & 0xffff);
   }
}
/* This function is constructed identically to spe_xor_uint() below.
 * Changes to one should be made in the other.
 */
void spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
{
   /* If we can, emit a single instruction, either And Byte Immediate
    * (which uses the same constant across each byte), And Halfword Immediate
    * (which sign-extends a 10-bit immediate to 16 bits and uses that
    * across each halfword), or And Word Immediate (which sign-extends
    * a 10-bit immediate to 32 bits).
    *
    * Otherwise, we'll need to use a temporary register.
    */
   unsigned int tmp;

   /* If the upper 23 bits are all 0s or all 1s, sign extension
    * will work and we can use And Word Immediate.
    */
   tmp = ui & 0xfffffe00;
   if (tmp == 0xfffffe00 || tmp == 0) {
      spe_andi(p, rT, rA, ui & 0x000003ff);
      return;
   }

   /* If the ui field is symmetric along halfword boundaries and
    * the upper 7 bits of each halfword are all 0s or 1s, we
    * can use And Halfword Immediate.
    */
   tmp = ui & 0xfe00fe00;
   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
      spe_andhi(p, rT, rA, ui & 0x000003ff);
      return;
   }

   /* If the ui field is symmetric in each byte, then we can use
    * the And Byte Immediate instruction.
    */
   tmp = ui & 0x000000ff;
   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
      spe_andbi(p, rT, rA, tmp);
      return;
   }

   /* Otherwise, we'll have to use a temporary register. */
   {
      const int tmp_reg = spe_allocate_available_register(p);
      spe_load_uint(p, tmp_reg, ui);
      spe_and(p, rT, rA, tmp_reg);
      spe_release_register(p, tmp_reg);
   }
}
/* This function is constructed identically to spe_and_uint() above.
 * Changes to one should be made in the other.
 */
void spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
{
   /* Try to emit a single instruction: Exclusive Or Word Immediate
    * (sign-extends a 10-bit immediate to 32 bits), Exclusive Or
    * Halfword Immediate (sign-extends a 10-bit immediate to 16 bits
    * and applies it to each halfword), or Exclusive Or Byte Immediate
    * (same constant across each byte).  If none of the immediate forms
    * fit, fall back to a temporary register.
    */
   const unsigned int upper23 = ui & 0xfffffe00;
   const unsigned int per_half = ui & 0xfe00fe00;
   const unsigned int low_byte = ui & 0x000000ff;

   /* Word immediate: works when the upper 23 bits are all 0s or all
    * 1s, so sign extension reproduces the value.
    */
   if (upper23 == 0 || upper23 == 0xfffffe00) {
      spe_xori(p, rT, rA, ui & 0x000003ff);
      return;
   }

   /* Halfword immediate: works when both halfwords are identical and
    * the upper 7 bits of each halfword are all 0s or all 1s.
    */
   if ((per_half == 0 || per_half == 0xfe00fe00)
       && ((ui >> 16) == (ui & 0x0000ffff))) {
      spe_xorhi(p, rT, rA, ui & 0x000003ff);
      return;
   }

   /* Byte immediate: works when all four bytes are identical. */
   if ((ui >> 24) == low_byte
       && ((ui >> 16) & 0xff) == low_byte
       && ((ui >> 8) & 0xff) == low_byte) {
      spe_xorbi(p, rT, rA, low_byte);
      return;
   }

   /* General case: load the constant, then XOR register-to-register. */
   {
      const int tmp_reg = spe_allocate_available_register(p);
      spe_load_uint(p, tmp_reg, ui);
      spe_xor(p, rT, rA, tmp_reg);
      spe_release_register(p, tmp_reg);
   }
}
void
spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
{
   /* Values of 9 bits or less fit the immediate field of Compare Equal
    * Word Immediate and survive its sign extension unchanged.
    */
   if ((ui & 0x000001ff) != ui) {
      /* Too big for the immediate form: load the constant into a
       * scratch register and compare register-to-register.
       */
      const int tmp_reg = spe_allocate_available_register(p);
      spe_load_uint(p, tmp_reg, ui);
      spe_ceq(p, rT, rA, tmp_reg);
      spe_release_register(p, tmp_reg);
   }
   else {
      spe_ceqi(p, rT, rA, ui);
   }
}
void
spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
{
   /* The 10-bit immediate field of Compare Logical Greater Than Word
    * Immediate is sign-extended to 32 bits before the (unsigned)
    * comparison, so only values of 9 bits or less survive the
    * extension unchanged.  (The previous 0x3ff guard admitted values
    * 512..1023, which clgti would sign-extend to 0xfffffexx and
    * compare incorrectly.)  This matches the ceqi guard in
    * spe_compare_equal_uint() above.
    */
   if ((ui & 0x000001ff) == ui) {
      spe_clgti(p, rT, rA, ui);
   }
   /* Otherwise, we're going to have to load a word first. */
   else {
      const int tmp_reg = spe_allocate_available_register(p);
      spe_load_uint(p, tmp_reg, ui);
      spe_clgt(p, rT, rA, tmp_reg);
      spe_release_register(p, tmp_reg);
   }
}
void
spe_splat(struct spe_function *p, unsigned rT, unsigned rA)

View file

@ -53,17 +53,26 @@ struct spe_function
uint num_inst;
uint max_inst;
/**
* Mask of used / unused registers
*
* Each set bit corresponds to an available register. Each cleared bit
* corresponds to an allocated register.
/**
* The "set count" reflects the current depth of nested register
* sets.  In the unlikely case that we exceed the maximum set count,
* register allocation will start to be confused, which is critical
* enough that we check for it.
*/
unsigned char set_count;
/**
* Flags for used and unused registers. Each byte corresponds to a
* register; a 0 in that byte means that the register is available.
* A value of 1 means that the register was allocated in the current
* register set. Any other value N means that the register was allocated
* N register sets ago.
*
* \sa
* spe_allocate_register, spe_allocate_available_register,
* spe_release_register
* spe_allocate_register_set, spe_release_register_set, spe_release_register,
*/
uint64_t regs[SPE_NUM_REGS / 64];
unsigned char regs[SPE_NUM_REGS];
boolean print; /**< print/dump instructions as they're emitted? */
int indent; /**< number of spaces to indent */
@ -77,6 +86,8 @@ extern unsigned spe_code_size(const struct spe_function *p);
extern int spe_allocate_available_register(struct spe_function *p);
extern int spe_allocate_register(struct spe_function *p, int reg);
extern void spe_release_register(struct spe_function *p, int reg);
extern void spe_allocate_register_set(struct spe_function *p);
extern void spe_release_register_set(struct spe_function *p);
extern void spe_print_code(struct spe_function *p, boolean enable);
extern void spe_indent(struct spe_function *p, int spaces);
@ -307,6 +318,22 @@ spe_load_int(struct spe_function *p, unsigned rT, int i);
extern void
spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
/** And immediate value into rT. */
extern void
spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
/** Xor immediate value into rT. */
extern void
spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
/** Compare equal with immediate value. */
extern void
spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
/** Compare greater with immediate value. */
extern void
spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
/** Replicate word 0 of rA across rT. */
extern void
spe_splat(struct spe_function *p, unsigned rT, unsigned rA);

View file

@ -227,6 +227,7 @@ struct cell_command_render
float xmin, ymin, xmax, ymax; /* XXX another dummy field */
uint min_index;
boolean inline_verts;
uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */
};

File diff suppressed because it is too large Load diff

View file

@ -152,6 +152,7 @@ cell_flush_prim_buffer(struct cell_context *cell)
struct cell_command_render *render = &cell_global.command[i].render;
render->prim_type = PIPE_PRIM_TRIANGLES;
render->num_verts = cell->prim_buffer.num_verts;
render->front_winding = cell->rasterizer->front_winding;
render->vertex_size = cell->vertex_info->size * 4;
render->xmin = cell->prim_buffer.xmin;
render->ymin = cell->prim_buffer.ymin;

View file

@ -214,6 +214,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
render->opcode = CELL_CMD_RENDER;
render->prim_type = cvbr->prim;
render->front_winding = cell->rasterizer->front_winding;
render->num_indexes = nr_indices;
render->min_index = min_index;

View file

@ -73,7 +73,8 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
vector float fragGreen,
vector float fragBlue,
vector float fragAlpha,
vector unsigned int mask);
vector unsigned int mask,
uint facing);
/** Function for running fragment program */
typedef void (*spu_fragment_program_func)(vector float *inputs,

View file

@ -57,7 +57,8 @@ spu_fallback_fragment_ops(uint x, uint y,
vector float fragG,
vector float fragB,
vector float fragA,
vector unsigned int mask)
vector unsigned int mask,
uint facing)
{
vector float frag_aos[4];
unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
@ -433,23 +434,23 @@ spu_fallback_fragment_ops(uint x, uint y,
/* Form bitmask depending on color buffer format and colormask bits */
switch (spu.fb.color_format) {
case PIPE_FORMAT_A8R8G8B8_UNORM:
if (spu.blend.colormask & (1<<0))
if (spu.blend.colormask & PIPE_MASK_R)
cmask |= 0x00ff0000; /* red */
if (spu.blend.colormask & (1<<1))
if (spu.blend.colormask & PIPE_MASK_G)
cmask |= 0x0000ff00; /* green */
if (spu.blend.colormask & (1<<2))
if (spu.blend.colormask & PIPE_MASK_B)
cmask |= 0x000000ff; /* blue */
if (spu.blend.colormask & (1<<3))
if (spu.blend.colormask & PIPE_MASK_A)
cmask |= 0xff000000; /* alpha */
break;
case PIPE_FORMAT_B8G8R8A8_UNORM:
if (spu.blend.colormask & (1<<0))
if (spu.blend.colormask & PIPE_MASK_R)
cmask |= 0x0000ff00; /* red */
if (spu.blend.colormask & (1<<1))
if (spu.blend.colormask & PIPE_MASK_G)
cmask |= 0x00ff0000; /* green */
if (spu.blend.colormask & (1<<2))
if (spu.blend.colormask & PIPE_MASK_B)
cmask |= 0xff000000; /* blue */
if (spu.blend.colormask & (1<<3))
if (spu.blend.colormask & PIPE_MASK_A)
cmask |= 0x000000ff; /* alpha */
break;
default:

View file

@ -38,7 +38,8 @@ spu_fallback_fragment_ops(uint x, uint y,
vector float fragGreen,
vector float fragBlue,
vector float fragAlpha,
vector unsigned int mask);
vector unsigned int mask,
uint facing);
#endif /* SPU_PER_FRAGMENT_OP */

View file

@ -279,7 +279,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
drawn += tri_draw(v0, v1, v2, tx, ty);
drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding);
}
//printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
@ -297,5 +297,3 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
printf("SPU %u: RENDER done\n",
spu.init.id);
}

View file

@ -118,6 +118,8 @@ struct setup_stage {
float oneoverarea;
uint facing;
uint tx, ty;
int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
@ -274,7 +276,7 @@ eval_z(float x, float y)
* overall.
*/
static INLINE void
emit_quad( int x, int y, mask_t mask )
emit_quad( int x, int y, mask_t mask)
{
/* If any bits in mask are set... */
if (spu_extract(spu_orx(mask), 0)) {
@ -344,7 +346,8 @@ emit_quad( int x, int y, mask_t mask )
fragZ,
soa_frag[0], soa_frag[1],
soa_frag[2], soa_frag[3],
mask);
mask,
setup.facing);
}
}
@ -379,7 +382,8 @@ emit_quad( int x, int y, mask_t mask )
outputs[0*4+1],
outputs[0*4+2],
outputs[0*4+3],
mask);
mask,
setup.facing);
}
}
}
@ -483,7 +487,7 @@ static void flush_spans( void )
*/
for (x = block(minleft); x <= block(maxright); x += 2) {
#if 1
emit_quad( x, setup.span.y, calculate_mask( x ) );
emit_quad( x, setup.span.y, calculate_mask( x ));
#endif
}
@ -902,13 +906,28 @@ static void subtriangle( struct edge *eleft,
eright->sy += lines;
}
static float
determinant( const float *v0,
             const float *v1,
             const float *v2 )
{
   /* Z component of the cross product of the two edge vectors taken
    * relative to v2; its sign gives the triangle's winding direction.
    */
   const float e0x = v0[0] - v2[0];
   const float e0y = v0[1] - v2[1];
   const float e1x = v1[0] - v2[0];
   const float e1y = v1[1] - v2[1];

   return e0x * e1y - e0y * e1x;
}
/**
* Draw triangle into tile at (tx, ty) (tile coords)
* The tile data should have already been fetched.
*/
boolean
tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding)
{
setup.tx = tx;
setup.ty = ty;
@ -919,6 +938,12 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
/* Before we sort vertices, determine the facing of the triangle,
* which will be needed for front/back-face stencil application
*/
float det = determinant(v0, v1, v2);
setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
if (!setup_sort_vertices((struct vertex_header *) v0,
(struct vertex_header *) v1,
(struct vertex_header *) v2)) {

View file

@ -31,7 +31,7 @@
extern boolean
tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding);
#endif /* SPU_TRI_H */