Cell: implement Z16 and Z32 testing with SIMD instructions.

This commit is contained in:
Brian 2008-02-01 13:45:58 -07:00 committed by Ben Skeggs
parent 4f7dcb0e04
commit 7a0099b9f3
3 changed files with 163 additions and 197 deletions

View file

@ -42,7 +42,8 @@
typedef union {
ushort t16[TILE_SIZE][TILE_SIZE];
uint t32[TILE_SIZE][TILE_SIZE];
float4 f4[TILE_SIZE/2][TILE_SIZE/2];
vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
} tile_t;

View file

@ -39,18 +39,11 @@
#include "spu_tile.h"
#include "spu_tri.h"
#include "spu_ztest.h"
/*
* If SIMD_Z=1 the Z buffer is floating point and we use vector instructions
* to do Z testing/updating.
*/
#define SIMD_Z 0
#if SIMD_Z
/** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
typedef vector unsigned int mask_t;
#else
typedef uint mask_t;
#endif
/**
@ -282,20 +275,11 @@ pack_colors(uint uicolors[4], const float4 fcolors[4])
}
static unsigned int
do_depth_test(int x, int y, unsigned int mask)
static INLINE mask_t
do_depth_test(int x, int y, mask_t quadmask)
{
static const float4 zscale16
= {.f={65535.0, 65535.0, 65535.0, 65535.0}};
static const float4 zscale32
= {.f={(float)0xffffffff,
(float)0xffffffff,
(float)0xffffffff,
(float)0xffffffff}};
int ix = x - setup.cliprect_minx;
int iy = y - setup.cliprect_miny;
float4 zvals;
mask_t mask;
zvals.v = eval_z((float) x, (float) y);
@ -305,129 +289,20 @@ do_depth_test(int x, int y, unsigned int mask)
cur_tile_status_z = TILE_STATUS_DIRTY;
}
#if 0
if (cur_tile_status_z == TILE_STATUS_CLEAR) {
/* now, _really_ clear the tile */
clear_z_tile(&ztile);
}
else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
/* make sure we've got the tile from main mem */
wait_on_mask(1 << TAG_READ_TILE_Z);
}
cur_tile_status_z = TILE_STATUS_DIRTY;
#endif
if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
zvals.v = spu_mul(zvals.v, zscale16.v);
if (mask & MASK_TOP_LEFT) {
uint z = (uint) zvals.f[0];
if (z < ztile.t16[iy][ix])
ztile.t16[iy][ix] = z;
else
mask &= ~MASK_TOP_LEFT;
}
if (mask & MASK_TOP_RIGHT) {
uint z = (uint) zvals.f[1];
if (z < ztile.t16[iy][ix+1])
ztile.t16[iy][ix+1] = z;
else
mask &= ~MASK_TOP_RIGHT;
}
if (mask & MASK_BOTTOM_LEFT) {
uint z = (uint) zvals.f[2];
if (z < ztile.t16[iy+1][ix])
ztile.t16[iy+1][ix] = z;
else
mask &= ~MASK_BOTTOM_LEFT;
}
if (mask & MASK_BOTTOM_RIGHT) {
uint z = (uint) zvals.f[3];
if (z < ztile.t16[iy+1][ix+1])
ztile.t16[iy+1][ix+1] = z;
else
mask &= ~MASK_BOTTOM_RIGHT;
}
int ix = (x - setup.cliprect_minx) / 4;
int iy = (y - setup.cliprect_miny) / 2;
mask = spu_z16_test_less(zvals.v, &ztile.us8[iy][ix], x>>1, quadmask);
}
else {
zvals.v = spu_mul(zvals.v, zscale32.v);
ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
if (mask & MASK_TOP_LEFT) {
uint z = (uint) zvals.f[0];
if (z < ztile.t32[iy][ix])
ztile.t32[iy][ix] = z;
else
mask &= ~MASK_TOP_LEFT;
}
if (mask & MASK_TOP_RIGHT) {
uint z = (uint) zvals.f[1];
if (z < ztile.t32[iy][ix+1])
ztile.t32[iy][ix+1] = z;
else
mask &= ~MASK_TOP_RIGHT;
}
if (mask & MASK_BOTTOM_LEFT) {
uint z = (uint) zvals.f[2];
if (z < ztile.t32[iy+1][ix])
ztile.t32[iy+1][ix] = z;
else
mask &= ~MASK_BOTTOM_LEFT;
}
if (mask & MASK_BOTTOM_RIGHT) {
uint z = (uint) zvals.f[3];
if (z < ztile.t32[iy+1][ix+1])
ztile.t32[iy+1][ix+1] = z;
else
mask &= ~MASK_BOTTOM_RIGHT;
}
int ix = (x - setup.cliprect_minx) / 2;
int iy = (y - setup.cliprect_miny) / 2;
mask = spu_z32_test_less(zvals.v, &ztile.ui4[iy][ix], quadmask);
}
if (mask)
cur_tile_status_z = TILE_STATUS_DIRTY;
return mask;
}
static vector unsigned int
do_depth_test_simd(int x, int y, vector unsigned int quadmask)
{
int ix = (x - setup.cliprect_minx) / 2;
int iy = (y - setup.cliprect_miny) / 2;
float4 zvals;
vector unsigned int zmask;
zvals.v = eval_z((float) x, (float) y);
if (cur_tile_status_z == TILE_STATUS_CLEAR) {
/* now, _really_ clear the tile */
clear_z_tile(&ztile);
}
else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
/* make sure we've got the tile from main mem */
wait_on_mask(1 << TAG_READ_TILE_Z);
}
cur_tile_status_z = TILE_STATUS_DIRTY;
/* XXX fetch Z value sooner to hide latency here */
zmask = spu_cmpgt(ztile.f4[ix][iy].v, zvals.v);
zmask = spu_and(zmask, quadmask);
ztile.f4[ix][iy].v = spu_sel(ztile.f4[ix][iy].v, zvals.v, zmask);
//ztile.f4[ix][iy].v = spu_sel(zvals.v, ztile.f4[ix][iy].v, mask4);
return zmask;
}
/**
* Emit a quad (pass to next stage). No clipping is done.
*/
@ -461,36 +336,18 @@ emit_quad( int x, int y, mask_t mask )
}
if (spu.depth_stencil.depth.enabled) {
#if SIMD_Z
mask = do_depth_test_simd(x, y, mask);
#else
mask = do_depth_test(x, y, mask);
#endif
}
#if !SIMD_Z
if (mask)
#endif
{
if (cur_tile_status_c == TILE_STATUS_CLEAR) {
/* now, _really_ clear the tile */
clear_c_tile(&ctile);
}
/* If any bits in mask are set... */
if (spu_extract(spu_orx(mask), 0)) {
#if 0
if (cur_tile_status_c == TILE_STATUS_CLEAR) {
/* now, _really_ clear the tile */
clear_c_tile(&ctile);
cur_tile_status_c = TILE_STATUS_DIRTY;
}
else if (cur_tile_status_c != TILE_STATUS_DIRTY) {
/* make sure we've got the tile from main mem */
wait_on_mask(1 << TAG_READ_TILE_COLOR);
}
#endif
cur_tile_status_c = TILE_STATUS_DIRTY;
#if SIMD_Z
if (spu_extract(mask, 0))
ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
if (spu_extract(mask, 1))
@ -499,20 +356,11 @@ emit_quad( int x, int y, mask_t mask )
ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
if (spu_extract(mask, 3))
ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
#elif 0
#if 0
/* SIMD_Z with swizzled color buffer (someday) */
vector float icolors = *((vector float *) &colors);
ctile.f4[iy/2][ix/2].v = spu_sel(ctile.f4[iy/2][ix/2].v, icolors, mask);
#else
if (mask & MASK_TOP_LEFT)
ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
if (mask & MASK_TOP_RIGHT)
ctile.t32[iy][ix+1] = colors[QUAD_TOP_RIGHT];
if (mask & MASK_BOTTOM_LEFT)
ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
if (mask & MASK_BOTTOM_RIGHT)
ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
#endif
}
@ -533,38 +381,20 @@ static INLINE int block( int x )
/**
* Compute mask which indicates which pixels in the 2x2 quad are actually inside
* the triangle's bounds.
*
* this is pretty nasty... may need to rework flush_spans again to
* fix it, if possible.
* The mask is a uint4 vector and each element will be 0 or 0xffffffff.
*/
static mask_t calculate_mask( int x )
static INLINE mask_t calculate_mask( int x )
{
#if SIMD_Z
uint m0, m1, m2, m3;
m0 = (x >= setup.span.left[0] && x < setup.span.right[0]) * ~0;
m1 = (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0]) * ~0;
m2 = (x >= setup.span.left[1] && x < setup.span.right[1]) * ~0;
m3 = (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1]) * ~0;
return (vector unsigned int) {m0, m1, m2, m3};
#else
unsigned mask = 0x0;
if (x >= setup.span.left[0] && x < setup.span.right[0])
mask |= MASK_TOP_LEFT;
if (x >= setup.span.left[1] && x < setup.span.right[1])
mask |= MASK_BOTTOM_LEFT;
if (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0])
mask |= MASK_TOP_RIGHT;
if (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1])
mask |= MASK_BOTTOM_RIGHT;
/* This is a little tricky.
* Use & instead of && to avoid branches.
* Use negation to convert true/false to ~0/0 values.
*/
mask_t mask;
mask = spu_insert(-((x >= setup.span.left[0]) & (x < setup.span.right[0])), mask, 0);
mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
mask = spu_insert(-((x >= setup.span.left[1]) & (x < setup.span.right[1])), mask, 2);
mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
return mask;
#endif
}

View file

@ -0,0 +1,135 @@
/**************************************************************************
*
* Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
/**
* Zbuffer/depth test code.
*/
#ifndef SPU_ZTEST_H
#define SPU_ZTEST_H
#ifdef __SPU__
#include <spu_intrinsics.h>
#endif
/**
 * Perform a "less than" Z test for a 16-bit/value Z buffer and update the
 * Z buffer in place for the fragments that pass.
 *
 * \param zvals   vector of four fragment zvalues as floats
 * \param zbuf    ptr to vector of ushort[8] zbuffer values.  Note that this
 *                contains the Z values for 2 quads, 8 pixels.  Only the four
 *                ushorts belonging to the selected quad can be modified; the
 *                other four are written back unchanged.
 * \param x       x coordinate of quad (only lsbit is significant): when the
 *                lsbit is set the quad occupies bytes 8..15 of *zbuf
 *                (ushorts 4..7), otherwise bytes 0..7 (ushorts 0..3).
 * \param inMask  indicates which fragments in the quad are alive
 * \return new mask indicating which fragments are alive after ztest
 *         (the per-fragment compare result ANDed with inMask)
 */
static INLINE vector unsigned int
spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
uint x, vector unsigned int inMask)
{
/* Shuffle control byte that makes spu_shuffle() produce a 0x00 result byte
 * (used to zero-extend each ushort to a uint).
 */
#define ZERO 0x80
vector unsigned int zvals_ui4, zbuf_ui4, mask;
/* convert floats to uints in [0, 65535] */
zvals_ui4 = spu_convtu(zvals, 32); /* convert to [0, 2^32] */
zvals_ui4 = spu_rlmask(zvals_ui4, -16); /* right shift 16 */
/* XXX this conditional could be removed with a bit of work */
if (x & 1) {
/* convert zbuffer values from ushorts to uints */
/* gather ushorts 4..7 (bytes 8..15), zero-extending each to 32 bits */
zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
(vector unsigned int) *zbuf,
VEC_LITERAL(vector unsigned char,
ZERO, ZERO, 8, 9, ZERO, ZERO, 10, 11,
ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15));
/* mask = (zvals_ui4 < zbuf_ui4) ? ~0 : 0   (cmpgt is zbuf > zval) */
mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
/* mask &= inMask */
mask = spu_and(mask, inMask);
/* zbuf = mask ? zval : zbuf */
zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
/* convert zbuffer values from uints back to ushorts.
 * Result bytes 0..7 come from the second operand (indices 16..23 = bytes
 * 0..7 of the old *zbuf), so ushorts 0..3 are preserved; result bytes
 * 8..15 take the low 16 bits of each updated uint.
 */
*zbuf = (vector unsigned short)
spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
VEC_LITERAL(vector unsigned char,
16, 17, 18, 19, 20, 21, 22, 23,
2, 3, 6, 7, 10, 11, 14, 15));
}
else {
/* convert zbuffer values from ushorts to uints */
/* gather ushorts 0..3 (bytes 0..7), zero-extending each to 32 bits */
zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
(vector unsigned int) *zbuf,
VEC_LITERAL(vector unsigned char,
ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7));
/* mask = (zvals_ui4 < zbuf_ui4) ? ~0 : 0   (cmpgt is zbuf > zval) */
mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
/* mask &= inMask */
mask = spu_and(mask, inMask);
/* zbuf = mask ? zval : zbuf */
zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
/* convert zbuffer values from uints back to ushorts.
 * Result bytes 0..7 take the low 16 bits of each updated uint; result
 * bytes 8..15 come from the second operand (indices 24..31 = bytes 8..15
 * of the old *zbuf), so ushorts 4..7 are preserved.
 */
*zbuf = (vector unsigned short)
spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
VEC_LITERAL(vector unsigned char,
2, 3, 6, 7, 10, 11, 14, 15,
24, 25, 26, 27, 28, 29, 30, 31));
}
return mask;
#undef ZERO
}
/**
 * Perform a "less than" Z test for a Z buffer that stores 32-bit uints,
 * updating the Z buffer in place for the fragments that pass.
 *
 * \param zvals     vector of four fragment zvalues as floats
 * \param zbuf_ptr  ptr to the vector of four uint zbuffer values for the quad
 * \param inMask    indicates which fragments in the quad are alive
 * \return new mask indicating which fragments are alive after ztest
 *         (the per-fragment compare result ANDed with inMask)
 */
static INLINE vector unsigned int
spu_z32_test_less(vector float zvals, vector unsigned int *zbuf_ptr,
                  vector unsigned int inMask)
{
   /* Z values currently stored for this quad */
   const vector unsigned int stored = *zbuf_ptr;

   /* fragment Z converted from float to uint in [0, 0xffffffff] */
   const vector unsigned int frag = spu_convtu(zvals, 32);

   /* pass = ((frag < stored) ? ~0 : 0) & inMask */
   const vector unsigned int pass = spu_and(spu_cmpgt(stored, frag), inMask);

   /* stored = pass ? frag : stored */
   *zbuf_ptr = spu_sel(stored, frag, pass);

   return pass;
}
#endif /* SPU_ZTEST_H */