Fetch routines convert and transpose all 4 vertices at once.

This commit is contained in:
Ian Romanick 2008-02-12 11:29:34 -08:00
parent 125451b9f0
commit dd07e154d2
2 changed files with 71 additions and 83 deletions

View file

@ -100,7 +100,7 @@ fetch_unaligned(qword *dst, unsigned ea, unsigned size)
}
#define CVT_32_FLOAT(q) (*q)
#define CVT_32_FLOAT(q) (*(q))
static INLINE qword
CVT_64_FLOAT(const qword *qw)
@ -242,85 +242,90 @@ CVT_32_SNORM(const qword *qw)
* This is probably needed/duplicated elsewhere, e.g. format
* conversion, texture sampling etc.
*/
#define FETCH_ATTRIB( NAME, SZ, CVT ) \
static qword \
fetch_##NAME(const qword *qw) \
{ \
qword expanded = CVT(qw); \
return si_selb(expanded, (qword) defaults, SZ); \
/**
 * Generate a static function fetch_<NAME>(qword *out, const qword *in)
 * that handles four input entries in a single call.
 *
 * NAME - suffix pasted onto "fetch_" to form the function name.
 * SZ   - third operand handed to si_selb() together with the converted
 *        value and `defaults`.  NOTE(review): presumably a select mask
 *        that lets components missing from the format take their
 *        default values -- confirm against the SZ_* definitions.
 * CVT  - conversion macro/function applied to a pointer to each entry.
 * N    - distance, in qwords, between consecutive entries in `in`
 *        (entry k is read from in + k * N).
 *
 * The four converted results are collected in a local 4-element array
 * and then passed to _transpose_matrix4x4(), which writes its result
 * to `out`.
 */
#define FETCH_ATTRIB( NAME, SZ, CVT, N ) \
static void \
fetch_##NAME(qword *out, const qword *in) \
{ \
qword tmp[4]; \
\
tmp[0] = si_selb(CVT(in + (0 * N)), (qword) defaults, SZ); \
tmp[1] = si_selb(CVT(in + (1 * N)), (qword) defaults, SZ); \
tmp[2] = si_selb(CVT(in + (2 * N)), (qword) defaults, SZ); \
tmp[3] = si_selb(CVT(in + (3 * N)), (qword) defaults, SZ); \
_transpose_matrix4x4((vec_float4 *) out, (vec_float4 *) tmp); \
}
FETCH_ATTRIB( R64G64B64A64_FLOAT, SZ_4, CVT_64_FLOAT )
FETCH_ATTRIB( R64G64B64_FLOAT, SZ_3, CVT_64_FLOAT )
FETCH_ATTRIB( R64G64_FLOAT, SZ_2, CVT_64_FLOAT )
FETCH_ATTRIB( R64_FLOAT, SZ_1, CVT_64_FLOAT )
FETCH_ATTRIB( R64G64B64A64_FLOAT, SZ_4, CVT_64_FLOAT, 2 )
FETCH_ATTRIB( R64G64B64_FLOAT, SZ_3, CVT_64_FLOAT, 2 )
FETCH_ATTRIB( R64G64_FLOAT, SZ_2, CVT_64_FLOAT, 2 )
FETCH_ATTRIB( R64_FLOAT, SZ_1, CVT_64_FLOAT, 2 )
FETCH_ATTRIB( R32G32B32A32_FLOAT, SZ_4, CVT_32_FLOAT )
FETCH_ATTRIB( R32G32B32_FLOAT, SZ_3, CVT_32_FLOAT )
FETCH_ATTRIB( R32G32_FLOAT, SZ_2, CVT_32_FLOAT )
FETCH_ATTRIB( R32_FLOAT, SZ_1, CVT_32_FLOAT )
FETCH_ATTRIB( R32G32B32A32_FLOAT, SZ_4, CVT_32_FLOAT, 1 )
FETCH_ATTRIB( R32G32B32_FLOAT, SZ_3, CVT_32_FLOAT, 1 )
FETCH_ATTRIB( R32G32_FLOAT, SZ_2, CVT_32_FLOAT, 1 )
FETCH_ATTRIB( R32_FLOAT, SZ_1, CVT_32_FLOAT, 1 )
FETCH_ATTRIB( R32G32B32A32_USCALED, SZ_4, CVT_32_USCALED )
FETCH_ATTRIB( R32G32B32_USCALED, SZ_3, CVT_32_USCALED )
FETCH_ATTRIB( R32G32_USCALED, SZ_2, CVT_32_USCALED )
FETCH_ATTRIB( R32_USCALED, SZ_1, CVT_32_USCALED )
FETCH_ATTRIB( R32G32B32A32_USCALED, SZ_4, CVT_32_USCALED, 1 )
FETCH_ATTRIB( R32G32B32_USCALED, SZ_3, CVT_32_USCALED, 1 )
FETCH_ATTRIB( R32G32_USCALED, SZ_2, CVT_32_USCALED, 1 )
FETCH_ATTRIB( R32_USCALED, SZ_1, CVT_32_USCALED, 1 )
FETCH_ATTRIB( R32G32B32A32_SSCALED, SZ_4, CVT_32_SSCALED )
FETCH_ATTRIB( R32G32B32_SSCALED, SZ_3, CVT_32_SSCALED )
FETCH_ATTRIB( R32G32_SSCALED, SZ_2, CVT_32_SSCALED )
FETCH_ATTRIB( R32_SSCALED, SZ_1, CVT_32_SSCALED )
FETCH_ATTRIB( R32G32B32A32_SSCALED, SZ_4, CVT_32_SSCALED, 1 )
FETCH_ATTRIB( R32G32B32_SSCALED, SZ_3, CVT_32_SSCALED, 1 )
FETCH_ATTRIB( R32G32_SSCALED, SZ_2, CVT_32_SSCALED, 1 )
FETCH_ATTRIB( R32_SSCALED, SZ_1, CVT_32_SSCALED, 1 )
FETCH_ATTRIB( R32G32B32A32_UNORM, SZ_4, CVT_32_UNORM )
FETCH_ATTRIB( R32G32B32_UNORM, SZ_3, CVT_32_UNORM )
FETCH_ATTRIB( R32G32_UNORM, SZ_2, CVT_32_UNORM )
FETCH_ATTRIB( R32_UNORM, SZ_1, CVT_32_UNORM )
FETCH_ATTRIB( R32G32B32A32_UNORM, SZ_4, CVT_32_UNORM, 1 )
FETCH_ATTRIB( R32G32B32_UNORM, SZ_3, CVT_32_UNORM, 1 )
FETCH_ATTRIB( R32G32_UNORM, SZ_2, CVT_32_UNORM, 1 )
FETCH_ATTRIB( R32_UNORM, SZ_1, CVT_32_UNORM, 1 )
FETCH_ATTRIB( R32G32B32A32_SNORM, SZ_4, CVT_32_SNORM )
FETCH_ATTRIB( R32G32B32_SNORM, SZ_3, CVT_32_SNORM )
FETCH_ATTRIB( R32G32_SNORM, SZ_2, CVT_32_SNORM )
FETCH_ATTRIB( R32_SNORM, SZ_1, CVT_32_SNORM )
FETCH_ATTRIB( R32G32B32A32_SNORM, SZ_4, CVT_32_SNORM, 1 )
FETCH_ATTRIB( R32G32B32_SNORM, SZ_3, CVT_32_SNORM, 1 )
FETCH_ATTRIB( R32G32_SNORM, SZ_2, CVT_32_SNORM, 1 )
FETCH_ATTRIB( R32_SNORM, SZ_1, CVT_32_SNORM, 1 )
FETCH_ATTRIB( R16G16B16A16_USCALED, SZ_4, CVT_16_USCALED )
FETCH_ATTRIB( R16G16B16_USCALED, SZ_3, CVT_16_USCALED )
FETCH_ATTRIB( R16G16_USCALED, SZ_2, CVT_16_USCALED )
FETCH_ATTRIB( R16_USCALED, SZ_1, CVT_16_USCALED )
FETCH_ATTRIB( R16G16B16A16_USCALED, SZ_4, CVT_16_USCALED, 1 )
FETCH_ATTRIB( R16G16B16_USCALED, SZ_3, CVT_16_USCALED, 1 )
FETCH_ATTRIB( R16G16_USCALED, SZ_2, CVT_16_USCALED, 1 )
FETCH_ATTRIB( R16_USCALED, SZ_1, CVT_16_USCALED, 1 )
FETCH_ATTRIB( R16G16B16A16_SSCALED, SZ_4, CVT_16_SSCALED )
FETCH_ATTRIB( R16G16B16_SSCALED, SZ_3, CVT_16_SSCALED )
FETCH_ATTRIB( R16G16_SSCALED, SZ_2, CVT_16_SSCALED )
FETCH_ATTRIB( R16_SSCALED, SZ_1, CVT_16_SSCALED )
FETCH_ATTRIB( R16G16B16A16_SSCALED, SZ_4, CVT_16_SSCALED, 1 )
FETCH_ATTRIB( R16G16B16_SSCALED, SZ_3, CVT_16_SSCALED, 1 )
FETCH_ATTRIB( R16G16_SSCALED, SZ_2, CVT_16_SSCALED, 1 )
FETCH_ATTRIB( R16_SSCALED, SZ_1, CVT_16_SSCALED, 1 )
FETCH_ATTRIB( R16G16B16A16_UNORM, SZ_4, CVT_16_UNORM )
FETCH_ATTRIB( R16G16B16_UNORM, SZ_3, CVT_16_UNORM )
FETCH_ATTRIB( R16G16_UNORM, SZ_2, CVT_16_UNORM )
FETCH_ATTRIB( R16_UNORM, SZ_1, CVT_16_UNORM )
FETCH_ATTRIB( R16G16B16A16_UNORM, SZ_4, CVT_16_UNORM, 1 )
FETCH_ATTRIB( R16G16B16_UNORM, SZ_3, CVT_16_UNORM, 1 )
FETCH_ATTRIB( R16G16_UNORM, SZ_2, CVT_16_UNORM, 1 )
FETCH_ATTRIB( R16_UNORM, SZ_1, CVT_16_UNORM, 1 )
FETCH_ATTRIB( R16G16B16A16_SNORM, SZ_4, CVT_16_SNORM )
FETCH_ATTRIB( R16G16B16_SNORM, SZ_3, CVT_16_SNORM )
FETCH_ATTRIB( R16G16_SNORM, SZ_2, CVT_16_SNORM )
FETCH_ATTRIB( R16_SNORM, SZ_1, CVT_16_SNORM )
FETCH_ATTRIB( R16G16B16A16_SNORM, SZ_4, CVT_16_SNORM, 1 )
FETCH_ATTRIB( R16G16B16_SNORM, SZ_3, CVT_16_SNORM, 1 )
FETCH_ATTRIB( R16G16_SNORM, SZ_2, CVT_16_SNORM, 1 )
FETCH_ATTRIB( R16_SNORM, SZ_1, CVT_16_SNORM, 1 )
FETCH_ATTRIB( R8G8B8A8_USCALED, SZ_4, CVT_8_USCALED )
FETCH_ATTRIB( R8G8B8_USCALED, SZ_3, CVT_8_USCALED )
FETCH_ATTRIB( R8G8_USCALED, SZ_2, CVT_8_USCALED )
FETCH_ATTRIB( R8_USCALED, SZ_1, CVT_8_USCALED )
FETCH_ATTRIB( R8G8B8A8_USCALED, SZ_4, CVT_8_USCALED, 1 )
FETCH_ATTRIB( R8G8B8_USCALED, SZ_3, CVT_8_USCALED, 1 )
FETCH_ATTRIB( R8G8_USCALED, SZ_2, CVT_8_USCALED, 1 )
FETCH_ATTRIB( R8_USCALED, SZ_1, CVT_8_USCALED, 1 )
FETCH_ATTRIB( R8G8B8A8_SSCALED, SZ_4, CVT_8_SSCALED )
FETCH_ATTRIB( R8G8B8_SSCALED, SZ_3, CVT_8_SSCALED )
FETCH_ATTRIB( R8G8_SSCALED, SZ_2, CVT_8_SSCALED )
FETCH_ATTRIB( R8_SSCALED, SZ_1, CVT_8_SSCALED )
FETCH_ATTRIB( R8G8B8A8_SSCALED, SZ_4, CVT_8_SSCALED, 1 )
FETCH_ATTRIB( R8G8B8_SSCALED, SZ_3, CVT_8_SSCALED, 1 )
FETCH_ATTRIB( R8G8_SSCALED, SZ_2, CVT_8_SSCALED, 1 )
FETCH_ATTRIB( R8_SSCALED, SZ_1, CVT_8_SSCALED, 1 )
FETCH_ATTRIB( R8G8B8A8_UNORM, SZ_4, CVT_8_UNORM )
FETCH_ATTRIB( R8G8B8_UNORM, SZ_3, CVT_8_UNORM )
FETCH_ATTRIB( R8G8_UNORM, SZ_2, CVT_8_UNORM )
FETCH_ATTRIB( R8_UNORM, SZ_1, CVT_8_UNORM )
FETCH_ATTRIB( R8G8B8A8_UNORM, SZ_4, CVT_8_UNORM, 1 )
FETCH_ATTRIB( R8G8B8_UNORM, SZ_3, CVT_8_UNORM, 1 )
FETCH_ATTRIB( R8G8_UNORM, SZ_2, CVT_8_UNORM, 1 )
FETCH_ATTRIB( R8_UNORM, SZ_1, CVT_8_UNORM, 1 )
FETCH_ATTRIB( R8G8B8A8_SNORM, SZ_4, CVT_8_SNORM )
FETCH_ATTRIB( R8G8B8_SNORM, SZ_3, CVT_8_SNORM )
FETCH_ATTRIB( R8G8_SNORM, SZ_2, CVT_8_SNORM )
FETCH_ATTRIB( R8_SNORM, SZ_1, CVT_8_SNORM )
FETCH_ATTRIB( R8G8B8A8_SNORM, SZ_4, CVT_8_SNORM, 1 )
FETCH_ATTRIB( R8G8B8_SNORM, SZ_3, CVT_8_SNORM, 1 )
FETCH_ATTRIB( R8G8_SNORM, SZ_2, CVT_8_SNORM, 1 )
FETCH_ATTRIB( R8_SNORM, SZ_1, CVT_8_SNORM, 1 )
FETCH_ATTRIB( A8R8G8B8_UNORM, SZ_4, CVT_8_UNORM )
FETCH_ATTRIB( A8R8G8B8_UNORM, SZ_4, CVT_8_UNORM, 1 )
@ -584,7 +589,6 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
unsigned idx;
const unsigned bytes_per_entry = draw->vertex_fetch.size[attr];
const unsigned quads_per_entry = (bytes_per_entry + 15) / 16;
qword p[4];
qword in[2 * 4];
@ -609,23 +613,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
/* Convert all 4 vertices to vectors of float.
*/
idx = 0;
for (i = 0; i < 4; i++) {
p[i] = (*fetch)(in + idx);
idx += quads_per_entry;
}
/* Transpose/swizzle into vector-friendly format. Currently
* assuming that all vertex shader inputs are float[4], but this
* isn't true -- if the vertex shader only wants tex0.xy, we
* could optimize for that.
*
* To do so fully without codegen would probably require an
* excessive number of fetch functions, but we could at least
* minimize the transpose step:
*/
_transpose_matrix4x4(&machine->Inputs[attr].xyzw[0].q, p);
(*fetch)(&machine->Inputs[attr].xyzw[0].q, in);
}
}

View file

@ -6,7 +6,7 @@
struct spu_vs_context;
typedef qword (*spu_fetch_func)(const qword *qw);
typedef void (*spu_fetch_func)(qword *out, const qword *in);
typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
struct spu_exec_machine *machine,
const unsigned *elts,