mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-29 07:20:23 +01:00
llvmpipe: Optimize do_triangle_ccw for POWER8
This patch converts the SSE optimization done in do_triangle_ccw to
VMX/VSX.
I measured the results on a POWER8 machine with 32 cores at 3.4GHz and
16GB of RAM.
FPS/Score
Name Before After Delta
------------------------------------------------
glmark2 (score) 136.6 139.8 2.34%
openarena 16.14 16.35 1.30%
xonotic 4.655 4.707 1.11%
v2:
- Convert loads to use aligned loads
- Make sure the code is built only on POWER8 LE machines
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
This commit is contained in:
parent
e99555ef0b
commit
3bbe16ea79
1 changed files with 100 additions and 0 deletions
|
|
@ -46,6 +46,9 @@
|
|||
|
||||
#if defined(PIPE_ARCH_SSE)
|
||||
#include <emmintrin.h>
|
||||
#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
|
||||
#include <altivec.h>
|
||||
#include "util/u_pwr8.h"
|
||||
#endif
|
||||
|
||||
static inline int
|
||||
|
|
@ -462,6 +465,103 @@ do_triangle_ccw(struct lp_setup_context *setup,
|
|||
STORE_PLANE(plane[2], p2);
|
||||
#undef STORE_PLANE
|
||||
} else
|
||||
#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
|
||||
/*
|
||||
* XXX this code is effectively disabled for all practical purposes,
|
||||
* as the allowed fb size is tiny if FIXED_ORDER is 8.
|
||||
*/
|
||||
if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
|
||||
setup->fb.height <= MAX_FIXED_LENGTH32 &&
|
||||
(bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
|
||||
(bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) {
|
||||
unsigned int bottom_edge;
|
||||
__m128i vertx, verty;
|
||||
__m128i shufx, shufy;
|
||||
__m128i dcdx, dcdy, c;
|
||||
__m128i unused;
|
||||
__m128i dcdx_neg_mask;
|
||||
__m128i dcdy_neg_mask;
|
||||
__m128i dcdx_zero_mask;
|
||||
__m128i top_left_flag;
|
||||
__m128i c_inc_mask, c_inc;
|
||||
__m128i eo, p0, p1, p2;
|
||||
__m128i_union vshuf_mask;
|
||||
__m128i zero = vec_splats((unsigned char) 0);
|
||||
PIPE_ALIGN_VAR(16) int32_t temp_vec[4];
|
||||
|
||||
#ifdef PIPE_ARCH_LITTLE_ENDIAN
|
||||
vshuf_mask.i[0] = 0x07060504;
|
||||
vshuf_mask.i[1] = 0x0B0A0908;
|
||||
vshuf_mask.i[2] = 0x03020100;
|
||||
vshuf_mask.i[3] = 0x0F0E0D0C;
|
||||
#else
|
||||
vshuf_mask.i[0] = 0x00010203;
|
||||
vshuf_mask.i[1] = 0x0C0D0E0F;
|
||||
vshuf_mask.i[2] = 0x04050607;
|
||||
vshuf_mask.i[3] = 0x08090A0B;
|
||||
#endif
|
||||
|
||||
/* vertex x coords */
|
||||
vertx = vec_load_si128((const uint32_t *) position->x);
|
||||
/* vertex y coords */
|
||||
verty = vec_load_si128((const uint32_t *) position->y);
|
||||
|
||||
shufx = vec_perm (vertx, vertx, vshuf_mask.m128i);
|
||||
shufy = vec_perm (verty, verty, vshuf_mask.m128i);
|
||||
|
||||
dcdx = vec_sub_epi32(verty, shufy);
|
||||
dcdy = vec_sub_epi32(vertx, shufx);
|
||||
|
||||
dcdx_neg_mask = vec_srai_epi32(dcdx, 31);
|
||||
dcdx_zero_mask = vec_cmpeq_epi32(dcdx, zero);
|
||||
dcdy_neg_mask = vec_srai_epi32(dcdy, 31);
|
||||
|
||||
bottom_edge = (setup->bottom_edge_rule == 0) ? ~0 : 0;
|
||||
top_left_flag = (__m128i) vec_splats(bottom_edge);
|
||||
|
||||
c_inc_mask = vec_or(dcdx_neg_mask,
|
||||
vec_and(dcdx_zero_mask,
|
||||
vec_xor(dcdy_neg_mask,
|
||||
top_left_flag)));
|
||||
|
||||
c_inc = vec_srli_epi32(c_inc_mask, 31);
|
||||
|
||||
c = vec_sub_epi32(vec_mullo_epi32(dcdx, vertx),
|
||||
vec_mullo_epi32(dcdy, verty));
|
||||
|
||||
c = vec_add_epi32(c, c_inc);
|
||||
|
||||
/* Scale up to match c:
|
||||
*/
|
||||
dcdx = vec_slli_epi32(dcdx, FIXED_ORDER);
|
||||
dcdy = vec_slli_epi32(dcdy, FIXED_ORDER);
|
||||
|
||||
/* Calculate trivial reject values:
|
||||
*/
|
||||
eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy),
|
||||
vec_and(dcdx_neg_mask, dcdx));
|
||||
|
||||
/* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
|
||||
|
||||
/* Pointless transpose which gets undone immediately in
|
||||
* rasterization:
|
||||
*/
|
||||
transpose4_epi32(&c, &dcdx, &dcdy, &eo,
|
||||
&p0, &p1, &p2, &unused);
|
||||
|
||||
#define STORE_PLANE(plane, vec) do { \
|
||||
vec_store_si128((uint32_t *)&temp_vec, vec); \
|
||||
plane.c = (int64_t)temp_vec[0]; \
|
||||
plane.dcdx = temp_vec[1]; \
|
||||
plane.dcdy = temp_vec[2]; \
|
||||
plane.eo = temp_vec[3]; \
|
||||
} while(0)
|
||||
|
||||
STORE_PLANE(plane[0], p0);
|
||||
STORE_PLANE(plane[1], p1);
|
||||
STORE_PLANE(plane[2], p2);
|
||||
#undef STORE_PLANE
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
int i;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue