diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h index eb401986278..de06745b311 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.h +++ b/src/gallium/drivers/llvmpipe/lp_rast.h @@ -60,16 +60,25 @@ struct cmd_bin; * FIXED_TYPE_WIDTH, any larger and we could overflow a * FIXED_TYPE_WIDTH_-bit int. */ -#define MAX_FIXED_LENGTH (1 << (((FIXED_TYPE_WIDTH/2) - 1) - FIXED_ORDER)) +#define MAX_FIXED_LENGTH (1 << (FIXED_TYPE_WIDTH / 2 - 1 - FIXED_ORDER)) -#define MAX_FIXED_LENGTH32 (1 << (((32/2) - 1) - FIXED_ORDER)) +#define MAX_FIXED_LENGTH32_BLOCK (1 << (32 / 2 - 1 - FIXED_ORDER)) +/* Maximum length of an edge supported by build_mask_linear_32, + * empirically determined. + */ +#define MAX_FIXED_LENGTH32_TILE (1 << 16) /* Rasterizer output size going to jit fs, width/height */ #define LP_RASTER_BLOCK_SIZE 4 #define LP_MAX_ACTIVE_BINNED_QUERIES 64 +#define TO_FIXED64(a) (((int64_t)(a)) << FIXED_ORDER) + +/* 64bit wide multiplication of 2 ints */ #define IMUL64(a, b) (((int64_t)(a)) * ((int64_t)(b))) +/* 64bit wide multiplication of 2 ints, returned as fixed point */ +#define IMUL64_FIXED(a, b) IMUL64(TO_FIXED64(a), b) struct lp_rasterizer_task; @@ -131,7 +140,7 @@ struct lp_rast_plane { int32_t dcdy; /* one-pixel sized trivial reject offsets for each plane */ - uint32_t eo; + int32_t eo; /* * We rely on this struct being 64bit aligned (ideally it would be 128bit * but that's quite the waste) and therefore on 32bit we need padding diff --git a/src/gallium/drivers/llvmpipe/lp_rast_debug.c b/src/gallium/drivers/llvmpipe/lp_rast_debug.c index 5d1702e945a..ffd8ace1f89 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_debug.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_debug.c @@ -324,8 +324,8 @@ debug_triangle(int tilex, int tiley, while (plane_mask) { plane[nr_planes] = tri_plane[u_bit_scan(&plane_mask)]; plane[nr_planes].c = (plane[nr_planes].c + - IMUL64(plane[nr_planes].dcdy, tiley) - - IMUL64(plane[nr_planes].dcdx, tilex)); + 
IMUL64_FIXED(plane[nr_planes].dcdy, tiley) - + IMUL64_FIXED(plane[nr_planes].dcdx, tilex)); nr_planes++; } @@ -340,12 +340,12 @@ debug_triangle(int tilex, int tiley, out: for (i = 0; i < nr_planes; i++) - plane[i].c -= plane[i].dcdx; + plane[i].c -= TO_FIXED64(plane[i].dcdx); } for (i = 0; i < nr_planes; i++) { - plane[i].c += IMUL64(plane[i].dcdx, TILE_SIZE); - plane[i].c += plane[i].dcdy; + plane[i].c += IMUL64_FIXED(plane[i].dcdx, TILE_SIZE); + plane[i].c += TO_FIXED64(plane[i].dcdy); } } return count; diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index 8415a23a2c1..6ab962edcb9 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -35,6 +35,14 @@ #include "lp_perf.h" #include "lp_rast_priv.h" +#if DETECT_ARCH_SSE +#include <emmintrin.h> +#include "util/u_sse.h" +#elif defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN +#include <altivec.h> +#include "util/u_pwr8.h" +#endif + /** * Shade all pixels in a 4x4 block. */ @@ -46,7 +54,6 @@ block_full_4(struct lp_rasterizer_task *task, lp_rast_shade_quads_all(task, &tri->inputs, x, y); } - /** * Shade all pixels in a 16x16 block. 
*/ @@ -62,49 +69,6 @@ block_full_16(struct lp_rasterizer_task *task, block_full_4(task, tri, x + ix, y + iy); } -static inline unsigned -build_mask_linear(int32_t c, int32_t dcdx, int32_t dcdy) -{ - unsigned mask = 0; - - int32_t c0 = c; - int32_t c1 = c0 + dcdy; - int32_t c2 = c1 + dcdy; - int32_t c3 = c2 + dcdy; - - mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0); - mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1); - mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2); - mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3); - mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4); - mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5); - mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6); - mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7); - mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8); - mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9); - mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10); - mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11); - mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12); - mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13); - mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14); - mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15); - - return mask; -} - - -UNUSED static inline void -build_masks(int32_t c, - int32_t cdiff, - int32_t dcdx, - int32_t dcdy, - unsigned *outmask, - unsigned *partmask) -{ - *outmask |= build_mask_linear(c, dcdx, dcdy); - *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy); -} - void lp_rast_triangle_3_16(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) @@ -161,12 +125,10 @@ lp_rast_triangle_ms_4_16(struct lp_rasterizer_task *task, #if DETECT_ARCH_SSE -#include <emmintrin.h> -#include "util/u_sse.h" - +#define HAS_BUILD_MASKS_32_SIMD 1 static inline void -build_masks_sse(int c, +build_masks_32(int c, int cdiff, int dcdx, int dcdy, @@ -212,7 +174,7 @@ build_masks_sse(int c, static inline unsigned -build_mask_linear_sse(int c, int dcdx, int dcdy) +build_mask_linear_32(int c, int dcdx, int dcdy) { __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); __m128i xdcdy = _mm_set1_epi32(dcdy); @@ -237,56 +199,7 @@ 
build_mask_linear_sse(int c, int dcdx, int dcdy) return _mm_movemask_epi8(result); } -static inline unsigned -sign_bits4(const __m128i *cstep, int cdiff) -{ - - /* Adjust the step values - */ - __m128i cio4 = _mm_set1_epi32(cdiff); - __m128i cstep0 = _mm_add_epi32(cstep[0], cio4); - __m128i cstep1 = _mm_add_epi32(cstep[1], cio4); - __m128i cstep2 = _mm_add_epi32(cstep[2], cio4); - __m128i cstep3 = _mm_add_epi32(cstep[3], cio4); - - /* Pack down to epi8 - */ - __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1); - __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3); - __m128i result = _mm_packs_epi16(cstep01, cstep23); - - /* Extract the sign bits - */ - return _mm_movemask_epi8(result); -} - -#define COLUMN0 ((1<<0)|(1<<4)|(1<<8) |(1<<12)) -#define COLUMN1 ((1<<1)|(1<<5)|(1<<9) |(1<<13)) -#define COLUMN2 ((1<<2)|(1<<6)|(1<<10)|(1<<14)) -#define COLUMN3 ((1<<3)|(1<<7)|(1<<11)|(1<<15)) - -#define ROW0 ((1<<0) |(1<<1) |(1<<2) |(1<<3)) -#define ROW1 ((1<<4) |(1<<5) |(1<<6) |(1<<7)) -#define ROW2 ((1<<8) |(1<<9) |(1<<10)|(1<<11)) -#define ROW3 ((1<<12)|(1<<13)|(1<<14)|(1<<15)) - -#define STAMP_SIZE 4 -static unsigned bottom_mask_tab[STAMP_SIZE] = { - ROW3, - ROW3 | ROW2, - ROW3 | ROW2 | ROW1, - ROW3 | ROW2 | ROW1 | ROW0, -}; - -static unsigned right_mask_tab[STAMP_SIZE] = { - COLUMN3, - COLUMN3 | COLUMN2, - COLUMN3 | COLUMN2 | COLUMN1, - COLUMN3 | COLUMN2 | COLUMN1 | COLUMN0, -}; - - -#define NR_PLANES 3 +#define HAS_RAST_TRIANGLE_3_16_SIMD 1 void lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, @@ -328,14 +241,18 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, */ dcdx = _mm_sub_epi32(zero, dcdx); - c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); - c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); + /* c, dcdx, dcdy are in fixed point, x and y are integers. 
*/ + c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x << FIXED_ORDER))); + c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y << FIXED_ORDER))); rej4 = _mm_slli_epi32(rej4, 2); /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */ c = _mm_sub_epi32(c, _mm_set1_epi32(1)); rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1)); + /* We can do the rest with integers */ + c = _mm_srai_epi32(c, FIXED_ORDER); + dcdx2 = _mm_add_epi32(dcdx, dcdx); dcdx3 = _mm_add_epi32(dcdx2, dcdx); @@ -400,6 +317,8 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, 0xffff & ~out[i].mask); } +#define HAS_RAST_TRIANGLE_3_4_SIMD 1 + void lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) @@ -430,12 +349,16 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, */ dcdx = _mm_sub_epi32(zero, dcdx); - c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); - c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); + /* c, dcdx, dcdy are in fixed point, x and y are integers. 
*/ + c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x << FIXED_ORDER))); + c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y << FIXED_ORDER))); /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */ c = _mm_sub_epi32(c, _mm_set1_epi32(1)); + /* We can do the rest with integers */ + c = _mm_srai_epi32(c, FIXED_ORDER); + dcdx2 = _mm_add_epi32(dcdx, dcdx); dcdx3 = _mm_add_epi32(dcdx2, dcdx); @@ -482,17 +405,63 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, } } -#undef NR_PLANES +/* Defined in lp_rast_tri_tmp.h */ +#define HAS_RAST_TRIANGLE_4_16_SIMD 1 -#else +static inline unsigned +sign_bits4(const __m128i *cstep, int cdiff) +{ -#if defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN + /* Adjust the step values + */ + __m128i cio4 = _mm_set1_epi32(cdiff); + __m128i cstep0 = _mm_add_epi32(cstep[0], cio4); + __m128i cstep1 = _mm_add_epi32(cstep[1], cio4); + __m128i cstep2 = _mm_add_epi32(cstep[2], cio4); + __m128i cstep3 = _mm_add_epi32(cstep[3], cio4); -#include <altivec.h> -#include "util/u_pwr8.h" + /* Pack down to epi8 + */ + __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1); + __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3); + __m128i result = _mm_packs_epi16(cstep01, cstep23); + + /* Extract the sign bits + */ + return _mm_movemask_epi8(result); +} + +#define COLUMN0 ((1<<0)|(1<<4)|(1<<8) |(1<<12)) +#define COLUMN1 ((1<<1)|(1<<5)|(1<<9) |(1<<13)) +#define COLUMN2 ((1<<2)|(1<<6)|(1<<10)|(1<<14)) +#define COLUMN3 ((1<<3)|(1<<7)|(1<<11)|(1<<15)) + +#define ROW0 ((1<<0) |(1<<1) |(1<<2) |(1<<3)) +#define ROW1 ((1<<4) |(1<<5) |(1<<6) |(1<<7)) +#define ROW2 ((1<<8) |(1<<9) |(1<<10)|(1<<11)) +#define ROW3 ((1<<12)|(1<<13)|(1<<14)|(1<<15)) + +#define STAMP_SIZE 4 +static unsigned bottom_mask_tab[STAMP_SIZE] = { + ROW3, + ROW3 | ROW2, + ROW3 | ROW2 | ROW1, + ROW3 | ROW2 | ROW1 | ROW0, +}; + +static unsigned right_mask_tab[STAMP_SIZE] = { + COLUMN3, + COLUMN3 | COLUMN2, + COLUMN3 | COLUMN2 | 
COLUMN1, + COLUMN3 | COLUMN2 | COLUMN1 | COLUMN0, +}; + +#elif defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN + +#define HAS_BUILD_MASKS_32_SIMD 1 static inline void -build_masks_ppc(int c, +build_masks_32(int c, int cdiff, int dcdx, int dcdy, @@ -537,7 +506,7 @@ build_masks_ppc(int c, } static inline unsigned -build_mask_linear_ppc(int c, int dcdx, int dcdy) +build_mask_linear_32(int c, int dcdx, int dcdy) { __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); __m128i xdcdy = (__m128i) vec_splats(dcdy); @@ -569,7 +538,7 @@ lp_plane_to_m128i(const struct lp_rast_plane *plane) (int32_t)plane->dcdy, (int32_t)plane->eo); } -#define NR_PLANES 3 +#define HAS_RAST_TRIANGLE_3_16_SIMD 1 void lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, @@ -622,8 +591,9 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, */ dcdx = vec_sub_epi32(zero, dcdx); - c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x))); - c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y))); + /* c, dcdx, dcdy are in fixed point, x and y are integers. 
*/ + c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i)vec_splats(x << FIXED_ORDER))); + c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i)vec_splats(y << FIXED_ORDER))); rej4 = vec_slli_epi32(rej4, 2); /* @@ -633,6 +603,9 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1)); rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1)); + /* We can do the rest with integers */ + c = vec_srai_epi32(c, FIXED_ORDER); + dcdx2 = vec_add_epi32(dcdx, dcdx); dcdx3 = vec_add_epi32(dcdx2, dcdx); @@ -699,8 +672,97 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, #undef NR_PLANES -#else +#endif +#if !HAS_BUILD_MASKS_SIMD +static inline unsigned +build_mask_linear(int64_t c, int32_t dcdx, int32_t dcdy) +{ + unsigned mask = 0; + + int64_t c0 = c; + int64_t c1 = c0 + dcdy; + int64_t c2 = c1 + dcdy; + int64_t c3 = c2 + dcdy; + + mask |= ((c0 + 0 * dcdx) >> 63) & (1 << 0); + mask |= ((c0 + 1 * dcdx) >> 63) & (1 << 1); + mask |= ((c0 + 2 * dcdx) >> 63) & (1 << 2); + mask |= ((c0 + 3 * dcdx) >> 63) & (1 << 3); + mask |= ((c1 + 0 * dcdx) >> 63) & (1 << 4); + mask |= ((c1 + 1 * dcdx) >> 63) & (1 << 5); + mask |= ((c1 + 2 * dcdx) >> 63) & (1 << 6); + mask |= ((c1 + 3 * dcdx) >> 63) & (1 << 7); + mask |= ((c2 + 0 * dcdx) >> 63) & (1 << 8); + mask |= ((c2 + 1 * dcdx) >> 63) & (1 << 9); + mask |= ((c2 + 2 * dcdx) >> 63) & (1 << 10); + mask |= ((c2 + 3 * dcdx) >> 63) & (1 << 11); + mask |= ((c3 + 0 * dcdx) >> 63) & (1 << 12); + mask |= ((c3 + 1 * dcdx) >> 63) & (1 << 13); + mask |= ((c3 + 2 * dcdx) >> 63) & (1 << 14); + mask |= ((c3 + 3 * dcdx) >> 63) & (1 << 15); + + return mask; +} + +static inline void +build_masks(int64_t c, + int32_t cdiff, + int32_t dcdx, + int32_t dcdy, + unsigned *outmask, + unsigned *partmask) +{ + *outmask |= build_mask_linear(c, dcdx, dcdy); + *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy); +} +#endif + +#if !HAS_BUILD_MASKS_32_SIMD +static inline unsigned 
+build_mask_linear_32(int32_t c, int32_t dcdx, int32_t dcdy) +{ + unsigned mask = 0; + + int32_t c0 = c; + int32_t c1 = c0 + dcdy; + int32_t c2 = c1 + dcdy; + int32_t c3 = c2 + dcdy; + + mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0); + mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1); + mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2); + mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3); + mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4); + mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5); + mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6); + mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7); + mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8); + mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9); + mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10); + mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11); + mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12); + mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13); + mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14); + mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15); + + return mask; +} + +static inline void +build_masks_32(int32_t c, + int32_t cdiff, + int32_t dcdx, + int32_t dcdy, + unsigned *outmask, + unsigned *partmask) +{ + *outmask |= build_mask_linear_32(c, dcdx, dcdy); + *partmask |= build_mask_linear_32(c + cdiff, dcdx, dcdy); +} +#endif + +#if !HAS_RAST_TRIANGLE_3_16_SIMD void lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) @@ -710,9 +772,9 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, arg2.triangle.plane_mask = (1<<3)-1; lp_rast_triangle_32_3(task, arg2); } +#endif -#endif /* _ARCH_PWR8 && UTIL_ARCH_LITTLE_ENDIAN */ - +#if !HAS_RAST_TRIANGLE_4_16_SIMD void lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) @@ -722,25 +784,15 @@ lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task, arg2.triangle.plane_mask = (1<<4)-1; lp_rast_triangle_32_4(task, arg2); } +#endif +#if !HAS_RAST_TRIANGLE_3_4_SIMD void lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { 
lp_rast_triangle_32_3_16(task, arg); } - -#endif - -#if DETECT_ARCH_SSE -#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_sse((int)c, (int)cdiff, dcdx, dcdy, omask, pmask) -#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_sse((int)c, dcdx, dcdy) -#elif (defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN) -#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_ppc((int)c, (int)cdiff, dcdx, dcdy, omask, pmask) -#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_ppc((int)c, dcdx, dcdy) -#else -#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask) -#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy) #endif #define RASTER_64 1 diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h index 0aeb52810ac..443eda0d981 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h @@ -55,28 +55,19 @@ TAG(do_block_4)(struct lp_rasterizer_task *task, for (unsigned j = 0; j < NR_PLANES; j++) { #ifndef MULTISAMPLE -#ifdef RASTER_64 - mask[0] &= ~BUILD_MASK_LINEAR(((c[j] - 1) >> (int64_t)FIXED_ORDER), - -plane[j].dcdx >> FIXED_ORDER, - plane[j].dcdy >> FIXED_ORDER); -#else - mask[0] &= ~BUILD_MASK_LINEAR((c[j] - 1), - -plane[j].dcdx, - plane[j].dcdy); -#endif + mask[0] &= ~build_mask_linear_32( + (int32_t)((c[j] - 1) >> FIXED_ORDER), + -plane[j].dcdx, + plane[j].dcdy); #else for (unsigned s = 0; s < task->scene->fb_max_samples; s++) { - int64_t new_c = (c[j]) + ((IMUL64(task->scene->fixed_sample_pos[s][1], plane[j].dcdy) + IMUL64(task->scene->fixed_sample_pos[s][0], -plane[j].dcdx)) >> FIXED_ORDER); - uint32_t build_mask; -#ifdef RASTER_64 - build_mask = BUILD_MASK_LINEAR((int32_t)((new_c - 1) >> (int64_t)FIXED_ORDER), - -plane[j].dcdx >> FIXED_ORDER, - plane[j].dcdy >> FIXED_ORDER); -#else - build_mask = BUILD_MASK_LINEAR((new_c - 1), - -plane[j].dcdx, - 
plane[j].dcdy); -#endif + int64_t new_c = c[j] + + (IMUL64(task->scene->fixed_sample_pos[s][1], plane[j].dcdy) + + IMUL64(task->scene->fixed_sample_pos[s][0], -plane[j].dcdx)); + uint32_t build_mask = build_mask_linear_32( + (int32_t)((new_c - 1) >> FIXED_ORDER), + -plane[j].dcdx, + plane[j].dcdy); mask[s / 4] &= ~((uint64_t)build_mask << ((s % 4) * 16)); } #endif @@ -104,33 +95,17 @@ TAG(do_block_16)(struct lp_rasterizer_task *task, unsigned partmask = 0; /* outside one or more trivial accept planes */ for (unsigned j = 0; j < NR_PLANES; j++) { -#ifdef RASTER_64 - int32_t dcdx = -plane[j].dcdx >> FIXED_ORDER; - int32_t dcdy = plane[j].dcdy >> FIXED_ORDER; - const int32_t cox = plane[j].eo >> FIXED_ORDER; - const int32_t ei = (dcdy + dcdx - cox) << 2; - const int32_t cox_s = cox << 2; - const int32_t co = (int32_t)(c[j] >> (int64_t)FIXED_ORDER) + cox_s; - int32_t cdiff; - cdiff = ei - cox_s + ((int32_t)((c[j] - 1) >> (int64_t)FIXED_ORDER) - - (int32_t)(c[j] >> (int64_t)FIXED_ORDER)); - dcdx <<= 2; - dcdy <<= 2; -#else - const int64_t dcdx = -IMUL64(plane[j].dcdx, 4); - const int64_t dcdy = IMUL64(plane[j].dcdy, 4); - const int64_t cox = IMUL64(plane[j].eo, 4); - const int32_t ei = plane[j].dcdy - plane[j].dcdx - (int64_t)plane[j].eo; - const int64_t cio = IMUL64(ei, 4) - 1; - int32_t co, cdiff; - co = c[j] + cox; - cdiff = cio - cox; -#endif - - BUILD_MASKS(co, cdiff, - dcdx, dcdy, - &outmask, /* sign bits from c[i][0..15] + cox */ - &partmask); /* sign bits from c[i][0..15] + cio */ + const int32_t dcdx = -plane[j].dcdx << 2; + const int32_t dcdy = plane[j].dcdy << 2; + const int32_t cox = plane[j].eo << 2; + const int32_t ei = dcdy + dcdx - cox; + const int32_t co = (int32_t)((c[j] >> FIXED_ORDER) + cox); + const int32_t cdiff = ei - cox + + (int32_t)(((c[j] - 1) >> FIXED_ORDER) - (c[j] >> FIXED_ORDER)); + build_masks_32(co, cdiff, + dcdx, dcdy, + &outmask, /* sign bits from c[i][0..15] + cox */ + &partmask); /* sign bits from c[i][0..15] + cio */ } if (outmask 
== 0xffff) @@ -165,8 +140,8 @@ TAG(do_block_16)(struct lp_rasterizer_task *task, for (unsigned j = 0; j < NR_PLANES; j++) { cx[j] = (c[j] - - IMUL64(plane[j].dcdx, ix) - + IMUL64(plane[j].dcdy, iy)); + - IMUL64_FIXED(plane[j].dcdx, ix) + + IMUL64_FIXED(plane[j].dcdy, iy)); } TAG(do_block_4)(task, tri, plane, px, py, cx); @@ -218,73 +193,29 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, int i = ffs(plane_mask) - 1; plane[j] = tri_plane[i]; plane_mask &= ~(1 << i); - c[j] = plane[j].c + IMUL64(plane[j].dcdy, y) - IMUL64(plane[j].dcdx, x); + c[j] = plane[j].c + + IMUL64_FIXED(plane[j].dcdy, y) - + IMUL64_FIXED(plane[j].dcdx, x); { -#ifdef RASTER_64 - /* - * Strip off lower FIXED_ORDER bits. Note that those bits from - * dcdx, dcdy, eo are always 0 (by definition). - * c values, however, are not. This means that for every - * addition of the form c + n*dcdx the lower FIXED_ORDER bits will - * NOT change. And those bits are not relevant to the sign bit (which - * is only what we need!) that is, - * sign(c + n*dcdx) == sign((c >> FIXED_ORDER) + n*(dcdx >> FIXED_ORDER)) - * This means we can get away with using 32bit math for the most part. - * Only tricky part is the -1 adjustment for cdiff. - */ - int32_t dcdx = -plane[j].dcdx >> FIXED_ORDER; - int32_t dcdy = plane[j].dcdy >> FIXED_ORDER; - const int32_t cox = plane[j].eo >> FIXED_ORDER; - const int32_t ei = (dcdy + dcdx - cox) << 4; - const int32_t cox_s = cox << 4; - const int32_t co = (int32_t)(c[j] >> (int64_t)FIXED_ORDER) + cox_s; - int32_t cdiff; - /* - * Plausibility check to ensure the 32bit math works. - * Note that within a tile, the max we can move the edge function - * is essentially dcdx * TILE_SIZE + dcdy * TILE_SIZE. 
- * TILE_SIZE is 64, dcdx/dcdy are nominally 21 bit (for 8192 max size - * and 8 subpixel bits), I'd be happy with 2 bits more too (1 for - * increasing fb size to 16384, the required d3d11 value, another one - * because I'm not quite sure we can't be _just_ above the max value - * here). This gives us 30 bits max - hence if c would exceed that here - * that means the plane is either trivial reject for the whole tile - * (in which case the tri will not get binned), or trivial accept for - * the whole tile (in which case plane_mask will not include it). - */ -#if 0 - assert((c[j] >> (int64_t)FIXED_ORDER) > (int32_t)0xb0000000 && - (c[j] >> (int64_t)FIXED_ORDER) < (int32_t)0x3fffffff); -#endif - /* - * Note the fixup part is constant throughout the tile - thus could - * just calculate this and avoid _all_ 64bit math in rasterization - * (except exactly this fixup calc). - * In fact theoretically could move that even to setup, albeit that - * seems tricky (pre-bin certainly can have values larger than 32bit, - * and would need to communicate that fixup value through). - * And if we want to support msaa, we'd probably don't want to do the - * downscaling in setup in any case... 
- */ - cdiff = ei - cox_s + ((int32_t)((c[j] - 1) >> (int64_t)FIXED_ORDER) - - (int32_t)(c[j] >> (int64_t)FIXED_ORDER)); - dcdx <<= 4; - dcdy <<= 4; -#else const int32_t dcdx = -plane[j].dcdx << 4; const int32_t dcdy = plane[j].dcdy << 4; const int32_t cox = plane[j].eo << 4; - const int32_t ei = plane[j].dcdy - plane[j].dcdx - (int32_t)plane[j].eo; - const int32_t cio = (ei << 4) - 1; - int32_t co, cdiff; - co = c[j] + cox; - cdiff = cio - cox; -#endif - BUILD_MASKS(co, cdiff, + const int32_t ei = dcdy + dcdx - cox; + const int64_t co = (c[j] >> FIXED_ORDER) + cox; + const int32_t cdiff = ei - cox + + (int32_t)(((c[j] - 1) >> FIXED_ORDER) - (c[j] >> FIXED_ORDER)); +#ifdef RASTER_64 + build_masks(co, cdiff, dcdx, dcdy, &outmask, /* sign bits from c[i][0..15] + cox */ &partmask); /* sign bits from c[i][0..15] + cio */ +#else + build_masks_32((int32_t)co, cdiff, + dcdx, dcdy, + &outmask, /* sign bits from c[i][0..15] + cox */ + &partmask); /* sign bits from c[i][0..15] + cio */ +#endif } j++; @@ -317,9 +248,9 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, int64_t cx[NR_PLANES]; for (j = 0; j < NR_PLANES; j++) - cx[j] = (c[j] - - IMUL64(plane[j].dcdx, ix) - + IMUL64(plane[j].dcdy, iy)); + cx[j] = c[j] - + IMUL64_FIXED(plane[j].dcdx, ix) + + IMUL64_FIXED(plane[j].dcdy, iy); partial_mask &= ~(1 << i); @@ -375,21 +306,22 @@ TRI_16(struct lp_rasterizer_task *task, y += task->y; for (unsigned j = 0; j < NR_PLANES; j++) { - const int dcdx = -plane[j].dcdx * 4; - const int dcdy = plane[j].dcdy * 4; - __m128i xdcdy = _mm_set1_epi32(dcdy); + const int64_t c = plane[j].c + + IMUL64_FIXED(plane[j].dcdy, y) - + IMUL64_FIXED(plane[j].dcdx, x); + const int dcdx = -plane[j].dcdx << 2; + const int dcdy = plane[j].dcdy << 2; + const int cox = plane[j].eo << 2; + const int co = (int)(c >> FIXED_ORDER) + cox; + + __m128i xdcdy = _mm_set1_epi32(dcdy); cstep4[j][0] = _mm_setr_epi32(0, dcdx, dcdx*2, dcdx*3); cstep4[j][1] = _mm_add_epi32(cstep4[j][0], xdcdy); cstep4[j][2] = 
_mm_add_epi32(cstep4[j][1], xdcdy); cstep4[j][3] = _mm_add_epi32(cstep4[j][2], xdcdy); - { - const int c = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x; - const int cox = plane[j].eo * 4; - - outmask |= sign_bits4(cstep4[j], c + cox); - } + outmask |= sign_bits4(cstep4[j], co); } if (outmask == 0xffff) @@ -414,11 +346,11 @@ TRI_16(struct lp_rasterizer_task *task, partial_mask &= ~(1 << i); for (unsigned j = 0; j < NR_PLANES; j++) { - const int cx = (plane[j].c - 1 - - plane[j].dcdx * px - + plane[j].dcdy * py) * 4; + const int64_t cx = (plane[j].c - 1 + - IMUL64_FIXED(plane[j].dcdx, px) + + IMUL64_FIXED(plane[j].dcdy, py)) << 2; - mask &= ~sign_bits4(cstep4[j], cx); + mask &= ~sign_bits4(cstep4[j], (int)(cx >> FIXED_ORDER)); } if (mask) diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h index 0c68ae90d4e..9244d645f8c 100644 --- a/src/gallium/drivers/llvmpipe/lp_scene.h +++ b/src/gallium/drivers/llvmpipe/lp_scene.h @@ -293,7 +293,9 @@ lp_scene_alloc_aligned(struct lp_scene *scene, unsigned size, { uint8_t *data = block->data + block->used; - unsigned offset = (((uintptr_t)data + alignment - 1) & ~(alignment - 1)) - (uintptr_t)data; + unsigned offset = + (((uintptr_t)data + alignment - 1) & ~((uintptr_t)alignment - 1)) - + (uintptr_t)data; block->used += offset + size; return data + offset; } diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c index 1962e60ef78..0f8330371fb 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_setup.c @@ -1623,44 +1623,44 @@ lp_setup_add_scissor_planes(const struct u_rect *scissor, */ if (s_planes[0]) { int x0 = scissor->x0; - plane_s->dcdx = ~0U << 8; + plane_s->dcdx = ~0U; plane_s->dcdy = 0; - plane_s->c = x0 << 8; + plane_s->c = TO_FIXED64(x0); plane_s->c = -plane_s->c; /* flip sign */ /* * we need x0 to be exactly on plane edge, adjust by 1 since * this is an inclusive edge. 
*/ plane_s->c += 1; - plane_s->eo = 1 << 8; + plane_s->eo = 1; plane_s++; } if (s_planes[1]) { int x1 = scissor->x1 + 1; - plane_s->dcdx = 1 << 8; + plane_s->dcdx = 1; plane_s->dcdy = 0; - plane_s->c = x1 << 8; + plane_s->c = TO_FIXED64(x1); /* * no c adjustment, this edge should be exclusive. */ - plane_s->eo = 0 << 8; + plane_s->eo = 0; plane_s++; } if (s_planes[2]) { int y0 = scissor->y0; plane_s->dcdx = 0; - plane_s->dcdy = 1 << 8; - plane_s->c = y0 << 8; + plane_s->dcdy = 1; + plane_s->c = TO_FIXED64(y0); plane_s->c = -plane_s->c; /* flip sign */ plane_s->c += 1; - plane_s->eo = 1 << 8; + plane_s->eo = 1; plane_s++; } if (s_planes[3]) { int y1 = scissor->y1 + 1; plane_s->dcdx = 0; - plane_s->dcdy = ~0U << 8; - plane_s->c = y1 << 8; + plane_s->dcdy = ~0U; + plane_s->c = TO_FIXED64(y1); plane_s->eo = 0; plane_s++; } diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h index af7aa50c233..044d8659a48 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_context.h +++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h @@ -296,7 +296,7 @@ lp_setup_analyse_triangles(struct lp_setup_context *setup, bool lp_setup_bin_triangle(struct lp_setup_context *setup, struct lp_rast_triangle *tri, - bool use_32bits, + int max_szorig, bool opaque, const struct u_rect *bbox, int nr_planes, diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c index 948c30cd1ad..3d147b0e5ec 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_line.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c @@ -600,7 +600,6 @@ try_setup_line(struct lp_setup_context *setup, int max_szorig = ((bbox.x1 - (bbox.x0 & ~3)) | (bbox.y1 - (bbox.y0 & ~3))); - bool use_32bits = max_szorig <= MAX_FIXED_LENGTH32; bboxpos = bbox; /* Can safely discard negative regions: @@ -696,9 +695,6 @@ try_setup_line(struct lp_setup_context *setup, } } - plane[i].dcdx *= FIXED_ONE; - plane[i].dcdy *= FIXED_ONE; - /* find trivial 
reject offsets for each edge for a single-pixel * sized block. These will be scaled up at each recursive level to * match the active blocksize. Scaling in this way works best if @@ -713,7 +709,7 @@ try_setup_line(struct lp_setup_context *setup, lp_setup_add_scissor_planes(scissor, &plane[4], s_planes); } - return lp_setup_bin_triangle(setup, line, use_32bits, false, + return lp_setup_bin_triangle(setup, line, max_szorig, false, &bboxpos, nr_planes, viewport_index); } diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c index f9769adde35..b5d9560c099 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_point.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c @@ -434,10 +434,10 @@ try_setup_point(struct lp_setup_context *setup, bbox.y1 = bbox.y0 + int_width - 1; } - x[0] = (bbox.x0 - 1) << 8; - x[1] = (bbox.x1 + 1) << 8; - y[0] = (bbox.y0 - 1) << 8; - y[1] = (bbox.y1 + 1) << 8; + x[0] = (bbox.x0 - 1) << FIXED_ORDER; + x[1] = (bbox.x1 + 1) << FIXED_ORDER; + y[0] = (bbox.y0 - 1) << FIXED_ORDER; + y[1] = (bbox.y1 + 1) << FIXED_ORDER; } if (0) { @@ -513,24 +513,24 @@ try_setup_point(struct lp_setup_context *setup, plane = GET_PLANES(point); - plane[0].dcdx = ~0U << 8; + plane[0].dcdx = ~0U; plane[0].dcdy = 0; - plane[0].c = -MAX2(x[0], bbox.x0 << 8); - plane[0].eo = 1 << 8; + plane[0].c = -MAX2(x[0], bbox.x0 << FIXED_ORDER); + plane[0].eo = 1; - plane[1].dcdx = 1 << 8; + plane[1].dcdx = 1; plane[1].dcdy = 0; - plane[1].c = MIN2(x[1], (bbox.x1 + 1) << 8); + plane[1].c = MIN2(x[1], (bbox.x1 + 1) << FIXED_ORDER); plane[1].eo = 0; plane[2].dcdx = 0; - plane[2].dcdy = 1 << 8; - plane[2].c = -MAX2(y[0], (bbox.y0 << 8) - adj); - plane[2].eo = 1 << 8; + plane[2].dcdy = 1; + plane[2].c = -MAX2(y[0], (bbox.y0 << FIXED_ORDER) - adj); + plane[2].eo = 1; plane[3].dcdx = 0; - plane[3].dcdy = ~0U << 8; - plane[3].c = MIN2(y[1], (bbox.y1 + 1) << 8); + plane[3].dcdy = ~0U; + plane[3].c = MIN2(y[1], (bbox.y1 + 1) << FIXED_ORDER); 
plane[3].eo = 0; if (!setup->legacy_points) { @@ -544,9 +544,7 @@ try_setup_point(struct lp_setup_context *setup, int max_szorig = ((bbox.x1 - (bbox.x0 & ~3)) | (bbox.y1 - (bbox.y0 & ~3))); - bool use_32bits = max_szorig <= MAX_FIXED_LENGTH32; - - return lp_setup_bin_triangle(setup, point, use_32bits, + return lp_setup_bin_triangle(setup, point, max_szorig, setup->fs.current.variant->opaque, &bbox, nr_planes, viewport_index); diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index 1220763489c..ae0fba4df94 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -199,6 +199,7 @@ lp_rast_tri_tab[MAX_PLANES+1] = { LP_RAST_OP_TRIANGLE_8 }; + static unsigned lp_rast_32_tri_tab[MAX_PLANES+1] = { 0, /* should be impossible */ @@ -331,10 +332,9 @@ do_triangle_ccw(struct lp_setup_context *setup, int max_szorig = ((bbox.x1 - (bbox.x0 & ~3)) | (bbox.y1 - (bbox.y0 & ~3))); - bool use_32bits = max_szorig <= MAX_FIXED_LENGTH32; #if defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN - bool pwr8_limit_check = (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 && - (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32; + bool pwr8_limit_check = (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32_BLOCK && + (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32_BLOCK; #endif /* Can safely discard negative regions, but need to keep hold of @@ -530,11 +530,6 @@ do_triangle_ccw(struct lp_setup_context *setup, * c = _mm_sub_epi32(c, c_dec); */ - /* Scale up to match c: - */ - dcdx = _mm_slli_epi32(dcdx, FIXED_ORDER); - dcdy = _mm_slli_epi32(dcdy, FIXED_ORDER); - /* * Calculate trivial reject values: * Note eo cannot overflow even if dcdx/dcdy would already have @@ -574,8 +569,8 @@ do_triangle_ccw(struct lp_setup_context *setup, * XXX this code is effectively disabled for all practical purposes, * as the allowed fb size is tiny if FIXED_ORDER is 8. 
*/ - if (setup->fb.width <= MAX_FIXED_LENGTH32 && - setup->fb.height <= MAX_FIXED_LENGTH32 && + if (setup->fb.width <= MAX_FIXED_LENGTH32_BLOCK && + setup->fb.height <= MAX_FIXED_LENGTH32_BLOCK && pwr8_limit_check) { unsigned int bottom_edge; __m128i vertx, verty; @@ -634,11 +629,6 @@ do_triangle_ccw(struct lp_setup_context *setup, c = vec_add_epi32(c, c_inc); - /* Scale up to match c: - */ - dcdx = vec_slli_epi32(dcdx, FIXED_ORDER); - dcdy = vec_slli_epi32(dcdy, FIXED_ORDER); - /* Calculate trivial reject values: */ eo = vec_sub_epi32(vec_andnot_si128(dcdy_neg_mask, dcdy), @@ -701,13 +691,6 @@ do_triangle_ccw(struct lp_setup_context *setup, } } - /* Scale up to match c: - */ - assert((plane[i].dcdx << FIXED_ORDER) >> FIXED_ORDER == plane[i].dcdx); - assert((plane[i].dcdy << FIXED_ORDER) >> FIXED_ORDER == plane[i].dcdy); - plane[i].dcdx <<= FIXED_ORDER; - plane[i].dcdy <<= FIXED_ORDER; - /* find trivial reject offsets for each edge for a single-pixel * sized block. These will be scaled up at each recursive level to * match the active blocksize. 
Scaling in this way works best if @@ -743,7 +726,7 @@ do_triangle_ccw(struct lp_setup_context *setup, lp_setup_add_scissor_planes(scissor, &plane[3], s_planes); } - return lp_setup_bin_triangle(setup, tri, use_32bits, + return lp_setup_bin_triangle(setup, tri, max_szorig, check_opaque(setup, v0, v1, v2), &bbox, nr_planes, viewport_index); } @@ -779,7 +762,7 @@ floor_pot(uint32_t n) bool lp_setup_bin_triangle(struct lp_setup_context *setup, struct lp_rast_triangle *tri, - bool use_32bits, + int max_szorig, bool opaque, const struct u_rect *bbox, int nr_planes, @@ -788,6 +771,9 @@ lp_setup_bin_triangle(struct lp_setup_context *setup, struct lp_scene *scene = setup->scene; unsigned cmd; + const bool use_32bits_block = max_szorig <= MAX_FIXED_LENGTH32_BLOCK; + const bool use_32bits_tile = max_szorig <= MAX_FIXED_LENGTH32_TILE; + /* What is the largest power-of-two boundary this triangle crosses: */ const int dx = floor_pot((bbox->x0 ^ bbox->x1) | @@ -836,7 +822,7 @@ lp_setup_bin_triangle(struct lp_setup_context *setup, if (setup->multisample) cmd = LP_RAST_OP_MS_TRIANGLE_3_4; else - cmd = use_32bits ? LP_RAST_OP_TRIANGLE_32_3_4 : LP_RAST_OP_TRIANGLE_3_4; + cmd = use_32bits_block ? LP_RAST_OP_TRIANGLE_32_3_4 : LP_RAST_OP_TRIANGLE_3_4; return lp_scene_bin_cmd_with_state(scene, ix0, iy0, setup->fs.stored, cmd, lp_rast_arg_triangle_contained(tri, px, py)); @@ -860,7 +846,7 @@ lp_setup_bin_triangle(struct lp_setup_context *setup, if (setup->multisample) cmd = LP_RAST_OP_MS_TRIANGLE_3_16; else - cmd = use_32bits ? LP_RAST_OP_TRIANGLE_32_3_16 : LP_RAST_OP_TRIANGLE_3_16; + cmd = use_32bits_block ? LP_RAST_OP_TRIANGLE_32_3_16 : LP_RAST_OP_TRIANGLE_3_16; return lp_scene_bin_cmd_with_state(scene, ix0, iy0, setup->fs.stored, cmd, lp_rast_arg_triangle_contained(tri, px, py)); @@ -875,7 +861,7 @@ lp_setup_bin_triangle(struct lp_setup_context *setup, if (setup->multisample) cmd = LP_RAST_OP_MS_TRIANGLE_4_16; else - cmd = use_32bits ? 
LP_RAST_OP_TRIANGLE_32_4_16 : LP_RAST_OP_TRIANGLE_4_16; + cmd = use_32bits_block ? LP_RAST_OP_TRIANGLE_32_4_16 : LP_RAST_OP_TRIANGLE_4_16; return lp_scene_bin_cmd_with_state(scene, ix0, iy0, setup->fs.stored, cmd, lp_rast_arg_triangle_contained(tri, px, py)); @@ -886,7 +872,7 @@ lp_setup_bin_triangle(struct lp_setup_context *setup, if (setup->multisample) cmd = lp_rast_ms_tri_tab[nr_planes]; else - cmd = use_32bits ? lp_rast_32_tri_tab[nr_planes] : lp_rast_tri_tab[nr_planes]; + cmd = use_32bits_tile ? lp_rast_32_tri_tab[nr_planes] : lp_rast_tri_tab[nr_planes]; return lp_scene_bin_cmd_with_state(scene, ix0, iy0, setup->fs.stored, cmd, lp_rast_arg_triangle(tri, @@ -907,16 +893,16 @@ lp_setup_bin_triangle(struct lp_setup_context *setup, for (int i = 0; i < nr_planes; i++) { c[i] = (plane[i].c + - IMUL64(plane[i].dcdy, iy0) * TILE_SIZE - - IMUL64(plane[i].dcdx, ix0) * TILE_SIZE); + IMUL64_FIXED(plane[i].dcdy, iy0) * TILE_SIZE - + IMUL64_FIXED(plane[i].dcdx, ix0) * TILE_SIZE); - ei[i] = (plane[i].dcdy - - plane[i].dcdx - - (int64_t)plane[i].eo) << TILE_ORDER; + ei[i] = (TO_FIXED64(plane[i].dcdy) - + TO_FIXED64(plane[i].dcdx) - + TO_FIXED64(plane[i].eo)) << TILE_ORDER; - eo[i] = (int64_t)plane[i].eo << TILE_ORDER; - xstep[i] = -(((int64_t)plane[i].dcdx) << TILE_ORDER); - ystep[i] = ((int64_t)plane[i].dcdy) << TILE_ORDER; + eo[i] = TO_FIXED64(plane[i].eo) << TILE_ORDER; + xstep[i] = -(TO_FIXED64(plane[i].dcdx) << TILE_ORDER); + ystep[i] = TO_FIXED64(plane[i].dcdy) << TILE_ORDER; } tri->inputs.is_blit = lp_setup_is_blit(setup, &tri->inputs); @@ -958,7 +944,7 @@ lp_setup_bin_triangle(struct lp_setup_context *setup, if (setup->multisample) cmd = lp_rast_ms_tri_tab[count]; else - cmd = use_32bits ? lp_rast_32_tri_tab[count] : lp_rast_tri_tab[count]; + cmd = use_32bits_tile ? lp_rast_32_tri_tab[count] : lp_rast_tri_tab[count]; if (!lp_scene_bin_cmd_with_state(scene, x, y, setup->fs.stored, cmd, lp_rast_arg_triangle(tri, partial)))