From 5b4c43d98556c5a4806757513bcb196a724518c5 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Sun, 5 Sep 2010 13:17:43 +0100 Subject: [PATCH 001/540] llvmpipe: use llvm for attribute interpolant calculation Basically no change relative to hard-coded version, but this will be useful for other changes later. --- src/gallium/drivers/llvmpipe/SConscript | 5 +- src/gallium/drivers/llvmpipe/lp_bld_interp.h | 26 +- src/gallium/drivers/llvmpipe/lp_context.c | 3 + src/gallium/drivers/llvmpipe/lp_context.h | 10 +- src/gallium/drivers/llvmpipe/lp_flush.h | 1 + src/gallium/drivers/llvmpipe/lp_limits.h | 10 + src/gallium/drivers/llvmpipe/lp_setup.c | 19 +- src/gallium/drivers/llvmpipe/lp_setup.h | 29 +- src/gallium/drivers/llvmpipe/lp_setup_coef.c | 279 ------- src/gallium/drivers/llvmpipe/lp_setup_coef.h | 64 -- .../drivers/llvmpipe/lp_setup_coef_intrin.c | 228 ------ .../drivers/llvmpipe/lp_setup_context.h | 12 +- src/gallium/drivers/llvmpipe/lp_setup_line.c | 20 +- src/gallium/drivers/llvmpipe/lp_setup_point.c | 13 +- src/gallium/drivers/llvmpipe/lp_setup_tri.c | 42 +- src/gallium/drivers/llvmpipe/lp_state.h | 3 + .../drivers/llvmpipe/lp_state_derived.c | 69 +- src/gallium/drivers/llvmpipe/lp_state_fs.c | 80 +- src/gallium/drivers/llvmpipe/lp_state_fs.h | 4 + src/gallium/drivers/llvmpipe/lp_state_setup.c | 768 ++++++++++++++++++ src/gallium/drivers/llvmpipe/lp_state_setup.h | 81 ++ .../llvmpipe/lp_state_setup_fallback.c | 265 ++++++ 22 files changed, 1315 insertions(+), 716 deletions(-) delete mode 100644 src/gallium/drivers/llvmpipe/lp_setup_coef.c delete mode 100644 src/gallium/drivers/llvmpipe/lp_setup_coef.h delete mode 100644 src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c create mode 100644 src/gallium/drivers/llvmpipe/lp_state_setup.c create mode 100644 src/gallium/drivers/llvmpipe/lp_state_setup.h create mode 100644 src/gallium/drivers/llvmpipe/lp_state_setup_fallback.c diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript index 650435f0f19..6ddce659206 100644 --- a/src/gallium/drivers/llvmpipe/SConscript +++ b/src/gallium/drivers/llvmpipe/SConscript @@ -61,16 +61,17 @@ llvmpipe = env.ConvenienceLibrary( 'lp_scene_queue.c', 'lp_screen.c', 'lp_setup.c', + 'lp_setup_debug.c', 'lp_setup_line.c', 'lp_setup_point.c', 'lp_setup_tri.c', - 'lp_setup_coef.c', - 'lp_setup_coef_intrin.c', 'lp_setup_vbuf.c', 'lp_state_blend.c', 'lp_state_clip.c', 'lp_state_derived.c', 'lp_state_fs.c', + 'lp_state_setup.c', + 'lp_state_setup_fallback.c', 'lp_state_gs.c', 'lp_state_rasterizer.c', 'lp_state_sampler.c', diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h index 3054030f739..37479fca9dc 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h +++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h @@ -46,7 +46,31 @@ #include "tgsi/tgsi_exec.h" -#include "lp_setup.h" +/** + * Describes how to compute the interpolation coefficients (a0, dadx, dady) + * from the vertices passed into our triangle/line/point functions by the + * draw module. + * + * Vertices are treated as an array of float[4] values, indexed by + * src_index. + * + * LP_INTERP_COLOR is translated to either LP_INTERP_CONSTANT or + * LINEAR depending on flatshade state. + */ +enum lp_interp { + LP_INTERP_CONSTANT, + LP_INTERP_COLOR, + LP_INTERP_LINEAR, + LP_INTERP_PERSPECTIVE, + LP_INTERP_POSITION, + LP_INTERP_FACING +}; + +struct lp_shader_input { + ushort interp:4; /* enum lp_interp */ + ushort usage_mask:4; /* bitmask of TGSI_WRITEMASK_x flags */ + ushort src_index:8; /* where to find values in incoming vertices */ +}; struct lp_build_interp_soa_context diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c index 39f2c6085ef..763432ed712 100644 --- a/src/gallium/drivers/llvmpipe/lp_context.c +++ b/src/gallium/drivers/llvmpipe/lp_context.c @@ -82,6 +82,8 @@ static void llvmpipe_destroy( struct pipe_context *pipe ) } } + lp_delete_setup_variants(llvmpipe); + align_free( llvmpipe ); } @@ -108,6 +110,7 @@ llvmpipe_create_context( struct pipe_screen *screen, void *priv ) memset(llvmpipe, 0, sizeof *llvmpipe); make_empty_list(&llvmpipe->fs_variants_list); + make_empty_list(&llvmpipe->setup_variants_list); llvmpipe->pipe.winsys = screen->winsys; llvmpipe->pipe.screen = screen; diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h index 34fa20e204a..db09c95b272 100644 --- a/src/gallium/drivers/llvmpipe/lp_context.h +++ b/src/gallium/drivers/llvmpipe/lp_context.h @@ -39,6 +39,7 @@ #include "lp_jit.h" #include "lp_setup.h" #include "lp_state_fs.h" +#include "lp_state_setup.h" struct llvmpipe_vbuf_render; @@ -48,6 +49,7 @@ struct lp_fragment_shader; struct lp_vertex_shader; struct lp_blend_state; struct lp_setup_context; +struct lp_setup_variant; struct lp_velems_state; struct llvmpipe_context { @@ -105,12 +107,9 @@ struct llvmpipe_context { /** Which vertex shader output slot contains point size */ int psize_slot; - /** Fragment shader input interpolation info */ - unsigned num_inputs; - struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS]; - /** The tiling engine */ struct lp_setup_context *setup; + struct lp_setup_variant setup_variant; /** The primitive drawing context */ struct draw_context *draw; @@ -120,6 +119,9 @@ struct llvmpipe_context { struct lp_fs_variant_list_item fs_variants_list; unsigned nr_fs_variants; + + struct lp_setup_variant_list_item setup_variants_list; + unsigned nr_setup_variants; }; diff --git a/src/gallium/drivers/llvmpipe/lp_flush.h b/src/gallium/drivers/llvmpipe/lp_flush.h index bb538b2bd83..3626ce4a86c 100644 --- a/src/gallium/drivers/llvmpipe/lp_flush.h +++ b/src/gallium/drivers/llvmpipe/lp_flush.h @@ -32,6 +32,7 @@ struct pipe_context; struct pipe_fence_handle; +struct pipe_resource; void llvmpipe_flush(struct pipe_context *pipe, diff --git a/src/gallium/drivers/llvmpipe/lp_limits.h b/src/gallium/drivers/llvmpipe/lp_limits.h index d1c431475d8..2538164ffaa 100644 --- a/src/gallium/drivers/llvmpipe/lp_limits.h +++ b/src/gallium/drivers/llvmpipe/lp_limits.h @@ -72,4 +72,14 @@ */ #define LP_MAX_SHADER_VARIANTS 1024 +/** + * Max number of setup variants that will be kept around. + * + * These are determined by the combination of the fragment shader + * input signature and a small amount of rasterization state (eg + * flatshading). It is likely that many active fragment shaders will + * share the same setup variant. + */ +#define LP_MAX_SETUP_VARIANTS 64 + #endif /* LP_LIMITS_H */ diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c index 6674d281d1e..3854fd70af7 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_setup.c @@ -500,14 +500,12 @@ lp_setup_set_point_state( struct lp_setup_context *setup, } void -lp_setup_set_fs_inputs( struct lp_setup_context *setup, - const struct lp_shader_input *input, - unsigned nr ) +lp_setup_set_setup_variant( struct lp_setup_context *setup, + const struct lp_setup_variant *variant) { - LP_DBG(DEBUG_SETUP, "%s %p %u\n", __FUNCTION__, (void *) input, nr); - - memcpy( setup->fs.input, input, nr * sizeof input[0] ); - setup->fs.nr_inputs = nr; + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + setup->setup.variant = variant; } void @@ -863,6 +861,13 @@ lp_setup_update_state( struct lp_setup_context *setup, setup->psize = lp->psize_slot; assert(lp->dirty == 0); + + assert(lp->setup_variant.key.size == + setup->setup.variant->key.size); + + assert(memcmp(&lp->setup_variant.key, + &setup->setup.variant->key, + setup->setup.variant->key.size) == 0); } if (update_scene) diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h index b94061b7d49..19078ebbcb4 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.h +++ b/src/gallium/drivers/llvmpipe/lp_setup.h @@ -33,28 +33,6 @@ struct draw_context; struct vertex_info; -enum lp_interp { - LP_INTERP_CONSTANT, - LP_INTERP_LINEAR, - LP_INTERP_PERSPECTIVE, - LP_INTERP_POSITION, - LP_INTERP_FACING -}; - - -/** - * Describes how to compute the interpolation coefficients (a0, dadx, dady) - * from the vertices passed into our triangle/line/point functions by the - * draw module. - * - * Vertices are treated as an array of float[4] values, indexed by - * src_index. - */ -struct lp_shader_input { - enum lp_interp interp; /* how to interpolate values */ - unsigned src_index; /* where to find values in incoming vertices */ - unsigned usage_mask; /* bitmask of TGSI_WRITEMASK_x flags */ -}; struct pipe_resource; struct pipe_query; @@ -66,7 +44,7 @@ struct lp_fragment_shader_variant; struct lp_jit_context; struct llvmpipe_query; struct pipe_fence_handle; - +struct lp_setup_variant; struct lp_setup_context * lp_setup_create( struct pipe_context *pipe, @@ -110,9 +88,8 @@ lp_setup_set_point_state( struct lp_setup_context *setup, uint sprite); void -lp_setup_set_fs_inputs( struct lp_setup_context *setup, - const struct lp_shader_input *interp, - unsigned nr ); +lp_setup_set_setup_variant( struct lp_setup_context *setup, + const struct lp_setup_variant *variant ); void lp_setup_set_fs_variant( struct lp_setup_context *setup, diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.c b/src/gallium/drivers/llvmpipe/lp_setup_coef.c deleted file mode 100644 index 8dc2688ddb6..00000000000 --- a/src/gallium/drivers/llvmpipe/lp_setup_coef.c +++ /dev/null @@ -1,279 +0,0 @@ -/************************************************************************** - * - * Copyright 2010, VMware. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -/* - * Binning code for triangles - */ - -#include "util/u_math.h" -#include "util/u_memory.h" -#include "lp_perf.h" -#include "lp_setup_context.h" -#include "lp_setup_coef.h" -#include "lp_rast.h" -#include "lp_state_fs.h" - -#if !defined(PIPE_ARCH_SSE) - -/** - * Compute a0 for a constant-valued coefficient (GL_FLAT shading). - */ -static void constant_coef( struct lp_rast_shader_inputs *inputs, - unsigned slot, - const float value, - unsigned i ) -{ - inputs->a0[slot][i] = value; - inputs->dadx[slot][i] = 0.0f; - inputs->dady[slot][i] = 0.0f; -} - - - -static void linear_coef( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - unsigned vert_attr, - unsigned i) -{ - float a0 = info->v0[vert_attr][i]; - float a1 = info->v1[vert_attr][i]; - float a2 = info->v2[vert_attr][i]; - - float da01 = a0 - a1; - float da20 = a2 - a0; - float dadx = (da01 * info->dy20_ooa - info->dy01_ooa * da20); - float dady = (da20 * info->dx01_ooa - info->dx20_ooa * da01); - - inputs->dadx[slot][i] = dadx; - inputs->dady[slot][i] = dady; - - /* calculate a0 as the value which would be sampled for the - * fragment at (0,0), taking into account that we want to sample at - * pixel centers, in other words (0.5, 0.5). - * - * this is neat but unfortunately not a good way to do things for - * triangles with very large values of dadx or dady as it will - * result in the subtraction and re-addition from a0 of a very - * large number, which means we'll end up loosing a lot of the - * fractional bits and precision from a0. the way to fix this is - * to define a0 as the sample at a pixel center somewhere near vmin - * instead - i'll switch to this later. - */ - inputs->a0[slot][i] = a0 - (dadx * info->x0_center + - dady * info->y0_center); -} - - -/** - * Compute a0, dadx and dady for a perspective-corrected interpolant, - * for a triangle. - * We basically multiply the vertex value by 1/w before computing - * the plane coefficients (a0, dadx, dady). - * Later, when we compute the value at a particular fragment position we'll - * divide the interpolated value by the interpolated W at that fragment. - */ -static void perspective_coef( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - unsigned vert_attr, - unsigned i) -{ - /* premultiply by 1/w (v[0][3] is always 1/w): - */ - float a0 = info->v0[vert_attr][i] * info->v0[0][3]; - float a1 = info->v1[vert_attr][i] * info->v1[0][3]; - float a2 = info->v2[vert_attr][i] * info->v2[0][3]; - float da01 = a0 - a1; - float da20 = a2 - a0; - float dadx = da01 * info->dy20_ooa - info->dy01_ooa * da20; - float dady = da20 * info->dx01_ooa - info->dx20_ooa * da01; - - inputs->dadx[slot][i] = dadx; - inputs->dady[slot][i] = dady; - inputs->a0[slot][i] = a0 - (dadx * info->x0_center + - dady * info->y0_center); -} - - -/** - * Special coefficient setup for gl_FragCoord. - * X and Y are trivial - * Z and W are copied from position_coef which should have already been computed. - * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask. - */ -static void -setup_fragcoord_coef(struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - unsigned usage_mask) -{ - /*X*/ - if (usage_mask & TGSI_WRITEMASK_X) { - inputs->a0[slot][0] = 0.0; - inputs->dadx[slot][0] = 1.0; - inputs->dady[slot][0] = 0.0; - } - - /*Y*/ - if (usage_mask & TGSI_WRITEMASK_Y) { - inputs->a0[slot][1] = 0.0; - inputs->dadx[slot][1] = 0.0; - inputs->dady[slot][1] = 1.0; - } - - /*Z*/ - if (usage_mask & TGSI_WRITEMASK_Z) { - linear_coef(inputs, info, slot, 0, 2); - } - - /*W*/ - if (usage_mask & TGSI_WRITEMASK_W) { - linear_coef(inputs, info, slot, 0, 3); - } -} - - -/** - * Setup the fragment input attribute with the front-facing value. - * \param frontface is the triangle front facing? - */ -static void setup_facing_coef( struct lp_rast_shader_inputs *inputs, - unsigned slot, - boolean frontface, - unsigned usage_mask) -{ - /* convert TRUE to 1.0 and FALSE to -1.0 */ - if (usage_mask & TGSI_WRITEMASK_X) - constant_coef( inputs, slot, 2.0f * frontface - 1.0f, 0 ); - - if (usage_mask & TGSI_WRITEMASK_Y) - constant_coef( inputs, slot, 0.0f, 1 ); /* wasted */ - - if (usage_mask & TGSI_WRITEMASK_Z) - constant_coef( inputs, slot, 0.0f, 2 ); /* wasted */ - - if (usage_mask & TGSI_WRITEMASK_W) - constant_coef( inputs, slot, 0.0f, 3 ); /* wasted */ -} - - -/** - * Compute the tri->coef[] array dadx, dady, a0 values. - */ -void lp_setup_tri_coef( struct lp_setup_context *setup, - struct lp_rast_shader_inputs *inputs, - const float (*v0)[4], - const float (*v1)[4], - const float (*v2)[4], - boolean frontfacing) -{ - unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; - unsigned slot; - unsigned i; - struct lp_tri_info info; - float dx01 = v0[0][0] - v1[0][0]; - float dy01 = v0[0][1] - v1[0][1]; - float dx20 = v2[0][0] - v0[0][0]; - float dy20 = v2[0][1] - v0[0][1]; - float oneoverarea = 1.0f / (dx01 * dy20 - dx20 * dy01); - - info.v0 = v0; - info.v1 = v1; - info.v2 = v2; - info.frontfacing = frontfacing; - info.x0_center = v0[0][0] - setup->pixel_offset; - info.y0_center = v0[0][1] - setup->pixel_offset; - info.dx01_ooa = dx01 * oneoverarea; - info.dx20_ooa = dx20 * oneoverarea; - info.dy01_ooa = dy01 * oneoverarea; - info.dy20_ooa = dy20 * oneoverarea; - - - /* setup interpolation for all the remaining attributes: - */ - for (slot = 0; slot < setup->fs.nr_inputs; slot++) { - unsigned vert_attr = setup->fs.input[slot].src_index; - unsigned usage_mask = setup->fs.input[slot].usage_mask; - - switch (setup->fs.input[slot].interp) { - case LP_INTERP_CONSTANT: - if (setup->flatshade_first) { - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - constant_coef(inputs, slot+1, info.v0[vert_attr][i], i); - } - else { - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - constant_coef(inputs, slot+1, info.v2[vert_attr][i], i); - } - break; - - case LP_INTERP_LINEAR: - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - linear_coef(inputs, &info, slot+1, vert_attr, i); - break; - - case LP_INTERP_PERSPECTIVE: - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - perspective_coef(inputs, &info, slot+1, vert_attr, i); - fragcoord_usage_mask |= TGSI_WRITEMASK_W; - break; - - case LP_INTERP_POSITION: - /* - * The generated pixel interpolators will pick up the coeffs from - * slot 0, so all need to ensure that the usage mask is covers all - * usages. - */ - fragcoord_usage_mask |= usage_mask; - break; - - case LP_INTERP_FACING: - setup_facing_coef(inputs, slot+1, info.frontfacing, usage_mask); - break; - - default: - assert(0); - } - } - - /* The internal position input is in slot zero: - */ - setup_fragcoord_coef(inputs, &info, 0, fragcoord_usage_mask); -} - -#else -extern void lp_setup_coef_dummy(void); -void lp_setup_coef_dummy(void) -{ -} - -#endif diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.h b/src/gallium/drivers/llvmpipe/lp_setup_coef.h deleted file mode 100644 index 87a3255ccc6..00000000000 --- a/src/gallium/drivers/llvmpipe/lp_setup_coef.h +++ /dev/null @@ -1,64 +0,0 @@ -/************************************************************************** - * - * Copyright 2010 VMware, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - -/** - * The setup code is concerned with point/line/triangle setup and - * putting commands/data into the bins. - */ - - -#ifndef LP_SETUP_COEF_H -#define LP_SETUP_COEF_H - - -struct lp_tri_info { - - float x0_center; - float y0_center; - - /* turn these into an aligned float[4] */ - float dy01_ooa; - float dy20_ooa; - float dx01_ooa; - float dx20_ooa; - - const float (*v0)[4]; - const float (*v1)[4]; - const float (*v2)[4]; - - boolean frontfacing; /* remove eventually */ -}; - -void lp_setup_tri_coef( struct lp_setup_context *setup, - struct lp_rast_shader_inputs *inputs, - const float (*v0)[4], - const float (*v1)[4], - const float (*v2)[4], - boolean frontfacing); - -#endif diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c b/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c deleted file mode 100644 index 3742fd672b2..00000000000 --- a/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c +++ /dev/null @@ -1,228 +0,0 @@ -/************************************************************************** - * - * Copyright 2010 VMware. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -/* - * Binning code for triangles - */ - -#include "util/u_math.h" -#include "util/u_memory.h" -#include "lp_perf.h" -#include "lp_setup_context.h" -#include "lp_setup_coef.h" -#include "lp_rast.h" - -#if defined(PIPE_ARCH_SSE) -#include - - -static void constant_coef4( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - const float *attr) -{ - *(__m128 *)inputs->a0[slot] = *(__m128 *)attr; - *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0); - *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0); -} - - - -/** - * Setup the fragment input attribute with the front-facing value. - * \param frontface is the triangle front facing? - */ -static void setup_facing_coef( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot ) -{ - /* XXX: just pass frontface directly to the shader, don't bother - * treating it as an input. - */ - __m128 a0 = _mm_setr_ps(info->frontfacing ? 1.0 : -1.0, - 0, 0, 0); - - *(__m128 *)inputs->a0[slot] = a0; - *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0); - *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0); -} - - - -static void calc_coef4( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - __m128 a0, - __m128 a1, - __m128 a2) -{ - __m128 da01 = _mm_sub_ps(a0, a1); - __m128 da20 = _mm_sub_ps(a2, a0); - - __m128 da01_dy20_ooa = _mm_mul_ps(da01, _mm_set1_ps(info->dy20_ooa)); - __m128 da20_dy01_ooa = _mm_mul_ps(da20, _mm_set1_ps(info->dy01_ooa)); - __m128 dadx = _mm_sub_ps(da01_dy20_ooa, da20_dy01_ooa); - - __m128 da01_dx20_ooa = _mm_mul_ps(da01, _mm_set1_ps(info->dx20_ooa)); - __m128 da20_dx01_ooa = _mm_mul_ps(da20, _mm_set1_ps(info->dx01_ooa)); - __m128 dady = _mm_sub_ps(da20_dx01_ooa, da01_dx20_ooa); - - __m128 dadx_x0 = _mm_mul_ps(dadx, _mm_set1_ps(info->x0_center)); - __m128 dady_y0 = _mm_mul_ps(dady, _mm_set1_ps(info->y0_center)); - __m128 attr_v0 = _mm_add_ps(dadx_x0, dady_y0); - __m128 attr_0 = _mm_sub_ps(a0, attr_v0); - - *(__m128 *)inputs->a0[slot] = attr_0; - *(__m128 *)inputs->dadx[slot] = dadx; - *(__m128 *)inputs->dady[slot] = dady; -} - - -static void linear_coef( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - unsigned vert_attr) -{ - __m128 a0 = *(const __m128 *)info->v0[vert_attr]; - __m128 a1 = *(const __m128 *)info->v1[vert_attr]; - __m128 a2 = *(const __m128 *)info->v2[vert_attr]; - - calc_coef4(inputs, info, slot, a0, a1, a2); -} - - - -/** - * Compute a0, dadx and dady for a perspective-corrected interpolant, - * for a triangle. - * We basically multiply the vertex value by 1/w before computing - * the plane coefficients (a0, dadx, dady). - * Later, when we compute the value at a particular fragment position we'll - * divide the interpolated value by the interpolated W at that fragment. - */ -static void perspective_coef( struct lp_rast_shader_inputs *inputs, - const struct lp_tri_info *info, - unsigned slot, - unsigned vert_attr) -{ - /* premultiply by 1/w (v[0][3] is always 1/w): - */ - __m128 a0 = *(const __m128 *)info->v0[vert_attr]; - __m128 a1 = *(const __m128 *)info->v1[vert_attr]; - __m128 a2 = *(const __m128 *)info->v2[vert_attr]; - - __m128 a0_oow = _mm_mul_ps(a0, _mm_set1_ps(info->v0[0][3])); - __m128 a1_oow = _mm_mul_ps(a1, _mm_set1_ps(info->v1[0][3])); - __m128 a2_oow = _mm_mul_ps(a2, _mm_set1_ps(info->v2[0][3])); - - calc_coef4(inputs, info, slot, a0_oow, a1_oow, a2_oow); -} - - - - - -/** - * Compute the inputs-> dadx, dady, a0 values. - */ -void lp_setup_tri_coef( struct lp_setup_context *setup, - struct lp_rast_shader_inputs *inputs, - const float (*v0)[4], - const float (*v1)[4], - const float (*v2)[4], - boolean frontfacing) -{ - unsigned slot; - struct lp_tri_info info; - float dx01 = v0[0][0] - v1[0][0]; - float dy01 = v0[0][1] - v1[0][1]; - float dx20 = v2[0][0] - v0[0][0]; - float dy20 = v2[0][1] - v0[0][1]; - float oneoverarea = 1.0f / (dx01 * dy20 - dx20 * dy01); - - info.v0 = v0; - info.v1 = v1; - info.v2 = v2; - info.frontfacing = frontfacing; - info.x0_center = v0[0][0] - setup->pixel_offset; - info.y0_center = v0[0][1] - setup->pixel_offset; - info.dx01_ooa = dx01 * oneoverarea; - info.dx20_ooa = dx20 * oneoverarea; - info.dy01_ooa = dy01 * oneoverarea; - info.dy20_ooa = dy20 * oneoverarea; - - - /* The internal position input is in slot zero: - */ - linear_coef(inputs, &info, 0, 0); - - /* setup interpolation for all the remaining attributes: - */ - for (slot = 0; slot < setup->fs.nr_inputs; slot++) { - unsigned vert_attr = setup->fs.input[slot].src_index; - - switch (setup->fs.input[slot].interp) { - case LP_INTERP_CONSTANT: - if (setup->flatshade_first) { - constant_coef4(inputs, &info, slot+1, info.v0[vert_attr]); - } - else { - constant_coef4(inputs, &info, slot+1, info.v2[vert_attr]); - } - break; - - case LP_INTERP_LINEAR: - linear_coef(inputs, &info, slot+1, vert_attr); - break; - - case LP_INTERP_PERSPECTIVE: - perspective_coef(inputs, &info, slot+1, vert_attr); - break; - - case LP_INTERP_POSITION: - /* - * The generated pixel interpolators will pick up the coeffs from - * slot 0. - */ - break; - - case LP_INTERP_FACING: - setup_facing_coef(inputs, &info, slot+1); - break; - - default: - assert(0); - } - } -} - -#else -extern void lp_setup_coef_dummy(void); -void lp_setup_coef_dummy(void) -{ -} -#endif diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h index 80b356476ab..ff3b69afa83 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_context.h +++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h @@ -39,6 +39,7 @@ #include "lp_rast.h" #include "lp_tile_soa.h" /* for TILE_SIZE */ #include "lp_scene.h" +#include "lp_bld_interp.h" /* for struct lp_shader_input */ #include "draw/draw_vbuf.h" #include "util/u_rect.h" @@ -49,6 +50,8 @@ #define LP_SETUP_NEW_SCISSOR 0x08 +struct lp_setup_variant; + /** Max number of scenes */ #define MAX_SCENES 2 @@ -118,9 +121,6 @@ struct lp_setup_context } state; struct { - struct lp_shader_input input[PIPE_MAX_ATTRIBS]; - unsigned nr_inputs; - const struct lp_rast_state *stored; /**< what's in the scene */ struct lp_rast_state current; /**< currently set state */ struct pipe_resource *current_tex[PIPE_MAX_SAMPLERS]; @@ -139,6 +139,10 @@ struct lp_setup_context } blend_color; + struct { + const struct lp_setup_variant *variant; + } setup; + unsigned dirty; /**< bitmask of LP_SETUP_NEW_x bits */ void (*point)( struct lp_setup_context *, @@ -181,7 +185,7 @@ lp_setup_print_vertex(struct lp_setup_context *setup, struct lp_rast_triangle * lp_setup_alloc_triangle(struct lp_scene *scene, - unsigned nr_inputs, + unsigned num_inputs, unsigned nr_planes, unsigned *tri_size); diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c index 9f090d1992e..928ffdc5cb5 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_line.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c @@ -35,6 +35,7 @@ #include "lp_setup_context.h" #include "lp_rast.h" #include "lp_state_fs.h" +#include "lp_state_setup.h" #define NUM_CHANNELS 4 @@ -162,19 +163,20 @@ static void setup_line_coefficients( struct lp_setup_context *setup, struct lp_rast_triangle *tri, struct lp_line_info *info) { + const struct lp_setup_variant_key *key = &setup->setup.variant->key; unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; unsigned slot; /* setup interpolation for all the remaining attributes: */ - for (slot = 0; slot < setup->fs.nr_inputs; slot++) { - unsigned vert_attr = setup->fs.input[slot].src_index; - unsigned usage_mask = setup->fs.input[slot].usage_mask; + for (slot = 0; slot < key->num_inputs; slot++) { + unsigned vert_attr = key->inputs[slot].src_index; + unsigned usage_mask = key->inputs[slot].usage_mask; unsigned i; - switch (setup->fs.input[slot].interp) { + switch (key->inputs[slot].interp) { case LP_INTERP_CONSTANT: - if (setup->flatshade_first) { + if (key->flatshade_first) { for (i = 0; i < NUM_CHANNELS; i++) if (usage_mask & (1 << i)) constant_coef(setup, tri, slot+1, info->v1[vert_attr][i], i); @@ -235,14 +237,15 @@ print_line(struct lp_setup_context *setup, const float (*v1)[4], const float (*v2)[4]) { + const struct lp_setup_variant_key *key = &setup->setup.variant->key; uint i; debug_printf("llvmpipe line\n"); - for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { + for (i = 0; i < 1 + key->num_inputs; i++) { debug_printf(" v1[%d]: %f %f %f %f\n", i, v1[i][0], v1[i][1], v1[i][2], v1[i][3]); } - for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { + for (i = 0; i < 1 + key->num_inputs; i++) { debug_printf(" v2[%d]: %f %f %f %f\n", i, v2[i][0], v2[i][1], v2[i][2], v2[i][3]); } @@ -269,6 +272,7 @@ try_setup_line( struct lp_setup_context *setup, const float (*v2)[4]) { struct lp_scene *scene = setup->scene; + const struct lp_setup_variant_key *key = &setup->setup.variant->key; struct lp_rast_triangle *line; struct lp_line_info info; float width = MAX2(1.0, setup->line_width); @@ -548,7 +552,7 @@ try_setup_line( struct lp_setup_context *setup, u_rect_find_intersection(&setup->draw_region, &bbox); line = lp_setup_alloc_triangle(scene, - setup->fs.nr_inputs, + key->num_inputs, nr_planes, &tri_bytes); if (!line) diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c index 55389871518..c98966022ee 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_point.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c @@ -36,6 +36,7 @@ #include "lp_setup_context.h" #include "lp_rast.h" #include "lp_state_fs.h" +#include "lp_state_setup.h" #include "tgsi/tgsi_scan.h" #define NUM_CHANNELS 4 @@ -152,17 +153,18 @@ setup_point_coefficients( struct lp_setup_context *setup, struct lp_rast_triangle *point, const struct point_info *info) { + const struct lp_setup_variant_key *key = &setup->setup.variant->key; unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; unsigned slot; /* setup interpolation for all the remaining attributes: */ - for (slot = 0; slot < setup->fs.nr_inputs; slot++) { - unsigned vert_attr = setup->fs.input[slot].src_index; - unsigned usage_mask = setup->fs.input[slot].usage_mask; + for (slot = 0; slot < key->num_inputs; slot++) { + unsigned vert_attr = key->inputs[slot].src_index; + unsigned usage_mask = key->inputs[slot].usage_mask; unsigned i; - switch (setup->fs.input[slot].interp) { + switch (key->inputs[slot].interp) { case LP_INTERP_POSITION: /* * The generated pixel interpolators will pick up the coeffs from @@ -215,6 +217,7 @@ try_setup_point( struct lp_setup_context *setup, const float (*v0)[4] ) { /* x/y positions in fixed point */ + const struct lp_setup_variant_key *key = &setup->setup.variant->key; const int sizeAttr = setup->psize; const float size = (setup->point_size_per_vertex && sizeAttr > 0) ? v0[sizeAttr][0] @@ -266,7 +269,7 @@ try_setup_point( struct lp_setup_context *setup, u_rect_find_intersection(&setup->draw_region, &bbox); point = lp_setup_alloc_triangle(scene, - setup->fs.nr_inputs, + key->num_inputs, nr_planes, &bytes); if (!point) diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index 5090f82ab5f..43617a6b672 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -34,9 +34,9 @@ #include "util/u_rect.h" #include "lp_perf.h" #include "lp_setup_context.h" -#include "lp_setup_coef.h" #include "lp_rast.h" #include "lp_state_fs.h" +#include "lp_state_setup.h" #define NUM_CHANNELS 4 @@ -65,16 +65,16 @@ fixed_to_float(int a) * immediately after it. * The memory is allocated from the per-scene pool, not per-tile. * \param tri_size returns number of bytes allocated - * \param nr_inputs number of fragment shader inputs + * \param num_inputs number of fragment shader inputs * \return pointer to triangle space */ struct lp_rast_triangle * lp_setup_alloc_triangle(struct lp_scene *scene, - unsigned nr_inputs, + unsigned num_inputs, unsigned nr_planes, unsigned *tri_size) { - unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float); + unsigned input_array_sz = NUM_CHANNELS * (num_inputs + 1) * sizeof(float); struct lp_rast_triangle *tri; unsigned tri_bytes, bytes; char *inputs; @@ -101,25 +101,26 @@ lp_setup_print_vertex(struct lp_setup_context *setup, const char *name, const float (*v)[4]) { + const struct lp_setup_variant_key *key = &setup->setup.variant->key; int i, j; debug_printf(" wpos (%s[0]) xyzw %f %f %f %f\n", name, v[0][0], v[0][1], v[0][2], v[0][3]); - for (i = 0; i < setup->fs.nr_inputs; i++) { - const float *in = v[setup->fs.input[i].src_index]; + for (i = 0; i < key->num_inputs; i++) { + const float *in = v[key->inputs[i].src_index]; debug_printf(" in[%d] (%s[%d]) %s%s%s%s ", i, - name, setup->fs.input[i].src_index, - (setup->fs.input[i].usage_mask & 0x1) ? "x" : " ", - (setup->fs.input[i].usage_mask & 0x2) ? "y" : " ", - (setup->fs.input[i].usage_mask & 0x4) ? "z" : " ", - (setup->fs.input[i].usage_mask & 0x8) ? "w" : " "); + name, key->inputs[i].src_index, + (key->inputs[i].usage_mask & 0x1) ? "x" : " ", + (key->inputs[i].usage_mask & 0x2) ? "y" : " ", + (key->inputs[i].usage_mask & 0x4) ? "z" : " ", + (key->inputs[i].usage_mask & 0x8) ? "w" : " "); for (j = 0; j < 4; j++) - if (setup->fs.input[i].usage_mask & (1<inputs[i].usage_mask & (1<scene; + const struct lp_setup_variant_key *key = &setup->setup.variant->key; struct lp_rast_triangle *tri; int x[3]; int y[3]; @@ -288,7 +290,7 @@ do_triangle_ccw(struct lp_setup_context *setup, u_rect_find_intersection(&setup->draw_region, &bbox); tri = lp_setup_alloc_triangle(scene, - setup->fs.nr_inputs, + key->num_inputs, nr_planes, &tri_bytes); if (!tri) @@ -328,13 +330,25 @@ do_triangle_ccw(struct lp_setup_context *setup, /* Setup parameter interpolants: */ - lp_setup_tri_coef( setup, &tri->inputs, v0, v1, v2, frontfacing ); + setup->setup.variant->jit_function( v0, + v1, + v2, + frontfacing, + tri->inputs.a0, + tri->inputs.dadx, + tri->inputs.dady, + &setup->setup.variant->key ); tri->inputs.facing = frontfacing ? 1.0F : -1.0F; tri->inputs.disable = FALSE; tri->inputs.opaque = setup->fs.current.variant->opaque; tri->inputs.state = setup->fs.stored; + if (0) + lp_dump_setup_coef(&setup->setup.variant->key, + (const float (*)[4])tri->inputs.a0, + (const float (*)[4])tri->inputs.dadx, + (const float (*)[4])tri->inputs.dady); for (i = 0; i < 3; i++) { struct lp_rast_plane *plane = &tri->plane[i]; diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h index 86313e1c484..7893e9cdc0c 100644 --- a/src/gallium/drivers/llvmpipe/lp_state.h +++ b/src/gallium/drivers/llvmpipe/lp_state.h @@ -97,6 +97,9 @@ llvmpipe_set_framebuffer_state(struct pipe_context *, void llvmpipe_update_fs(struct llvmpipe_context *lp); +void +llvmpipe_update_setup(struct llvmpipe_context *lp); + void llvmpipe_update_derived(struct llvmpipe_context *llvmpipe); diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c index edd723f65f2..de242aa93ca 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_derived.c +++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c @@ -50,12 +50,13 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) { const struct lp_fragment_shader *lpfs = llvmpipe->fs; struct vertex_info *vinfo = &llvmpipe->vertex_info; - struct lp_shader_input *inputs = llvmpipe->inputs; unsigned vs_index; uint i; /* - * Match FS inputs against VS outputs, emitting the necessary attributes. + * Match FS inputs against VS outputs, emitting the necessary + * attributes. Could cache these structs and look them up with a + * combination of fragment shader, vertex shader ids. */ vinfo->num_attribs = 0; @@ -74,64 +75,10 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) vs_index = draw_find_shader_output(llvmpipe->draw, lpfs->info.input_semantic_name[i], lpfs->info.input_semantic_index[i]); - if (vs_index < 0) { - /* - * This can happen with sprite coordinates - the vertex - * shader doesn't need to provide an output as we generate - * them internally. However, lets keep pretending that there - * is something there to not confuse other code. - */ - vs_index = 0; - } - - /* This can be pre-computed, except for flatshade: - */ - inputs[i].usage_mask = lpfs->info.input_usage_mask[i]; - - switch (lpfs->info.input_interpolate[i]) { - case TGSI_INTERPOLATE_CONSTANT: - inputs[i].interp = LP_INTERP_CONSTANT; - break; - case TGSI_INTERPOLATE_LINEAR: - inputs[i].interp = LP_INTERP_LINEAR; - break; - case TGSI_INTERPOLATE_PERSPECTIVE: - inputs[i].interp = LP_INTERP_PERSPECTIVE; - break; - default: - assert(0); - break; - } - - switch (lpfs->info.input_semantic_name[i]) { - case TGSI_SEMANTIC_FACE: - inputs[i].interp = LP_INTERP_FACING; - break; - case TGSI_SEMANTIC_POSITION: - /* Position was already emitted above - */ - inputs[i].interp = LP_INTERP_POSITION; - inputs[i].src_index = 0; - continue; - case TGSI_SEMANTIC_COLOR: - /* Colors are linearly inputs[i].interpolated in the fragment shader - * even when flatshading is active. This just tells the - * setup module to use coefficients with ddx==0 and - * ddy==0. - */ - if (llvmpipe->rasterizer->flatshade) - inputs[i].interp = LP_INTERP_CONSTANT; - break; - - default: - break; - } /* * Emit the requested fs attribute for all but position. */ - - inputs[i].src_index = vinfo->num_attribs; draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index); } @@ -145,15 +92,9 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); } - llvmpipe->num_inputs = lpfs->info.num_inputs; - draw_compute_vertex_size(vinfo); - lp_setup_set_vertex_info(llvmpipe->setup, vinfo); - lp_setup_set_fs_inputs(llvmpipe->setup, - inputs, - lpfs->info.num_inputs); } @@ -190,6 +131,10 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe ) LP_NEW_QUERY)) llvmpipe_update_fs( llvmpipe ); + if (llvmpipe->dirty & (LP_NEW_FS | + LP_NEW_RASTERIZER)) + llvmpipe_update_setup( llvmpipe ); + if (llvmpipe->dirty & LP_NEW_BLEND_COLOR) lp_setup_set_blend_color(llvmpipe->setup, &llvmpipe->blend_color); diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index e54dd9f0a3c..ad058e384ad 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -255,8 +255,7 @@ generate_quad_mask(LLVMBuilderRef builder, * \param partial_mask if 1, do mask_input testing */ static void -generate_fs(struct llvmpipe_context *lp, - struct lp_fragment_shader *shader, +generate_fs(struct lp_fragment_shader *shader, const struct lp_fragment_shader_variant_key *key, LLVMBuilderRef builder, struct lp_type type, @@ -468,13 +467,13 @@ generate_blend(const struct pipe_blend_state *blend, * 2x2 pixels. */ static void -generate_fragment(struct llvmpipe_context *lp, +generate_fragment(struct llvmpipe_screen *screen, struct lp_fragment_shader *shader, struct lp_fragment_shader_variant *variant, unsigned partial_mask) { - struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen); const struct lp_fragment_shader_variant_key *key = &variant->key; + struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS]; char func_name[256]; struct lp_type fs_type; struct lp_type blend_type; @@ -507,6 +506,18 @@ generate_fragment(struct llvmpipe_context *lp, unsigned chan; unsigned cbuf; + /* Adjust color input interpolation according to flatshade state: + */ + memcpy(inputs, shader->inputs, shader->info.num_inputs * sizeof inputs[0]); + for (i = 0; i < shader->info.num_inputs; i++) { + if (inputs[i].interp == LP_INTERP_COLOR) { + if (key->flatshade) + inputs[i].interp = LP_INTERP_CONSTANT; + else + inputs[i].interp = LP_INTERP_LINEAR; + } + } + /* TODO: actually pick these based on the fs and color buffer * characteristics. */ @@ -558,7 +569,6 @@ generate_fragment(struct llvmpipe_context *lp, variant->function[partial_mask] = function; - /* XXX: need to propagate noalias down into color param now we are * passing a pointer-to-pointer? */ @@ -606,8 +616,8 @@ generate_fragment(struct llvmpipe_context *lp, * already included in the shader key. */ lp_build_interp_soa_init(&interp, - lp->num_inputs, - lp->inputs, + shader->info.num_inputs, + inputs, builder, fs_type, a0_ptr, dadx_ptr, dady_ptr, x, y); @@ -626,7 +636,7 @@ generate_fragment(struct llvmpipe_context *lp, depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &index, 1, ""); - generate_fs(lp, shader, key, + generate_fs(shader, key, builder, fs_type, context_ptr, @@ -823,7 +833,7 @@ lp_debug_fs_variant(const struct lp_fragment_shader_variant *variant) } static struct lp_fragment_shader_variant * -generate_variant(struct llvmpipe_context *lp, +generate_variant(struct llvmpipe_screen *screen, struct lp_fragment_shader *shader, const struct lp_fragment_shader_variant_key *key) { @@ -869,11 +879,11 @@ generate_variant(struct llvmpipe_context *lp, lp_debug_fs_variant(variant); } - generate_fragment(lp, shader, variant, RAST_EDGE_TEST); + generate_fragment(screen, shader, variant, RAST_EDGE_TEST); if (variant->opaque) { /* Specialized shader, which doesn't need to read the color buffer. */ - generate_fragment(lp, shader, variant, RAST_WHOLE); + generate_fragment(screen, shader, variant, RAST_WHOLE); } else { variant->jit_function[RAST_WHOLE] = variant->jit_function[RAST_EDGE_TEST]; } @@ -888,6 +898,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe, { struct lp_fragment_shader *shader; int nr_samplers; + int i; shader = CALLOC_STRUCT(lp_fragment_shader); if (!shader) @@ -907,6 +918,46 @@ llvmpipe_create_fs_state(struct pipe_context *pipe, shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key, sampler[nr_samplers]); + for (i = 0; i < shader->info.num_inputs; i++) { + shader->inputs[i].usage_mask = shader->info.input_usage_mask[i]; + + switch (shader->info.input_interpolate[i]) { + case TGSI_INTERPOLATE_CONSTANT: + shader->inputs[i].interp = LP_INTERP_CONSTANT; + break; + case TGSI_INTERPOLATE_LINEAR: + shader->inputs[i].interp = LP_INTERP_LINEAR; + break; + case TGSI_INTERPOLATE_PERSPECTIVE: + shader->inputs[i].interp = LP_INTERP_PERSPECTIVE; + break; + default: + assert(0); + break; + } + + switch (shader->info.input_semantic_name[i]) { + case TGSI_SEMANTIC_COLOR: + /* Colors may be either linearly or constant interpolated in + * the fragment shader, but that information isn't available + * here. Mark color inputs and fix them up later. + */ + shader->inputs[i].interp = LP_INTERP_COLOR; + break; + case TGSI_SEMANTIC_FACE: + shader->inputs[i].interp = LP_INTERP_FACING; + break; + case TGSI_SEMANTIC_POSITION: + /* Position was already emitted above + */ + shader->inputs[i].interp = LP_INTERP_POSITION; + shader->inputs[i].src_index = 0; + continue; + } + + shader->inputs[i].src_index = i+1; + } + if (LP_DEBUG & DEBUG_TGSI) { unsigned attrib; debug_printf("llvmpipe: Create fragment shader #%u %p:\n", shader->no, (void *) shader); @@ -1161,6 +1212,7 @@ make_variant_key(struct llvmpipe_context *lp, void llvmpipe_update_fs(struct llvmpipe_context *lp) { + struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen); struct lp_fragment_shader *shader = lp->fs; struct lp_fragment_shader_variant_key key; struct lp_fragment_shader_variant *variant = NULL; @@ -1201,7 +1253,7 @@ llvmpipe_update_fs(struct llvmpipe_context *lp) } t0 = os_time_get(); - variant = generate_variant(lp, shader, &key); + variant = generate_variant(screen, shader, &key); t1 = os_time_get(); dt = t1 - t0; @@ -1221,6 +1273,10 @@ llvmpipe_update_fs(struct llvmpipe_context *lp) + + + + void llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe) { diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h index 2914e7d7efd..f73c7801c00 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.h +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h @@ -34,6 +34,7 @@ #include "pipe/p_state.h" #include "tgsi/tgsi_scan.h" /* for tgsi_shader_info */ #include "gallivm/lp_bld_sample.h" /* for struct lp_sampler_static_state */ +#include "lp_bld_interp.h" /* for struct lp_shader_input */ struct tgsi_token; @@ -105,6 +106,9 @@ struct lp_fragment_shader unsigned no; unsigned variants_created; unsigned variants_cached; + + /** Fragment shader input interpolation info */ + struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS]; }; diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c b/src/gallium/drivers/llvmpipe/lp_state_setup.c new file mode 100644 index 00000000000..aa9147a1a15 --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c @@ -0,0 +1,768 @@ +/************************************************************************** + * + * Copyright 2010 VMware. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_simple_list.h" +#include "os/os_time.h" +#include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_init.h" +#include "gallivm/lp_bld_intr.h" +#include /* for LLVMVerifyFunction */ + +#include "lp_perf.h" +#include "lp_debug.h" +#include "lp_flush.h" +#include "lp_screen.h" +#include "lp_context.h" +#include "lp_setup_context.h" +#include "lp_rast.h" +#include "lp_state.h" +#include "lp_state_fs.h" +#include "lp_state_setup.h" + + + +/* currently organized to interpolate full float[4] attributes even + * when some elements are unused. Later, can pack vertex data more + * closely. + */ + + +struct lp_setup_args +{ + /* Function arguments: + */ + LLVMValueRef v0; + LLVMValueRef v1; + LLVMValueRef v2; + LLVMValueRef facing; /* boolean */ + LLVMValueRef a0; + LLVMValueRef dadx; + LLVMValueRef dady; + + /* Derived: + */ + LLVMValueRef x0_center; + LLVMValueRef y0_center; + LLVMValueRef dy20_ooa; + LLVMValueRef dy01_ooa; + LLVMValueRef dx20_ooa; + LLVMValueRef dx01_ooa; +}; + +static LLVMTypeRef type4f(void) +{ + return LLVMVectorType(LLVMFloatType(), 4); +} + + +/* Equivalent of _mm_setr_ps(a,b,c,d) + */ +static LLVMValueRef vec4f(LLVMBuilderRef bld, + LLVMValueRef a, LLVMValueRef b, LLVMValueRef c, LLVMValueRef d, + const char *name) +{ + LLVMValueRef i0 = LLVMConstInt(LLVMInt32Type(), 0, 0); + LLVMValueRef i1 = LLVMConstInt(LLVMInt32Type(), 1, 0); + LLVMValueRef i2 = LLVMConstInt(LLVMInt32Type(), 2, 0); + LLVMValueRef i3 = LLVMConstInt(LLVMInt32Type(), 3, 0); + + LLVMValueRef res = LLVMGetUndef(type4f()); + + res = LLVMBuildInsertElement(bld, res, a, i0, ""); + res = LLVMBuildInsertElement(bld, res, b, i1, ""); + res = LLVMBuildInsertElement(bld, res, c, i2, ""); + res = LLVMBuildInsertElement(bld, res, d, i3, name); + + return res; +} + +/* Equivalent of _mm_set1_ps(a) + */ +static LLVMValueRef vec4f_from_scalar(LLVMBuilderRef bld, + LLVMValueRef a, + const char *name) +{ + LLVMValueRef res = LLVMGetUndef(type4f()); + int i; + + for(i = 0; i < 4; ++i) { + LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); + res = LLVMBuildInsertElement(bld, res, a, index, i == 3 ? name : ""); + } + + return res; +} + +static void +store_coef(LLVMBuilderRef builder, + struct lp_setup_args *args, + unsigned slot, + LLVMValueRef a0, + LLVMValueRef dadx, + LLVMValueRef dady) +{ + LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), slot, 0); + + LLVMBuildStore(builder, + a0, + LLVMBuildGEP(builder, args->a0, &idx, 1, "")); + + LLVMBuildStore(builder, + dadx, + LLVMBuildGEP(builder, args->dadx, &idx, 1, "")); + + LLVMBuildStore(builder, + dady, + LLVMBuildGEP(builder, args->dady, &idx, 1, "")); +} + + + +static void +emit_constant_coef4( LLVMBuilderRef builder, + struct lp_setup_args *args, + unsigned slot, + LLVMValueRef vert, + unsigned attr) +{ + LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0); + LLVMValueRef zerovec = vec4f_from_scalar(builder, zero, "zero"); + LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), attr, 0); + LLVMValueRef attr_ptr = LLVMBuildGEP(builder, vert, &idx, 1, "attr_ptr"); + LLVMValueRef vert_attr = LLVMBuildLoad(builder, attr_ptr, "vert_attr"); + + store_coef(builder, args, slot, vert_attr, zerovec, zerovec); +} + + + +/** + * Setup the fragment input attribute with the front-facing value. + * \param frontface is the triangle front facing? + */ +static void +emit_facing_coef( LLVMBuilderRef builder, + struct lp_setup_args *args, + unsigned slot ) +{ + LLVMValueRef a0_0 = args->facing; + LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0); + LLVMValueRef a0 = vec4f(builder, a0_0, zero, zero, zero, "facing"); + LLVMValueRef zerovec = vec4f_from_scalar(builder, zero, "zero"); + + store_coef(builder, args, slot, a0, zerovec, zerovec); +} + + +static LLVMValueRef +vert_attrib(LLVMBuilderRef b, + LLVMValueRef vert, + int attr, + int elem, + const char *name) +{ + LLVMValueRef idx[2]; + idx[0] = LLVMConstInt(LLVMInt32Type(), attr, 0); + idx[1] = LLVMConstInt(LLVMInt32Type(), elem, 0); + return LLVMBuildLoad(b, LLVMBuildGEP(b, vert, idx, 2, ""), name); +} + + + +static void +emit_coef4( LLVMBuilderRef b, + struct lp_setup_args *args, + unsigned slot, + LLVMValueRef a0, + LLVMValueRef a1, + LLVMValueRef a2) +{ + LLVMValueRef dy20_ooa = args->dy20_ooa; + LLVMValueRef dy01_ooa = args->dy01_ooa; + LLVMValueRef dx20_ooa = args->dx20_ooa; + LLVMValueRef dx01_ooa = args->dx01_ooa; + LLVMValueRef x0_center = args->x0_center; + LLVMValueRef y0_center = args->y0_center; + + /* XXX: using fsub, fmul on vector types -- does this work?? + */ + LLVMValueRef da01 = LLVMBuildFSub(b, a0, a1, "da01"); + LLVMValueRef da20 = LLVMBuildFSub(b, a2, a0, "da20"); + + /* Calculate dadx (vec4f) + */ + LLVMValueRef da01_dy20_ooa = LLVMBuildFMul(b, da01, dy20_ooa, "da01_dy20_ooa"); + LLVMValueRef da20_dy01_ooa = LLVMBuildFMul(b, da20, dy01_ooa, "da20_dy01_ooa"); + LLVMValueRef dadx = LLVMBuildFSub(b, da01_dy20_ooa, da20_dy01_ooa, "dadx"); + + /* Calculate dady (vec4f) + */ + LLVMValueRef da01_dx20_ooa = LLVMBuildFMul(b, da01, dx20_ooa, "da01_dx20_ooa"); + LLVMValueRef da20_dx01_ooa = LLVMBuildFMul(b, da20, dx01_ooa, "da20_dx01_ooa"); + LLVMValueRef dady = LLVMBuildFSub(b, da20_dx01_ooa, da01_dx20_ooa, "dady"); + + /* Calculate a0 - the attribute value at the origin + */ + LLVMValueRef dadx_x0 = LLVMBuildFMul(b, dadx, x0_center, "dadx_x0"); + LLVMValueRef dady_y0 = LLVMBuildFMul(b, dady, y0_center, "dady_y0"); + LLVMValueRef attr_v0 = LLVMBuildFAdd(b, dadx_x0, dady_y0, "attr_v0"); + LLVMValueRef attr_0 = LLVMBuildFSub(b, a0, attr_v0, "attr_0"); + + store_coef(b, args, slot, attr_0, dadx, dady); +} + + +static void +emit_linear_coef( LLVMBuilderRef b, + struct lp_setup_args *args, + unsigned slot, + unsigned vert_attr) +{ + LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), vert_attr, 0); + + LLVMValueRef a0 = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx, 1, ""), "v0a"); + LLVMValueRef a1 = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx, 1, ""), "v1a"); + LLVMValueRef a2 = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx, 1, ""), "v2a"); + + emit_coef4(b, args, slot, a0, a1, a2); +} + + + +/** + * Compute a0, dadx and dady for a perspective-corrected interpolant, + * for a triangle. + * We basically multiply the vertex value by 1/w before computing + * the plane coefficients (a0, dadx, dady). + * Later, when we compute the value at a particular fragment position we'll + * divide the interpolated value by the interpolated W at that fragment. + */ +static void +emit_perspective_coef( LLVMBuilderRef b, + struct lp_setup_args *args, + unsigned slot, + unsigned vert_attr) +{ + /* premultiply by 1/w (v[0][3] is always 1/w): + */ + LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), vert_attr, 0); + + LLVMValueRef v0a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx, 1, ""), "v0a"); + LLVMValueRef v1a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx, 1, ""), "v1a"); + LLVMValueRef v2a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx, 1, ""), "v2a"); + + LLVMValueRef v0_oow = vec4f_from_scalar(b, vert_attrib(b, args->v0, 0, 3, ""), "v0_oow"); + LLVMValueRef v1_oow = vec4f_from_scalar(b, vert_attrib(b, args->v1, 0, 3, ""), "v1_oow"); + LLVMValueRef v2_oow = vec4f_from_scalar(b, vert_attrib(b, args->v2, 0, 3, ""), "v2_oow"); + + LLVMValueRef v0_oow_v0a = LLVMBuildFMul(b, v0a, v0_oow, "v0_oow_v0a"); + LLVMValueRef v1_oow_v1a = LLVMBuildFMul(b, v1a, v1_oow, "v1_oow_v1a"); + LLVMValueRef v2_oow_v2a = LLVMBuildFMul(b, v2a, v2_oow, "v2_oow_v2a"); + + emit_coef4(b, args, slot, v0_oow_v0a, v1_oow_v1a, v2_oow_v2a); +} + + +static void +emit_position_coef( LLVMBuilderRef builder, + struct lp_setup_args *args, + int slot, int attrib ) +{ + emit_linear_coef(builder, args, slot, attrib); +} + + + + +/** + * Compute the inputs-> dadx, dady, a0 values. + */ +static void +emit_tri_coef( LLVMBuilderRef builder, + const struct lp_setup_variant_key *key, + struct lp_setup_args *args ) +{ + unsigned slot; + + /* The internal position input is in slot zero: + */ + emit_position_coef(builder, args, 0, 0); + + /* setup interpolation for all the remaining attributes: + */ + for (slot = 0; slot < key->num_inputs; slot++) { + unsigned vert_attr = key->inputs[slot].src_index; + + switch (key->inputs[slot].interp) { + case LP_INTERP_CONSTANT: + if (key->flatshade_first) { + emit_constant_coef4(builder, args, slot+1, args->v0, vert_attr); + } + else { + emit_constant_coef4(builder, args, slot+1, args->v2, vert_attr); + } + break; + + case LP_INTERP_LINEAR: + emit_linear_coef(builder, args, slot+1, vert_attr); + break; + + case LP_INTERP_PERSPECTIVE: + emit_perspective_coef(builder, args, slot+1, vert_attr); + break; + + case LP_INTERP_POSITION: + /* + * The generated pixel interpolators will pick up the coeffs from + * slot 0. + */ + break; + + case LP_INTERP_FACING: + emit_facing_coef(builder, args, slot+1); + break; + + default: + assert(0); + } + } +} + + +/* XXX: This is generic code, share with fs/vs codegen: + */ +static lp_jit_setup_triangle +finalize_function(struct llvmpipe_screen *screen, + LLVMBuilderRef builder, + LLVMValueRef function) +{ + void *f; + + /* Verify the LLVM IR. If invalid, dump and abort */ +#ifdef DEBUG + if (LLVMVerifyFunction(function, LLVMPrintMessageAction)) { + if (1) + lp_debug_dump_value(function); + abort(); + } +#endif + + /* Apply optimizations to LLVM IR */ + LLVMRunFunctionPassManager(screen->pass, function); + + if (gallivm_debug & GALLIVM_DEBUG_IR) + { + /* Print the LLVM IR to stderr */ + lp_debug_dump_value(function); + debug_printf("\n"); + } + + /* + * Translate the LLVM IR into machine code. + */ + f = LLVMGetPointerToGlobal(screen->engine, function); + + if (gallivm_debug & GALLIVM_DEBUG_ASM) + { + lp_disassemble(f); + } + + lp_func_delete_body(function); + + return f; +} + +/* XXX: Generic code: + */ +static void +lp_emit_emms(LLVMBuilderRef builder) +{ +#ifdef PIPE_ARCH_X86 + /* Avoid corrupting the FPU stack on 32bit OSes. */ + lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0); +#endif +} + + +/* XXX: generic code: + */ +static void +set_noalias(LLVMBuilderRef builder, + LLVMValueRef function, + const LLVMTypeRef *arg_types, + int nr_args) +{ + int i; + for(i = 0; i < Elements(arg_types); ++i) + if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) + LLVMAddAttribute(LLVMGetParam(function, i), + LLVMNoAliasAttribute); +} + +static void +init_args(LLVMBuilderRef b, + struct lp_setup_args *args, + const struct lp_setup_variant *variant) +{ + LLVMValueRef v0_x = vert_attrib(b, args->v0, 0, 0, "v0_x"); + LLVMValueRef v0_y = vert_attrib(b, args->v0, 0, 1, "v0_y"); + + LLVMValueRef v1_x = vert_attrib(b, args->v1, 0, 0, "v1_x"); + LLVMValueRef v1_y = vert_attrib(b, args->v1, 0, 1, "v1_y"); + + LLVMValueRef v2_x = vert_attrib(b, args->v2, 0, 0, "v2_x"); + LLVMValueRef v2_y = vert_attrib(b, args->v2, 0, 1, "v2_y"); + + LLVMValueRef pixel_center = LLVMConstReal(LLVMFloatType(), + variant->key.pixel_center_half ? 0.5 : 0); + + LLVMValueRef x0_center = LLVMBuildFSub(b, v0_x, pixel_center, "x0_center" ); + LLVMValueRef y0_center = LLVMBuildFSub(b, v0_y, pixel_center, "y0_center" ); + + LLVMValueRef dx01 = LLVMBuildFSub(b, v0_x, v1_x, "dx01"); + LLVMValueRef dy01 = LLVMBuildFSub(b, v0_y, v1_y, "dy01"); + LLVMValueRef dx20 = LLVMBuildFSub(b, v2_x, v0_x, "dx20"); + LLVMValueRef dy20 = LLVMBuildFSub(b, v2_y, v0_y, "dy20"); + + LLVMValueRef one = LLVMConstReal(LLVMFloatType(), 1.0); + LLVMValueRef e = LLVMBuildFMul(b, dx01, dy20, "e"); + LLVMValueRef f = LLVMBuildFMul(b, dx20, dy01, "f"); + LLVMValueRef ooa = LLVMBuildFDiv(b, one, LLVMBuildFSub(b, e, f, ""), "ooa"); + + LLVMValueRef dy20_ooa = LLVMBuildFMul(b, dy20, ooa, "dy20_ooa"); + LLVMValueRef dy01_ooa = LLVMBuildFMul(b, dy01, ooa, "dy01_ooa"); + LLVMValueRef dx20_ooa = LLVMBuildFMul(b, dx20, ooa, "dx20_ooa"); + LLVMValueRef dx01_ooa = LLVMBuildFMul(b, dx01, ooa, "dx01_ooa"); + + args->dy20_ooa = vec4f_from_scalar(b, dy20_ooa, "dy20_ooa_4f"); + args->dy01_ooa = vec4f_from_scalar(b, dy01_ooa, "dy01_ooa_4f"); + + args->dx20_ooa = vec4f_from_scalar(b, dx20_ooa, "dx20_ooa_4f"); + args->dx01_ooa = vec4f_from_scalar(b, dx01_ooa, "dx01_ooa_4f"); + + args->x0_center = vec4f_from_scalar(b, x0_center, "x0_center_4f"); + args->y0_center = vec4f_from_scalar(b, y0_center, "y0_center_4f"); +} + +/** + * Generate the runtime callable function for the coefficient calculation. + * + */ +static struct lp_setup_variant * +generate_setup_variant(struct llvmpipe_screen *screen, + struct lp_setup_variant_key *key) +{ + struct lp_setup_variant *variant = NULL; + struct lp_setup_args args; + char func_name[256]; + LLVMTypeRef vec4f_type; + LLVMTypeRef func_type; + LLVMTypeRef arg_types[8]; + LLVMBasicBlockRef block; + LLVMBuilderRef builder; + int64_t t0, t1; + + if (0) + goto fail; + + variant = CALLOC_STRUCT(lp_setup_variant); + if (variant == NULL) + goto fail; + + if (LP_DEBUG & DEBUG_COUNTERS) { + t0 = os_time_get(); + } + + memcpy(&variant->key, key, key->size); + variant->list_item_global.base = variant; + + util_snprintf(func_name, sizeof(func_name), "fs%u_setup%u", + 0, + variant->no); + + /* Currently always deal with full 4-wide vertex attributes from + * the vertices. + */ + + vec4f_type = LLVMVectorType(LLVMFloatType(), 4); + + arg_types[0] = LLVMPointerType(vec4f_type, 0); /* v0 */ + arg_types[1] = LLVMPointerType(vec4f_type, 0); /* v1 */ + arg_types[2] = LLVMPointerType(vec4f_type, 0); /* v2 */ + arg_types[3] = LLVMInt32Type(); /* facing */ + arg_types[4] = LLVMPointerType(vec4f_type, 0); /* a0, aligned */ + arg_types[5] = LLVMPointerType(vec4f_type, 0); /* dadx, aligned */ + arg_types[6] = LLVMPointerType(vec4f_type, 0); /* dady, aligned */ + arg_types[7] = LLVMPointerType(LLVMVoidType(), 0); /* key, unused */ + + func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0); + + variant->function = LLVMAddFunction(screen->module, func_name, func_type); + if (!variant->function) + goto fail; + + LLVMSetFunctionCallConv(variant->function, LLVMCCallConv); + + args.v0 = LLVMGetParam(variant->function, 0); + args.v1 = LLVMGetParam(variant->function, 1); + args.v2 = LLVMGetParam(variant->function, 2); + args.facing = LLVMGetParam(variant->function, 3); + args.a0 = LLVMGetParam(variant->function, 4); + args.dadx = LLVMGetParam(variant->function, 5); + args.dady = LLVMGetParam(variant->function, 6); + + lp_build_name(args.v0, "in_v0"); + lp_build_name(args.v1, "in_v1"); + lp_build_name(args.v2, "in_v2"); + lp_build_name(args.facing, "in_facing"); + lp_build_name(args.a0, "out_a0"); + lp_build_name(args.dadx, "out_dadx"); + lp_build_name(args.dady, "out_dady"); + + /* + * Function body + */ + block = LLVMAppendBasicBlock(variant->function, "entry"); + builder = LLVMCreateBuilder(); + LLVMPositionBuilderAtEnd(builder, block); + + set_noalias(builder, variant->function, arg_types, Elements(arg_types)); + init_args(builder, &args, variant); + emit_tri_coef(builder, &variant->key, &args); + + lp_emit_emms(builder); + LLVMBuildRetVoid(builder); + LLVMDisposeBuilder(builder); + + variant->jit_function = finalize_function(screen, builder, + variant->function); + if (!variant->jit_function) + goto fail; + + /* + * Update timing information: + */ + if (LP_DEBUG & DEBUG_COUNTERS) { + t1 = os_time_get(); + LP_COUNT_ADD(llvm_compile_time, t1 - t0); + LP_COUNT_ADD(nr_llvm_compiles, 1); + } + + return variant; + +fail: + if (variant) { + if (variant->function) { + if (variant->jit_function) + LLVMFreeMachineCodeForFunction(screen->engine, + variant->function); + LLVMDeleteFunction(variant->function); + } + FREE(variant); + } + + return NULL; +} + + + +static void +lp_make_setup_variant_key(struct llvmpipe_context *lp, + struct lp_setup_variant_key *key) +{ + struct lp_fragment_shader *fs = lp->fs; + unsigned i; + + assert(sizeof key->inputs[0] == sizeof(ushort)); + + key->num_inputs = fs->info.num_inputs; + key->flatshade_first = lp->rasterizer->flatshade_first; + key->pixel_center_half = lp->rasterizer->gl_rasterization_rules; + key->size = Offset(struct lp_setup_variant_key, + inputs[key->num_inputs]); + key->pad = 0; + + memcpy(key->inputs, fs->inputs, key->num_inputs * sizeof key->inputs[0]); + for (i = 0; i < key->num_inputs; i++) { + if (key->inputs[i].interp == LP_INTERP_COLOR) { + if (lp->rasterizer->flatshade) + key->inputs[i].interp = LP_INTERP_CONSTANT; + else + key->inputs[i].interp = LP_INTERP_LINEAR; + } + } + +} + + +static void +remove_setup_variant(struct llvmpipe_context *lp, + struct lp_setup_variant *variant) +{ + struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen); + + if (gallivm_debug & GALLIVM_DEBUG_IR) { + debug_printf("llvmpipe: del setup_variant #%u total %u\n", + variant->no, lp->nr_setup_variants); + } + + if (variant->function) { + if (variant->jit_function) + LLVMFreeMachineCodeForFunction(screen->engine, + variant->function); + LLVMDeleteFunction(variant->function); + } + + remove_from_list(&variant->list_item_global); + lp->nr_setup_variants--; + FREE(variant); +} + + + +/* When the number of setup variants exceeds a threshold, cull a + * fraction (currently a quarter) of them. + */ +static void +cull_setup_variants(struct llvmpipe_context *lp) +{ + struct pipe_context *pipe = &lp->pipe; + int i; + + /* + * XXX: we need to flush the context until we have some sort of reference + * counting in fragment shaders as they may still be binned + * Flushing alone might not be sufficient we need to wait on it too. + */ + llvmpipe_finish(pipe, __FUNCTION__); + + for (i = 0; i < LP_MAX_SETUP_VARIANTS / 4; i++) { + struct lp_setup_variant_list_item *item = last_elem(&lp->setup_variants_list); + remove_setup_variant(lp, item->base); + } +} + + +/** + * Update fragment/vertex shader linkage state. This is called just + * prior to drawing something when some fragment-related state has + * changed. + */ +void +llvmpipe_update_setup(struct llvmpipe_context *lp) +{ + struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen); + + struct lp_setup_variant_key *key = &lp->setup_variant.key; + struct lp_setup_variant *variant = NULL; + struct lp_setup_variant_list_item *li; + + lp_make_setup_variant_key(lp, key); + + foreach(li, &lp->setup_variants_list) { + if(li->base->key.size == key->size && + memcmp(&li->base->key, key, key->size) == 0) { + variant = li->base; + break; + } + } + + if (variant) { + move_to_head(&lp->setup_variants_list, &variant->list_item_global); + } + else { + if (lp->nr_setup_variants >= LP_MAX_SETUP_VARIANTS) { + cull_setup_variants(lp); + } + + variant = generate_setup_variant(screen, key); + if (variant) { + insert_at_head(&lp->setup_variants_list, &variant->list_item_global); + lp->nr_setup_variants++; + } + else { + /* Keep the old path around for debugging, and also perhaps + * in case malloc fails during compilation. + */ + variant = &lp->setup_variant; + variant->jit_function = lp_setup_tri_fallback; + } + } + + lp_setup_set_setup_variant(lp->setup, + variant); +} + +void +lp_delete_setup_variants(struct llvmpipe_context *lp) +{ + struct lp_setup_variant_list_item *li; + li = first_elem(&lp->setup_variants_list); + while(!at_end(&lp->setup_variants_list, li)) { + struct lp_setup_variant_list_item *next = next_elem(li); + remove_setup_variant(lp, li->base); + li = next; + } +} + +void +lp_dump_setup_coef( const struct lp_setup_variant_key *key, + const float (*sa0)[4], + const float (*sdadx)[4], + const float (*sdady)[4]) +{ + int i, slot; + + for (i = 0; i < NUM_CHANNELS; i++) { + float a0 = sa0 [0][i]; + float dadx = sdadx[0][i]; + float dady = sdady[0][i]; + + debug_printf("POS.%c: a0 = %f, dadx = %f, dady = %f\n", + "xyzw"[i], + a0, dadx, dady); + } + + for (slot = 0; slot < key->num_inputs; slot++) { + unsigned usage_mask = key->inputs[slot].usage_mask; + for (i = 0; i < NUM_CHANNELS; i++) { + if (usage_mask & (1 << i)) { + float a0 = sa0 [1 + slot][i]; + float dadx = sdadx[1 + slot][i]; + float dady = sdady[1 + slot][i]; + + debug_printf("IN[%u].%c: a0 = %f, dadx = %f, dady = %f\n", + slot, + "xyzw"[i], + a0, dadx, dady); + } + } + } +} diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.h b/src/gallium/drivers/llvmpipe/lp_state_setup.h new file mode 100644 index 00000000000..2b080fbc321 --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_state_setup.h @@ -0,0 +1,81 @@ +#ifndef LP_STATE_SETUP_H +#define LP_STATE_SETUP_H + +#include "lp_bld_interp.h" + + +struct llvmpipe_context; +struct lp_setup_variant; + +struct lp_setup_variant_list_item +{ + struct lp_setup_variant *base; + struct lp_setup_variant_list_item *next, *prev; +}; + + +struct lp_setup_variant_key { + unsigned num_inputs:8; + unsigned flatshade_first:1; + unsigned pixel_center_half:1; + unsigned pad:7; + unsigned size:16; + struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS]; +}; + + +typedef void (*lp_jit_setup_triangle)( const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4], + boolean front_facing, + float (*a0)[4], + float (*dadx)[4], + float (*dady)[4], + const struct lp_setup_variant_key *key ); + + + + +/* At this stage, for a given variant key, we create a + * draw_vertex_info struct telling the draw module how to format the + * vertices, and an llvm-generated function which calculates the + * attribute interpolants (a0, dadx, dady) from three of those + * vertices. + */ +struct lp_setup_variant { + struct lp_setup_variant_key key; + + struct lp_setup_variant_list_item list_item_global; + + /* XXX: this is a pointer to the LLVM IR. Once jit_function is + * generated, we never need to use the IR again - need to find a + * way to release this data without destroying the generated + * assembly. + */ + LLVMValueRef function; + + /* The actual generated setup function: + */ + lp_jit_setup_triangle jit_function; + + unsigned no; +}; + +void lp_setup_tri_fallback( const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4], + boolean front_facing, + float (*a0)[4], + float (*dadx)[4], + float (*dady)[4], + const struct lp_setup_variant_key *key ); + +void lp_delete_setup_variants(struct llvmpipe_context *lp); + +void +lp_dump_setup_coef( const struct lp_setup_variant_key *key, + const float (*sa0)[4], + const float (*sdadx)[4], + const float (*sdady)[4]); + +#endif diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup_fallback.c b/src/gallium/drivers/llvmpipe/lp_state_setup_fallback.c new file mode 100644 index 00000000000..1922efcc88d --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_state_setup_fallback.c @@ -0,0 +1,265 @@ +/************************************************************************** + * + * Copyright 2010, VMware. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Fallback (non-llvm) path for triangle setup. Will remove once llvm + * is up and running. + * + * TODO: line/point setup. + */ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "lp_state_setup.h" + + + +#if defined(PIPE_ARCH_SSE) +#include + +struct setup_args { + float (*a0)[4]; /* aligned */ + float (*dadx)[4]; /* aligned */ + float (*dady)[4]; /* aligned */ + + float x0_center; + float y0_center; + + /* turn these into an aligned float[4] */ + float dy01_ooa; + float dy20_ooa; + float dx01_ooa; + float dx20_ooa; + + const float (*v0)[4]; /* aligned */ + const float (*v1)[4]; /* aligned */ + const float (*v2)[4]; /* aligned */ + + boolean frontfacing; /* remove eventually */ +}; + + +static void constant_coef4( struct setup_args *args, + unsigned slot, + const float *attr) +{ + *(__m128 *)args->a0[slot] = *(__m128 *)attr; + *(__m128 *)args->dadx[slot] = _mm_set1_ps(0.0); + *(__m128 *)args->dady[slot] = _mm_set1_ps(0.0); +} + + + +/** + * Setup the fragment input attribute with the front-facing value. + * \param frontface is the triangle front facing? + */ +static void setup_facing_coef( struct setup_args *args, + unsigned slot ) +{ + /* XXX: just pass frontface directly to the shader, don't bother + * treating it as an input. + */ + __m128 a0 = _mm_setr_ps(args->frontfacing ? 1.0 : -1.0, + 0, 0, 0); + + *(__m128 *)args->a0[slot] = a0; + *(__m128 *)args->dadx[slot] = _mm_set1_ps(0.0); + *(__m128 *)args->dady[slot] = _mm_set1_ps(0.0); +} + + + +static void calc_coef4( struct setup_args *args, + unsigned slot, + __m128 a0, + __m128 a1, + __m128 a2) +{ + __m128 da01 = _mm_sub_ps(a0, a1); + __m128 da20 = _mm_sub_ps(a2, a0); + + __m128 da01_dy20_ooa = _mm_mul_ps(da01, _mm_set1_ps(args->dy20_ooa)); + __m128 da20_dy01_ooa = _mm_mul_ps(da20, _mm_set1_ps(args->dy01_ooa)); + __m128 dadx = _mm_sub_ps(da01_dy20_ooa, da20_dy01_ooa); + + __m128 da01_dx20_ooa = _mm_mul_ps(da01, _mm_set1_ps(args->dx20_ooa)); + __m128 da20_dx01_ooa = _mm_mul_ps(da20, _mm_set1_ps(args->dx01_ooa)); + __m128 dady = _mm_sub_ps(da20_dx01_ooa, da01_dx20_ooa); + + __m128 dadx_x0 = _mm_mul_ps(dadx, _mm_set1_ps(args->x0_center)); + __m128 dady_y0 = _mm_mul_ps(dady, _mm_set1_ps(args->y0_center)); + __m128 attr_v0 = _mm_add_ps(dadx_x0, dady_y0); + __m128 attr_0 = _mm_sub_ps(a0, attr_v0); + + *(__m128 *)args->a0[slot] = attr_0; + *(__m128 *)args->dadx[slot] = dadx; + *(__m128 *)args->dady[slot] = dady; +} + + +static void linear_coef( struct setup_args *args, + unsigned slot, + unsigned vert_attr) +{ + __m128 a0 = *(const __m128 *)args->v0[vert_attr]; + __m128 a1 = *(const __m128 *)args->v1[vert_attr]; + __m128 a2 = *(const __m128 *)args->v2[vert_attr]; + + calc_coef4(args, slot, a0, a1, a2); +} + + + +/** + * Compute a0, dadx and dady for a perspective-corrected interpolant, + * for a triangle. + * We basically multiply the vertex value by 1/w before computing + * the plane coefficients (a0, dadx, dady). + * Later, when we compute the value at a particular fragment position we'll + * divide the interpolated value by the interpolated W at that fragment. + */ +static void perspective_coef( struct setup_args *args, + unsigned slot, + unsigned vert_attr) +{ + /* premultiply by 1/w (v[0][3] is always 1/w): + */ + __m128 a0 = *(const __m128 *)args->v0[vert_attr]; + __m128 a1 = *(const __m128 *)args->v1[vert_attr]; + __m128 a2 = *(const __m128 *)args->v2[vert_attr]; + + __m128 a0_oow = _mm_mul_ps(a0, _mm_set1_ps(args->v0[0][3])); + __m128 a1_oow = _mm_mul_ps(a1, _mm_set1_ps(args->v1[0][3])); + __m128 a2_oow = _mm_mul_ps(a2, _mm_set1_ps(args->v2[0][3])); + + calc_coef4(args, slot, a0_oow, a1_oow, a2_oow); +} + + + + + +/** + * Compute the args-> dadx, dady, a0 values. + * + * Note that this was effectively a little interpreted program, where + * the opcodes were LP_INTERP_*. This is the program which is now + * being code-generated in lp_state_setup.c. + */ +void lp_setup_tri_fallback( const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4], + boolean front_facing, + float (*a0)[4], + float (*dadx)[4], + float (*dady)[4], + const struct lp_setup_variant_key *key ) +{ + struct setup_args args; + float pixel_offset = key->pixel_center_half ? 0.5 : 0.0; + float dx01 = v0[0][0] - v1[0][0]; + float dy01 = v0[0][1] - v1[0][1]; + float dx20 = v2[0][0] - v0[0][0]; + float dy20 = v2[0][1] - v0[0][1]; + float oneoverarea = 1.0f / (dx01 * dy20 - dx20 * dy01); + unsigned slot; + + args.v0 = v0; + args.v1 = v1; + args.v2 = v2; + args.frontfacing = front_facing; + args.a0 = a0; + args.dadx = dadx; + args.dady = dady; + + args.x0_center = v0[0][0] - pixel_offset; + args.y0_center = v0[0][1] - pixel_offset; + args.dx01_ooa = dx01 * oneoverarea; + args.dx20_ooa = dx20 * oneoverarea; + args.dy01_ooa = dy01 * oneoverarea; + args.dy20_ooa = dy20 * oneoverarea; + + /* The internal position input is in slot zero: + */ + linear_coef(&args, 0, 0); + + /* setup interpolation for all the remaining attributes: + */ + for (slot = 0; slot < key->num_inputs; slot++) { + unsigned vert_attr = key->inputs[slot].src_index; + + switch (key->inputs[slot].interp) { + case LP_INTERP_CONSTANT: + if (key->flatshade_first) { + constant_coef4(&args, slot+1, args.v0[vert_attr]); + } + else { + constant_coef4(&args, slot+1, args.v2[vert_attr]); + } + break; + + case LP_INTERP_LINEAR: + linear_coef(&args, slot+1, vert_attr); + break; + + case LP_INTERP_PERSPECTIVE: + perspective_coef(&args, slot+1, vert_attr); + break; + + case LP_INTERP_POSITION: + /* + * The generated pixel interpolators will pick up the coeffs from + * slot 0. + */ + break; + + case LP_INTERP_FACING: + setup_facing_coef(&args, slot+1); + break; + + default: + assert(0); + } + } +} + +#else + +void lp_setup_tri_fallback( const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4], + boolean front_facing, + float (*a0)[4], + float (*dadx)[4], + float (*dady)[4], + const struct lp_setup_variant_key *key ) +{ + /* this path for debugging only, don't need a non-sse version. */ +} + +#endif From 7ef3d171a0e535ba720d9c7a6cdc6bb165416a3c Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Sat, 18 Sep 2010 09:15:14 +0100 Subject: [PATCH 002/540] graw: add frag-face shader --- .../tests/regress/fragment-shader/frag-face.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 src/gallium/tests/python/tests/regress/fragment-shader/frag-face.sh diff --git a/src/gallium/tests/python/tests/regress/fragment-shader/frag-face.sh b/src/gallium/tests/python/tests/regress/fragment-shader/frag-face.sh new file mode 100644 index 00000000000..5745b6a5aba --- /dev/null +++ b/src/gallium/tests/python/tests/regress/fragment-shader/frag-face.sh @@ -0,0 +1,14 @@ +FRAG + +DCL IN[0], COLOR, LINEAR +DCL IN[1], FACE, CONSTANT +DCL OUT[0], COLOR +DCL TEMP[0] +IMM FLT32 { 0.5, 1.0, 0.0, 0.0 } + +MUL TEMP[0], IN[1].xxxx, IMM[0].xxxx +ADD TEMP[0], TEMP[0], IMM[0].yyyy + +MOV OUT[0], TEMP[0] + +END From 75d22e71a812bbe78414d3f9519f4c7a7157c748 Mon Sep 17 00:00:00 2001 From: Hui Qi Tay Date: Sun, 26 Sep 2010 16:01:59 +0800 Subject: [PATCH 003/540] llvmpipe: minor changes in llvm coefficient calcs --- src/gallium/drivers/llvmpipe/lp_setup_debug.c | 1 + src/gallium/drivers/llvmpipe/lp_state_setup.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) create mode 100644 src/gallium/drivers/llvmpipe/lp_setup_debug.c diff --git a/src/gallium/drivers/llvmpipe/lp_setup_debug.c b/src/gallium/drivers/llvmpipe/lp_setup_debug.c new file mode 100644 index 00000000000..a71a4719a86 --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_setup_debug.c @@ -0,0 +1 @@ +/* Some debugging stuff */ diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c b/src/gallium/drivers/llvmpipe/lp_state_setup.c index aa9147a1a15..3261c53f516 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c @@ -173,8 +173,9 @@ emit_facing_coef( LLVMBuilderRef builder, unsigned slot ) { LLVMValueRef a0_0 = args->facing; + LLVMValueRef a0_0f = LLVMBuildSIToFP(builder, a0_0, LLVMFloatType(), ""); LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0); - LLVMValueRef a0 = vec4f(builder, a0_0, zero, zero, zero, "facing"); + LLVMValueRef a0 = vec4f(builder, a0_0f, zero, zero, zero, "facing"); LLVMValueRef zerovec = vec4f_from_scalar(builder, zero, "zero"); store_coef(builder, args, slot, a0, zerovec, zerovec); @@ -520,7 +521,7 @@ generate_setup_variant(struct llvmpipe_screen *screen, arg_types[4] = LLVMPointerType(vec4f_type, 0); /* a0, aligned */ arg_types[5] = LLVMPointerType(vec4f_type, 0); /* dadx, aligned */ arg_types[6] = LLVMPointerType(vec4f_type, 0); /* dady, aligned */ - arg_types[7] = LLVMPointerType(LLVMVoidType(), 0); /* key, unused */ + arg_types[7] = LLVMPointerType(vec4f_type, 0); /* key, unused */ func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0); From 94f65d095a1365b69a041302b473e40c6ccae3c3 Mon Sep 17 00:00:00 2001 From: Hui Qi Tay Date: Tue, 28 Sep 2010 17:49:25 +0100 Subject: [PATCH 004/540] draw: cliptest and viewport done in a single loop in vertex shader Cliptesting now done at the end of vs in draw_llvm instead of draw_pt_post_vs. Added viewport mapping transformation and further cliptesting to vertex shader in draw_llvm.c Alternative path where vertex header setup, clip coordinates store, cliptesting and viewport mapping are done earlier in the vertex shader. Still need to hook this up properly according to the return value of "draw_llvm_shader" function. --- src/gallium/auxiliary/draw/draw_llvm.c | 316 +++++++++++++++++++++++-- src/gallium/auxiliary/draw/draw_llvm.h | 6 +- 2 files changed, 294 insertions(+), 28 deletions(-) diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c index 5154c1f3c5f..36254d359c4 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.c +++ b/src/gallium/auxiliary/draw/draw_llvm.c @@ -31,6 +31,9 @@ #include "draw_vs.h" #include "gallivm/lp_bld_arit.h" +#include "gallivm/lp_bld_logic.h" +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_swizzle.h" #include "gallivm/lp_bld_struct.h" #include "gallivm/lp_bld_type.h" #include "gallivm/lp_bld_flow.h" @@ -549,19 +552,28 @@ static void store_aos(LLVMBuilderRef builder, LLVMValueRef io_ptr, LLVMValueRef index, - LLVMValueRef value) + LLVMValueRef value, + LLVMValueRef clipmask) { LLVMValueRef id_ptr = draw_jit_header_id(builder, io_ptr); LLVMValueRef data_ptr = draw_jit_header_data(builder, io_ptr); LLVMValueRef indices[3]; + LLVMValueRef val, shift; indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); indices[1] = index; indices[2] = LLVMConstInt(LLVMInt32Type(), 0, 0); - /* undefined vertex */ - LLVMBuildStore(builder, LLVMConstInt(LLVMInt32Type(), - 0xffff, 0), id_ptr); + /* initialize vertex id:16 = 0xffff, pad:3 = 0, edgeflag:1 = 1 */ + val = LLVMConstInt(LLVMInt32Type(), 0xffff1, 0); + shift = LLVMConstInt(LLVMInt32Type(), 12, 0); + val = LLVMBuildShl(builder, val, shift, ""); + /* add clipmask:12 */ + val = LLVMBuildOr(builder, val, clipmask, ""); + + /* store vertex header */ + LLVMBuildStore(builder, val, id_ptr); + #if DEBUG_STORE lp_build_printf(builder, " ---- %p storing attribute %d (io = %p)\n", data_ptr, index, io_ptr); @@ -616,7 +628,8 @@ store_aos_array(LLVMBuilderRef builder, LLVMValueRef io_ptr, LLVMValueRef aos[NUM_CHANNELS], int attrib, - int num_outputs) + int num_outputs, + LLVMValueRef clipmask) { LLVMValueRef attr_index = LLVMConstInt(LLVMInt32Type(), attrib, 0); LLVMValueRef ind0 = LLVMConstInt(LLVMInt32Type(), 0, 0); @@ -624,7 +637,8 @@ store_aos_array(LLVMBuilderRef builder, LLVMValueRef ind2 = LLVMConstInt(LLVMInt32Type(), 2, 0); LLVMValueRef ind3 = LLVMConstInt(LLVMInt32Type(), 3, 0); LLVMValueRef io0_ptr, io1_ptr, io2_ptr, io3_ptr; - + LLVMValueRef clipmask0, clipmask1, clipmask2, clipmask3; + debug_assert(NUM_CHANNELS == 4); io0_ptr = LLVMBuildGEP(builder, io_ptr, @@ -636,21 +650,31 @@ store_aos_array(LLVMBuilderRef builder, io3_ptr = LLVMBuildGEP(builder, io_ptr, &ind3, 1, ""); -#if DEBUG_STORE - lp_build_printf(builder, " io = %p, indexes[%d, %d, %d, %d]\n", - io_ptr, ind0, ind1, ind2, ind3); -#endif + clipmask0 = LLVMBuildExtractElement(builder, clipmask, + ind0, ""); + clipmask1 = LLVMBuildExtractElement(builder, clipmask, + ind1, ""); + clipmask2 = LLVMBuildExtractElement(builder, clipmask, + ind2, ""); + clipmask3 = LLVMBuildExtractElement(builder, clipmask, + ind3, ""); - store_aos(builder, io0_ptr, attr_index, aos[0]); - store_aos(builder, io1_ptr, attr_index, aos[1]); - store_aos(builder, io2_ptr, attr_index, aos[2]); - store_aos(builder, io3_ptr, attr_index, aos[3]); +#if DEBUG_STORE + lp_build_printf(builder, "io = %p, indexes[%d, %d, %d, %d]\n, clipmask0 = %x, clipmask1 = %x, clipmask2 = %x, clipmask3 = %x\n", + io_ptr, ind0, ind1, ind2, ind3, clipmask0, clipmask1, clipmask2, clipmask3); +#endif + /* store for each of the 4 vertices */ + store_aos(builder, io0_ptr, attr_index, aos[0], clipmask0); + store_aos(builder, io1_ptr, attr_index, aos[1], clipmask1); + store_aos(builder, io2_ptr, attr_index, aos[2], clipmask2); + store_aos(builder, io3_ptr, attr_index, aos[3], clipmask3); } static void convert_to_aos(LLVMBuilderRef builder, LLVMValueRef io, LLVMValueRef (*outputs)[NUM_CHANNELS], + LLVMValueRef clipmask, int num_outputs, int max_vertices) { @@ -679,13 +703,214 @@ convert_to_aos(LLVMBuilderRef builder, io, aos, attrib, - num_outputs); + num_outputs, + clipmask); } #if DEBUG_STORE lp_build_printf(builder, " # storing end\n"); #endif } +/* + * Stores original vertex positions in clip coordinates + * There is probably a more efficient way to do this, 4 floats at once + * rather than extracting each element one by one. + */ +static void +store_clip(LLVMBuilderRef builder, + LLVMValueRef io_ptr, + LLVMValueRef (*outputs)[NUM_CHANNELS]) +{ + LLVMValueRef out[4]; + LLVMValueRef indices[2]; + LLVMValueRef io0_ptr, io1_ptr, io2_ptr, io3_ptr; + LLVMValueRef clip_ptr0, clip_ptr1, clip_ptr2, clip_ptr3; + LLVMValueRef clip0_ptr, clip1_ptr, clip2_ptr, clip3_ptr; + LLVMValueRef out0elem, out1elem, out2elem, out3elem; + + LLVMValueRef ind0 = LLVMConstInt(LLVMInt32Type(), 0, 0); + LLVMValueRef ind1 = LLVMConstInt(LLVMInt32Type(), 1, 0); + LLVMValueRef ind2 = LLVMConstInt(LLVMInt32Type(), 2, 0); + LLVMValueRef ind3 = LLVMConstInt(LLVMInt32Type(), 3, 0); + + indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); + indices[1] = LLVMConstInt(LLVMInt32Type(), 0, 0); + + out[0] = LLVMBuildLoad(builder, outputs[0][0], ""); /*x0 y0 z0 w0*/ + out[1] = LLVMBuildLoad(builder, outputs[0][1], ""); /*x1 y1 z1 w1*/ + out[2] = LLVMBuildLoad(builder, outputs[0][2], ""); /*x2 y2 z2 w2*/ + out[3] = LLVMBuildLoad(builder, outputs[0][3], ""); /*x3 y3 z3 w3*/ + + io0_ptr = LLVMBuildGEP(builder, io_ptr, &ind0, 1, ""); + io1_ptr = LLVMBuildGEP(builder, io_ptr, &ind1, 1, ""); + io2_ptr = LLVMBuildGEP(builder, io_ptr, &ind2, 1, ""); + io3_ptr = LLVMBuildGEP(builder, io_ptr, &ind3, 1, ""); + + clip_ptr0 = draw_jit_header_clip(builder, io0_ptr); + clip_ptr1 = draw_jit_header_clip(builder, io1_ptr); + clip_ptr2 = draw_jit_header_clip(builder, io2_ptr); + clip_ptr3 = draw_jit_header_clip(builder, io3_ptr); + + for (int i = 0; i<4; i++){ + clip0_ptr = LLVMBuildGEP(builder, clip_ptr0, + indices, 2, ""); //x1 + clip1_ptr = LLVMBuildGEP(builder, clip_ptr1, + indices, 2, ""); //y1 + clip2_ptr = LLVMBuildGEP(builder, clip_ptr2, + indices, 2, ""); //z1 + clip3_ptr = LLVMBuildGEP(builder, clip_ptr3, + indices, 2, ""); //w1 + + out0elem = LLVMBuildExtractElement(builder, out[0], + indices[1], ""); //x1 + out1elem = LLVMBuildExtractElement(builder, out[1], + indices[1], ""); //y1 + out2elem = LLVMBuildExtractElement(builder, out[2], + indices[1], ""); //z1 + out3elem = LLVMBuildExtractElement(builder, out[3], + indices[1], ""); //w1 + + LLVMBuildStore(builder, out0elem, clip0_ptr); + LLVMBuildStore(builder, out1elem, clip1_ptr); + LLVMBuildStore(builder, out2elem, clip2_ptr); + LLVMBuildStore(builder, out3elem, clip3_ptr); + + indices[1]= LLVMBuildAdd(builder, indices[1], ind1, ""); + } + +} + +/* + * Transforms the outputs for viewport mapping + */ +static void +generate_viewport(struct draw_llvm *llvm, + LLVMBuilderRef builder, + LLVMValueRef (*outputs)[NUM_CHANNELS]) +{ + int i; + const float *scaleA = llvm->draw->viewport.scale; + const float *transA = llvm->draw->viewport.translate; + struct lp_type f32_type = lp_type_float_vec(32); + LLVMValueRef out3 = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 w2 w3*/ + LLVMValueRef const1 = lp_build_const_vec(f32_type, 1.0); /*1.0 1.0 1.0 1.0*/ + + /* for 1/w convention*/ + out3 = LLVMBuildFDiv(builder, const1, out3, ""); + + /* Viewport Mapping */ + for (i=0; i<4; i++){ + LLVMValueRef out = LLVMBuildLoad(builder, outputs[0][i], ""); /*x0 x1 x2 x3*/ + LLVMValueRef scale = lp_build_const_vec(f32_type, scaleA[i]); /*sx sx sx sx*/ + LLVMValueRef trans = lp_build_const_vec(f32_type, transA[i]); /*tx tx tx tx*/ + + /* divide by w */ + out = LLVMBuildMul(builder, out, out3, ""); + /* mult by scale */ + out = LLVMBuildMul(builder, out, scale, ""); + /* add translation */ + out = LLVMBuildAdd(builder, out, trans, ""); + + /* store transformed outputs */ + LLVMBuildStore(builder, out, outputs[0][i]); + } + +} + +/* + * Returns clipmask as 4xi32 bitmask for the 4 vertices + */ +static LLVMValueRef +generate_clipmask(LLVMBuilderRef builder, + LLVMValueRef (*outputs)[NUM_CHANNELS]) +{ + LLVMValueRef mask; /* stores the <4xi32> clipmasks */ + LLVMValueRef test, temp; + LLVMValueRef zero, shift; + LLVMValueRef pos_x, pos_y, pos_z, pos_w; + + struct lp_type f32_type = lp_type_float_vec(32); + + zero = lp_build_const_vec(f32_type, 0); /* 0.0f 0.0f 0.0f 0.0f */ + shift = lp_build_const_int_vec(lp_type_int_vec(32), 1); /* 1 1 1 1 */ + + /* Assuming position stored at output[0] */ + pos_x = LLVMBuildLoad(builder, outputs[0][0], ""); /*x0 x1 x2 x3*/ + pos_y = LLVMBuildLoad(builder, outputs[0][1], ""); /*y0 y1 y2 y3*/ + pos_z = LLVMBuildLoad(builder, outputs[0][2], ""); /*z0 z1 z2 z3*/ + pos_w = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 w2 w3*/ + + /* Cliptest, for hardwired planes */ + /* plane 1 */ + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_x , pos_w); + temp = shift; + test = LLVMBuildAnd(builder, test, temp, ""); + mask = test; + + /* plane 2 */ + test = LLVMBuildFAdd(builder, pos_x, pos_w, ""); + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test); + temp = LLVMBuildShl(builder, temp, shift, ""); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + + /* plane 3 */ + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_y, pos_w); + temp = LLVMBuildShl(builder, temp, shift, ""); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + + /* plane 4 */ + test = LLVMBuildFAdd(builder, pos_y, pos_w, ""); + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test); + temp = LLVMBuildShl(builder, temp, shift, ""); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + + /* plane 5 */ + test = LLVMBuildFAdd(builder, pos_z, pos_w, ""); + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test); + temp = LLVMBuildShl(builder, temp, shift, ""); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + + /* plane 6 */ + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_z, pos_w); + temp = LLVMBuildShl(builder, temp, shift, ""); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + + return mask; + +} + +/* + * Returns boolean if any clipping has occurred + * Used zero/non-zero i32 value to represent boolean + */ +static void +clipmask_bool(LLVMBuilderRef builder, + LLVMValueRef clipmask, + LLVMValueRef ret_ptr) +{ + LLVMValueRef ret = LLVMBuildLoad(builder, ret_ptr, ""); + LLVMValueRef temp; + int i; + + LLVMDumpValue(clipmask); + + for (i=0; i<4; i++){ + temp = LLVMBuildExtractElement(builder, clipmask, + LLVMConstInt(LLVMInt32Type(), i, 0) , ""); + ret = LLVMBuildOr(builder, ret, temp, ""); + LLVMDumpValue(ret); + } + + LLVMBuildStore(builder, ret, ret_ptr); + LLVMDumpValue(ret_ptr); + +} + static void draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) { @@ -706,6 +931,8 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) void *code; struct lp_build_sampler_soa *sampler = 0; + LLVMValueRef ret, ret_ptr; + arg_types[0] = llvm->context_ptr_type; /* context */ arg_types[1] = llvm->vertex_header_ptr_type; /* vertex_header */ arg_types[2] = llvm->buffer_ptr_type; /* vbuffers */ @@ -715,7 +942,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) arg_types[6] = llvm->vb_ptr_type; /* pipe_vertex_buffer's */ arg_types[7] = LLVMInt32Type(); /* instance_id */ - func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0); + func_type = LLVMFunctionType(LLVMInt32Type(), arg_types, Elements(arg_types), 0); variant->function = LLVMAddFunction(llvm->module, "draw_llvm_shader", func_type); LLVMSetFunctionCallConv(variant->function, LLVMCCallConv); @@ -755,6 +982,10 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0); + /* function will return non-zero i32 value if any clipped vertices */ + ret_ptr = lp_build_alloca(builder, LLVMInt32Type(), ""); + LLVMBuildStore(builder, LLVMConstInt(LLVMInt32Type(), 0, 0), ret_ptr); + /* code generated texture sampling */ sampler = draw_llvm_sampler_soa_create( draw_llvm_variant_key_samplers(&variant->key), @@ -769,6 +1000,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS]; LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS] = { { 0 } }; LLVMValueRef io; + LLVMValueRef clipmask; /* holds the clipmask value */ const LLVMValueRef (*ptr_aos)[NUM_CHANNELS]; io_itr = LLVMBuildSub(builder, lp_loop.counter, start, ""); @@ -805,10 +1037,22 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) context_ptr, sampler); - convert_to_aos(builder, io, outputs, + /* store original positions in clip before further manipulation */ + store_clip(builder, io, outputs); + + /* allocate clipmask, assign it integer type */ + clipmask = generate_clipmask(builder, outputs); + clipmask_bool(builder, clipmask, ret_ptr); + + /* do viewport mapping */ + generate_viewport(llvm, builder, outputs); + + /* store clipmask in vertex header and positions in data */ + convert_to_aos(builder, io, outputs, clipmask, draw->vs.vertex_shader->info.num_outputs, max_vertices); } + lp_build_loop_end_cond(builder, end, step, LLVMIntUGE, &lp_loop); sampler->destroy(sampler); @@ -818,8 +1062,9 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0); #endif - LLVMBuildRetVoid(builder); - + ret = LLVMBuildLoad(builder, ret_ptr,""); + LLVMBuildRet(builder, ret); + LLVMDisposeBuilder(builder); /* @@ -869,6 +1114,7 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian LLVMValueRef fetch_max; void *code; struct lp_build_sampler_soa *sampler = 0; + LLVMValueRef ret, ret_ptr; arg_types[0] = llvm->context_ptr_type; /* context */ arg_types[1] = llvm->vertex_header_ptr_type; /* vertex_header */ @@ -879,10 +1125,9 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian arg_types[6] = llvm->vb_ptr_type; /* pipe_vertex_buffer's */ arg_types[7] = LLVMInt32Type(); /* instance_id */ - func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0); + func_type = LLVMFunctionType(LLVMInt32Type(), arg_types, Elements(arg_types), 0); - variant->function_elts = LLVMAddFunction(llvm->module, "draw_llvm_shader_elts", - func_type); + variant->function_elts = LLVMAddFunction(llvm->module, "draw_llvm_shader_elts", func_type); LLVMSetFunctionCallConv(variant->function_elts, LLVMCCallConv); for(i = 0; i < Elements(arg_types); ++i) if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) @@ -928,11 +1173,16 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian LLVMConstInt(LLVMInt32Type(), 1, 0), "fetch_max"); + /* function returns non-zero i32 value if any clipped vertices */ + ret_ptr = lp_build_alloca(builder, LLVMInt32Type(), ""); + LLVMBuildStore(builder, LLVMConstInt(LLVMInt32Type(), 0, 0), ret_ptr); + lp_build_loop_begin(builder, LLVMConstInt(LLVMInt32Type(), 0, 0), &lp_loop); { LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS]; LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS] = { { 0 } }; LLVMValueRef io; + LLVMValueRef clipmask; /* holds the clipmask value */ const LLVMValueRef (*ptr_aos)[NUM_CHANNELS]; io_itr = lp_loop.counter; @@ -979,10 +1229,25 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian context_ptr, sampler); - convert_to_aos(builder, io, outputs, + /* store original positions in clip before further manipulation */ + store_clip(builder, io, outputs); + + /* allocate clipmask, assign it integer type */ + clipmask = generate_clipmask(builder, outputs); + clipmask_bool(builder, clipmask, ret_ptr); + + /* do viewport mapping */ + generate_viewport(llvm, builder, outputs); + + /* store clipmask in vertex header, + * original positions in clip + * and transformed positions in data + */ + convert_to_aos(builder, io, outputs, clipmask, draw->vs.vertex_shader->info.num_outputs, max_vertices); } + lp_build_loop_end_cond(builder, fetch_count, step, LLVMIntUGE, &lp_loop); sampler->destroy(sampler); @@ -992,8 +1257,9 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0); #endif - LLVMBuildRetVoid(builder); - + ret = LLVMBuildLoad(builder, ret_ptr,""); + LLVMBuildRet(builder, ret); + LLVMDisposeBuilder(builder); /* diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h index b881ef6113b..1142ea51cfb 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.h +++ b/src/gallium/auxiliary/draw/draw_llvm.h @@ -120,7 +120,7 @@ struct draw_jit_context lp_build_struct_get_ptr(_builder, _ptr, 0, "id") #define draw_jit_header_clip(_builder, _ptr) \ - lp_build_struct_get(_builder, _ptr, 1, "clip") + lp_build_struct_get_ptr(_builder, _ptr, 1, "clip") #define draw_jit_header_data(_builder, _ptr) \ lp_build_struct_get_ptr(_builder, _ptr, 2, "data") @@ -136,7 +136,7 @@ struct draw_jit_context lp_build_struct_get(_builder, _ptr, 2, "buffer_offset") -typedef void +typedef int (*draw_jit_vert_func)(struct draw_jit_context *context, struct vertex_header *io, const char *vbuffers[PIPE_MAX_ATTRIBS], @@ -147,7 +147,7 @@ typedef void unsigned instance_id); -typedef void +typedef int (*draw_jit_vert_func_elts)(struct draw_jit_context *context, struct vertex_header *io, const char *vbuffers[PIPE_MAX_ATTRIBS], From 3744d1c7d30543520cede8a6c580f26985978953 Mon Sep 17 00:00:00 2001 From: Hui Qi Tay Date: Tue, 28 Sep 2010 17:52:14 +0100 Subject: [PATCH 005/540] draw: added viewport and cliptest flags Corrections in store_clip to store clip coordinates in AoS form. Viewport & cliptest flag options based on variant key. Put back draw_pt_post_vs and now 2 paths based on whether clipping occurs or not. --- src/gallium/auxiliary/draw/draw_llvm.c | 128 ++++++++++++------ src/gallium/auxiliary/draw/draw_llvm.h | 6 +- .../draw/draw_pt_fetch_shade_pipeline_llvm.c | 12 +- 3 files changed, 97 insertions(+), 49 deletions(-) diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c index 36254d359c4..940b2d7ae55 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.c +++ b/src/gallium/auxiliary/draw/draw_llvm.c @@ -736,10 +736,10 @@ store_clip(LLVMBuilderRef builder, indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); indices[1] = LLVMConstInt(LLVMInt32Type(), 0, 0); - out[0] = LLVMBuildLoad(builder, outputs[0][0], ""); /*x0 y0 z0 w0*/ - out[1] = LLVMBuildLoad(builder, outputs[0][1], ""); /*x1 y1 z1 w1*/ - out[2] = LLVMBuildLoad(builder, outputs[0][2], ""); /*x2 y2 z2 w2*/ - out[3] = LLVMBuildLoad(builder, outputs[0][3], ""); /*x3 y3 z3 w3*/ + out[0] = LLVMBuildLoad(builder, outputs[0][0], ""); /*x0 x1 x2 x3*/ + out[1] = LLVMBuildLoad(builder, outputs[0][1], ""); /*y0 y1 y2 y3*/ + out[2] = LLVMBuildLoad(builder, outputs[0][2], ""); /*z0 z1 z2 z3*/ + out[3] = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 w2 w3*/ io0_ptr = LLVMBuildGEP(builder, io_ptr, &ind0, 1, ""); io1_ptr = LLVMBuildGEP(builder, io_ptr, &ind1, 1, ""); @@ -753,22 +753,22 @@ store_clip(LLVMBuilderRef builder, for (int i = 0; i<4; i++){ clip0_ptr = LLVMBuildGEP(builder, clip_ptr0, - indices, 2, ""); //x1 + indices, 2, ""); //x0 clip1_ptr = LLVMBuildGEP(builder, clip_ptr1, - indices, 2, ""); //y1 + indices, 2, ""); //x1 clip2_ptr = LLVMBuildGEP(builder, clip_ptr2, - indices, 2, ""); //z1 + indices, 2, ""); //x2 clip3_ptr = LLVMBuildGEP(builder, clip_ptr3, - indices, 2, ""); //w1 + indices, 2, ""); //x3 - out0elem = LLVMBuildExtractElement(builder, out[0], - indices[1], ""); //x1 - out1elem = LLVMBuildExtractElement(builder, out[1], - indices[1], ""); //y1 - out2elem = LLVMBuildExtractElement(builder, out[2], - indices[1], ""); //z1 - out3elem = LLVMBuildExtractElement(builder, out[3], - indices[1], ""); //w1 + out0elem = LLVMBuildExtractElement(builder, out[i], + ind0, ""); //x0 + out1elem = LLVMBuildExtractElement(builder, out[i], + ind1, ""); //x1 + out2elem = LLVMBuildExtractElement(builder, out[i], + ind2, ""); //x2 + out3elem = LLVMBuildExtractElement(builder, out[i], + ind3, ""); //x3 LLVMBuildStore(builder, out0elem, clip0_ptr); LLVMBuildStore(builder, out1elem, clip1_ptr); @@ -822,7 +822,9 @@ generate_viewport(struct draw_llvm *llvm, */ static LLVMValueRef generate_clipmask(LLVMBuilderRef builder, - LLVMValueRef (*outputs)[NUM_CHANNELS]) + LLVMValueRef (*outputs)[NUM_CHANNELS], + boolean disable_zclipping, + boolean enable_d3dclipping) { LLVMValueRef mask; /* stores the <4xi32> clipmasks */ LLVMValueRef test, temp; @@ -866,22 +868,31 @@ generate_clipmask(LLVMBuilderRef builder, temp = LLVMBuildShl(builder, temp, shift, ""); test = LLVMBuildAnd(builder, test, temp, ""); mask = LLVMBuildOr(builder, mask, test, ""); - - /* plane 5 */ - test = LLVMBuildFAdd(builder, pos_z, pos_w, ""); - test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test); - temp = LLVMBuildShl(builder, temp, shift, ""); - test = LLVMBuildAnd(builder, test, temp, ""); - mask = LLVMBuildOr(builder, mask, test, ""); - - /* plane 6 */ - test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_z, pos_w); - temp = LLVMBuildShl(builder, temp, shift, ""); - test = LLVMBuildAnd(builder, test, temp, ""); - mask = LLVMBuildOr(builder, mask, test, ""); - + + if (!disable_zclipping){ + if (enable_d3dclipping){ + /* plane 5 */ + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, pos_z); + temp = LLVMBuildShl(builder, temp, shift, ""); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + } + else{ + /* plane 5 */ + test = LLVMBuildFAdd(builder, pos_z, pos_w, ""); + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test); + temp = LLVMBuildShl(builder, temp, shift, ""); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + } + /* plane 6 */ + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_z, pos_w); + temp = LLVMBuildShl(builder, temp, shift, ""); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + } + return mask; - } /* @@ -930,8 +941,9 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS]; void *code; struct lp_build_sampler_soa *sampler = 0; - LLVMValueRef ret, ret_ptr; + boolean disable_cliptest = variant->key.disable_cliptest; + boolean disable_viewport = variant->key.disable_viewport; arg_types[0] = llvm->context_ptr_type; /* context */ arg_types[1] = llvm->vertex_header_ptr_type; /* vertex_header */ @@ -1040,13 +1052,23 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) /* store original positions in clip before further manipulation */ store_clip(builder, io, outputs); - /* allocate clipmask, assign it integer type */ - clipmask = generate_clipmask(builder, outputs); - clipmask_bool(builder, clipmask, ret_ptr); + /* do cliptest */ + if (!disable_cliptest){ + /* allocate clipmask, assign it integer type */ + clipmask = generate_clipmask(builder, outputs, + variant->key.disable_zclipping, variant->key.enable_d3dclipping); + /* return clipping boolean value for function */ + clipmask_bool(builder, clipmask, ret_ptr); + } + else{ + clipmask = lp_build_const_int_vec(lp_type_int_vec(32), 0); + } /* do viewport mapping */ - generate_viewport(llvm, builder, outputs); - + if (!disable_viewport){ + generate_viewport(llvm, builder, outputs); + } + /* store clipmask in vertex header and positions in data */ convert_to_aos(builder, io, outputs, clipmask, draw->vs.vertex_shader->info.num_outputs, @@ -1115,6 +1137,8 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian void *code; struct lp_build_sampler_soa *sampler = 0; LLVMValueRef ret, ret_ptr; + boolean disable_cliptest = variant->key.disable_cliptest; + boolean disable_viewport = variant->key.disable_viewport; arg_types[0] = llvm->context_ptr_type; /* context */ arg_types[1] = llvm->vertex_header_ptr_type; /* vertex_header */ @@ -1232,13 +1256,23 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian /* store original positions in clip before further manipulation */ store_clip(builder, io, outputs); - /* allocate clipmask, assign it integer type */ - clipmask = generate_clipmask(builder, outputs); - clipmask_bool(builder, clipmask, ret_ptr); - - /* do viewport mapping */ - generate_viewport(llvm, builder, outputs); + /* do cliptest */ + if (!disable_cliptest){ + /* allocate clipmask, assign it integer type */ + clipmask = generate_clipmask(builder, outputs, + variant->key.disable_zclipping, variant->key.enable_d3dclipping); + /* return clipping boolean value for function */ + clipmask_bool(builder, clipmask, ret_ptr); + } + else{ + clipmask = lp_build_const_int_vec(lp_type_int_vec(32), 0); + } + /* do viewport mapping */ + if (!disable_viewport){ + generate_viewport(llvm, builder, outputs); + } + /* store clipmask in vertex header, * original positions in clip * and transformed positions in data @@ -1303,6 +1337,12 @@ draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store) */ key->nr_vertex_elements = llvm->draw->pt.nr_vertex_elements; + /* will have to rig this up properly later */ + key->disable_cliptest = 0; + key->disable_viewport = 0; + key->disable_zclipping = 0; + key->enable_d3dclipping = 0; + /* All variants of this shader will have the same value for * nr_samplers. Not yet trying to compact away holes in the * sampler array. diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h index 1142ea51cfb..8ff366956ce 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.h +++ b/src/gallium/auxiliary/draw/draw_llvm.h @@ -160,7 +160,11 @@ typedef int struct draw_llvm_variant_key { unsigned nr_vertex_elements:16; - unsigned nr_samplers:16; + unsigned nr_samplers:12; + unsigned disable_cliptest:1; + unsigned disable_viewport:1; + unsigned disable_zclipping:1; + unsigned enable_d3dclipping:1; /* Variable number of vertex elements: */ diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c index 77291e304e1..dc138b980da 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c @@ -217,6 +217,7 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle, struct draw_vertex_info gs_vert_info; struct draw_vertex_info *vert_info; unsigned opt = fpme->opt; + unsigned clipped = 0; llvm_vert_info.count = fetch_info->count; llvm_vert_info.vertex_size = fpme->vertex_size; @@ -230,7 +231,7 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle, } if (fetch_info->linear) - fpme->current_variant->jit_func( &fpme->llvm->jit_context, + clipped = fpme->current_variant->jit_func( &fpme->llvm->jit_context, llvm_vert_info.verts, (const char **)draw->pt.user.vbuffer, fetch_info->start, @@ -239,7 +240,7 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle, draw->pt.vertex_buffer, draw->instance_id); else - fpme->current_variant->jit_func_elts( &fpme->llvm->jit_context, + clipped = fpme->current_variant->jit_func_elts( &fpme->llvm->jit_context, llvm_vert_info.verts, (const char **)draw->pt.user.vbuffer, fetch_info->elts, @@ -266,6 +267,9 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle, FREE(vert_info->verts); vert_info = &gs_vert_info; prim_info = &gs_prim_info; + + clipped = draw_pt_post_vs_run( fpme->post_vs, vert_info ); + } /* stream output needs to be done before clipping */ @@ -273,11 +277,11 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle, vert_info, prim_info ); - if (draw_pt_post_vs_run( fpme->post_vs, vert_info )) { + if (clipped) { opt |= PT_PIPELINE; } - /* Do we need to run the pipeline? + /* Do we need to run the pipeline? Now will come here if clipped */ if (opt & PT_PIPELINE) { pipeline( fpme, From 25bb05fef075a87ec6e5f2a989049faff2afedd2 Mon Sep 17 00:00:00 2001 From: delphi Date: Mon, 4 Oct 2010 17:08:33 +0100 Subject: [PATCH 006/540] draw: added userclip planes and updated variant_key --- src/gallium/auxiliary/draw/draw_llvm.c | 134 +++++++++++++++++-------- src/gallium/auxiliary/draw/draw_llvm.h | 8 +- 2 files changed, 97 insertions(+), 45 deletions(-) diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c index 940b2d7ae55..9c17e7763ae 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.c +++ b/src/gallium/auxiliary/draw/draw_llvm.c @@ -823,13 +823,21 @@ generate_viewport(struct draw_llvm *llvm, static LLVMValueRef generate_clipmask(LLVMBuilderRef builder, LLVMValueRef (*outputs)[NUM_CHANNELS], - boolean disable_zclipping, - boolean enable_d3dclipping) + boolean clip_xy, + boolean clip_z, + boolean clip_user, + boolean enable_d3dclipping, + struct draw_llvm *llvm) { LLVMValueRef mask; /* stores the <4xi32> clipmasks */ LLVMValueRef test, temp; LLVMValueRef zero, shift; LLVMValueRef pos_x, pos_y, pos_z, pos_w; + LLVMValueRef planes, sum; + + unsigned nr; + unsigned i; + float (*plane)[4]; struct lp_type f32_type = lp_type_float_vec(32); @@ -843,33 +851,35 @@ generate_clipmask(LLVMBuilderRef builder, pos_w = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 w2 w3*/ /* Cliptest, for hardwired planes */ - /* plane 1 */ - test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_x , pos_w); - temp = shift; - test = LLVMBuildAnd(builder, test, temp, ""); - mask = test; + if (clip_xy){ + /* plane 1 */ + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_x , pos_w); + temp = shift; + test = LLVMBuildAnd(builder, test, temp, ""); + mask = test; - /* plane 2 */ - test = LLVMBuildFAdd(builder, pos_x, pos_w, ""); - test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test); - temp = LLVMBuildShl(builder, temp, shift, ""); - test = LLVMBuildAnd(builder, test, temp, ""); - mask = LLVMBuildOr(builder, mask, test, ""); + /* plane 2 */ + test = LLVMBuildFAdd(builder, pos_x, pos_w, ""); + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test); + temp = LLVMBuildShl(builder, temp, shift, ""); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); - /* plane 3 */ - test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_y, pos_w); - temp = LLVMBuildShl(builder, temp, shift, ""); - test = LLVMBuildAnd(builder, test, temp, ""); - mask = LLVMBuildOr(builder, mask, test, ""); + /* plane 3 */ + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_y, pos_w); + temp = LLVMBuildShl(builder, temp, shift, ""); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); - /* plane 4 */ - test = LLVMBuildFAdd(builder, pos_y, pos_w, ""); - test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test); - temp = LLVMBuildShl(builder, temp, shift, ""); - test = LLVMBuildAnd(builder, test, temp, ""); - mask = LLVMBuildOr(builder, mask, test, ""); + /* plane 4 */ + test = LLVMBuildFAdd(builder, pos_y, pos_w, ""); + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test); + temp = LLVMBuildShl(builder, temp, shift, ""); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + } - if (!disable_zclipping){ + if (clip_z){ if (enable_d3dclipping){ /* plane 5 */ test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, pos_z); @@ -892,6 +902,32 @@ generate_clipmask(LLVMBuilderRef builder, mask = LLVMBuildOr(builder, mask, test, ""); } + if (clip_user){ + /* userclip planes */ + nr = llvm->draw->nr_planes; + plane = llvm->draw->plane; + for (i = 6; i < nr; i++) { + planes = lp_build_const_vec(f32_type, plane[i][0]); + sum = LLVMBuildMul(builder, planes, pos_x, ""); + + planes = lp_build_const_vec(f32_type, plane[i][1]); + test = LLVMBuildMul(builder, planes, pos_y, ""); + sum = LLVMBuildFAdd(builder, sum, test, ""); + + planes = lp_build_const_vec(f32_type, plane[i][2]); + test = LLVMBuildMul(builder, planes, pos_z, ""); + sum = LLVMBuildFAdd(builder, sum, test, ""); + + planes = lp_build_const_vec(f32_type, plane[i][3]); + test = LLVMBuildMul(builder, planes, pos_w, ""); + sum = LLVMBuildFAdd(builder, sum, test, ""); + + test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, sum); + temp = LLVMBuildShl(builder, temp, shift, ""); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + } + } return mask; } @@ -942,8 +978,10 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) void *code; struct lp_build_sampler_soa *sampler = 0; LLVMValueRef ret, ret_ptr; - boolean disable_cliptest = variant->key.disable_cliptest; - boolean disable_viewport = variant->key.disable_viewport; + boolean bypass_viewport = variant->key.bypass_viewport; + boolean enable_cliptest = variant->key.clip_xy || + variant->key.clip_z || + variant->key.clip_user; arg_types[0] = llvm->context_ptr_type; /* context */ arg_types[1] = llvm->vertex_header_ptr_type; /* vertex_header */ @@ -1053,10 +1091,14 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) store_clip(builder, io, outputs); /* do cliptest */ - if (!disable_cliptest){ + if (enable_cliptest){ /* allocate clipmask, assign it integer type */ - clipmask = generate_clipmask(builder, outputs, - variant->key.disable_zclipping, variant->key.enable_d3dclipping); + clipmask = generate_clipmask(builder, outputs, + variant->key.clip_xy, + variant->key.clip_z, + variant->key.clip_user, + variant->key.enable_d3dclipping, + llvm); /* return clipping boolean value for function */ clipmask_bool(builder, clipmask, ret_ptr); } @@ -1065,7 +1107,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) } /* do viewport mapping */ - if (!disable_viewport){ + if (!bypass_viewport){ generate_viewport(llvm, builder, outputs); } @@ -1137,9 +1179,11 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian void *code; struct lp_build_sampler_soa *sampler = 0; LLVMValueRef ret, ret_ptr; - boolean disable_cliptest = variant->key.disable_cliptest; - boolean disable_viewport = variant->key.disable_viewport; - + boolean bypass_viewport = variant->key.bypass_viewport; + boolean enable_cliptest = variant->key.clip_xy || + variant->key.clip_z || + variant->key.clip_user; + arg_types[0] = llvm->context_ptr_type; /* context */ arg_types[1] = llvm->vertex_header_ptr_type; /* vertex_header */ arg_types[2] = llvm->buffer_ptr_type; /* vbuffers */ @@ -1257,10 +1301,14 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian store_clip(builder, io, outputs); /* do cliptest */ - if (!disable_cliptest){ + if (enable_cliptest){ /* allocate clipmask, assign it integer type */ - clipmask = generate_clipmask(builder, outputs, - variant->key.disable_zclipping, variant->key.enable_d3dclipping); + clipmask = generate_clipmask(builder, outputs, + variant->key.clip_xy, + variant->key.clip_z, + variant->key.clip_user, + variant->key.enable_d3dclipping, + llvm); /* return clipping boolean value for function */ clipmask_bool(builder, clipmask, ret_ptr); } @@ -1269,7 +1317,7 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian } /* do viewport mapping */ - if (!disable_viewport){ + if (!bypass_viewport){ generate_viewport(llvm, builder, outputs); } @@ -1338,10 +1386,12 @@ draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store) key->nr_vertex_elements = llvm->draw->pt.nr_vertex_elements; /* will have to rig this up properly later */ - key->disable_cliptest = 0; - key->disable_viewport = 0; - key->disable_zclipping = 0; - key->enable_d3dclipping = 0; + key->clip_xy = llvm->draw->clip_xy; + key->clip_z = llvm->draw->clip_z; + key->clip_user = llvm->draw->clip_user; + key->bypass_viewport = llvm->draw->identity_viewport; + key->enable_d3dclipping = (boolean)!llvm->draw->rasterizer->gl_rasterization_rules; + key->need_edgeflags = (llvm->draw->vs.edgeflag_output ? TRUE : FALSE); /* All variants of this shader will have the same value for * nr_samplers. Not yet trying to compact away holes in the diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h index 8ff366956ce..4a4d93f33a8 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.h +++ b/src/gallium/auxiliary/draw/draw_llvm.h @@ -161,10 +161,12 @@ struct draw_llvm_variant_key { unsigned nr_vertex_elements:16; unsigned nr_samplers:12; - unsigned disable_cliptest:1; - unsigned disable_viewport:1; - unsigned disable_zclipping:1; + unsigned clip_xy:1; + unsigned clip_z:1; + unsigned clip_user:1; + unsigned bypass_viewport:1; unsigned enable_d3dclipping:1; + unsigned need_edgeflags:1; /* Variable number of vertex elements: */ From 3d6eec0a87ee5549e817cdabb4b6424960678189 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Tue, 5 Oct 2010 14:32:59 -0600 Subject: [PATCH 007/540] st/mesa: replace assertion w/ conditional in framebuffer invalidation https://bugs.freedesktop.org/show_bug.cgi?id=30632 NOTE: this is a candidate for the 7.9 branch. --- src/mesa/state_tracker/st_manager.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c index 66e32b4c9e8..cd418a03661 100644 --- a/src/mesa/state_tracker/st_manager.c +++ b/src/mesa/state_tracker/st_manager.c @@ -486,9 +486,18 @@ st_context_notify_invalid_framebuffer(struct st_context_iface *stctxi, stfb = st_ws_framebuffer(st->ctx->WinSysDrawBuffer); if (!stfb || stfb->iface != stfbi) stfb = st_ws_framebuffer(st->ctx->WinSysReadBuffer); - assert(stfb && stfb->iface == stfbi); - p_atomic_set(&stfb->revalidate, TRUE); + if (stfb && stfb->iface == stfbi) { + p_atomic_set(&stfb->revalidate, TRUE); + } + else { + /* This function is probably getting called when we've detected a + * change in a window's size but the currently bound context is + * not bound to that window. + * If the st_framebuffer_iface structure had a pointer to the + * corresponding st_framebuffer we'd be able to handle this. + */ + } } static void From ea5a74fb5892c9b6ca62054be2ee83a743103f4c Mon Sep 17 00:00:00 2001 From: Jerome Glisse Date: Tue, 5 Oct 2010 16:14:11 -0400 Subject: [PATCH 008/540] r600g: userspace fence to avoid kernel call for testing bo busy status Signed-off-by: Jerome Glisse --- src/gallium/drivers/r600/r600.h | 4 + .../winsys/r600/drm/evergreen_hw_context.c | 7 ++ src/gallium/winsys/r600/drm/r600_hw_context.c | 73 ++++++++++++++++++- src/gallium/winsys/r600/drm/r600_priv.h | 5 +- src/gallium/winsys/r600/drm/r600d.h | 1 + src/gallium/winsys/r600/drm/radeon_bo.c | 60 ++++----------- 6 files changed, 103 insertions(+), 47 deletions(-) diff --git a/src/gallium/drivers/r600/r600.h b/src/gallium/drivers/r600/r600.h index 630177d6add..24e25cec0db 100644 --- a/src/gallium/drivers/r600/r600.h +++ b/src/gallium/drivers/r600/r600.h @@ -233,6 +233,10 @@ struct r600_context { u32 *pm4; struct list_head query_list; unsigned num_query_running; + unsigned fence; + struct list_head fenced_bo; + unsigned *cfence; + struct r600_bo *fence_bo; }; struct r600_draw { diff --git a/src/gallium/winsys/r600/drm/evergreen_hw_context.c b/src/gallium/winsys/r600/drm/evergreen_hw_context.c index 1355b079450..2093a2d09c1 100644 --- a/src/gallium/winsys/r600/drm/evergreen_hw_context.c +++ b/src/gallium/winsys/r600/drm/evergreen_hw_context.c @@ -613,6 +613,13 @@ int evergreen_context_init(struct r600_context *ctx, struct radeon *radeon) r = -ENOMEM; goto out_err; } + /* save 16dwords space for fence mecanism */ + ctx->pm4_ndwords -= 16; + + r = r600_context_init_fence(ctx); + if (r) { + goto out_err; + } /* init dirty list */ LIST_INITHEAD(&ctx->dirty); diff --git a/src/gallium/winsys/r600/drm/r600_hw_context.c b/src/gallium/winsys/r600/drm/r600_hw_context.c index b379499f060..7d81d734571 100644 --- a/src/gallium/winsys/r600/drm/r600_hw_context.c +++ b/src/gallium/winsys/r600/drm/r600_hw_context.c @@ -41,6 +41,44 @@ #define GROUP_FORCE_NEW_BLOCK 0 +int r600_context_init_fence(struct r600_context *ctx) +{ + ctx->fence = 1; + ctx->fence_bo = r600_bo(ctx->radeon, 4096, 0, 0); + if (ctx->fence_bo == NULL) { + return -ENOMEM; + } + ctx->cfence = r600_bo_map(ctx->radeon, ctx->fence_bo, PB_USAGE_UNSYNCHRONIZED, NULL); + *ctx->cfence = 0; + LIST_INITHEAD(&ctx->fenced_bo); + return 0; +} + +static void INLINE r600_context_update_fenced_list(struct r600_context *ctx) +{ + for (int i = 0; i < ctx->creloc; i++) { + if (!LIST_IS_EMPTY(&ctx->bo[i]->fencedlist)) + LIST_DELINIT(&ctx->bo[i]->fencedlist); + LIST_ADDTAIL(&ctx->bo[i]->fencedlist, &ctx->fenced_bo); + ctx->bo[i]->fence = ctx->fence; + ctx->bo[i]->ctx = ctx; + } +} + +static void INLINE r600_context_fence_wraparound(struct r600_context *ctx, unsigned fence) +{ + struct radeon_bo *bo, *tmp; + + LIST_FOR_EACH_ENTRY_SAFE(bo, tmp, &ctx->fenced_bo, fencedlist) { + if (bo->fence <= *ctx->cfence) { + LIST_DELINIT(&bo->fencedlist); + bo->fence = 0; + } else { + bo->fence = fence; + } + } +} + int r600_context_add_block(struct r600_context *ctx, const struct r600_reg *reg, unsigned nreg) { struct r600_block *block; @@ -572,6 +610,9 @@ void r600_context_fini(struct r600_context *ctx) } free(ctx->reloc); free(ctx->pm4); + if (ctx->fence_bo) { + r600_bo_reference(ctx->radeon, &ctx->fence_bo, NULL); + } memset(ctx, 0, sizeof(struct r600_context)); } @@ -691,6 +732,13 @@ int r600_context_init(struct r600_context *ctx, struct radeon *radeon) r = -ENOMEM; goto out_err; } + /* save 16dwords space for fence mecanism */ + ctx->pm4_ndwords -= 16; + + r = r600_context_init_fence(ctx); + if (r) { + goto out_err; + } /* init dirty list */ LIST_INITHEAD(&ctx->dirty); @@ -1019,6 +1067,7 @@ void r600_context_flush(struct r600_context *ctx) struct drm_radeon_cs drmib; struct drm_radeon_cs_chunk chunks[2]; uint64_t chunk_array[2]; + unsigned fence; int r; if (!ctx->pm4_cdwords) @@ -1028,6 +1077,18 @@ void r600_context_flush(struct r600_context *ctx) r600_context_queries_suspend(ctx); radeon_bo_pbmgr_flush_maps(ctx->radeon->kman); + + /* emit fence */ + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE_EOP, 4); + ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT | (5 << 8); + ctx->pm4[ctx->pm4_cdwords++] = 0; + ctx->pm4[ctx->pm4_cdwords++] = (1 << 29) | (0 << 24); + ctx->pm4[ctx->pm4_cdwords++] = ctx->fence; + ctx->pm4[ctx->pm4_cdwords++] = 0; + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0); + ctx->pm4[ctx->pm4_cdwords++] = 0; + r600_context_bo_reloc(ctx, &ctx->pm4[ctx->pm4_cdwords - 1], ctx->fence_bo); + #if 1 /* emit cs */ drmib.num_chunks = 2; @@ -1043,8 +1104,18 @@ void r600_context_flush(struct r600_context *ctx) r = drmCommandWriteRead(ctx->radeon->fd, DRM_RADEON_CS, &drmib, sizeof(struct drm_radeon_cs)); #endif + + r600_context_update_fenced_list(ctx); + + fence = ctx->fence + 1; + if (fence < ctx->fence) { + /* wrap around */ + fence = 1; + r600_context_fence_wraparound(ctx, fence); + } + ctx->fence = fence; + /* restart */ - radeon_bo_fencelist(ctx->radeon, ctx->bo, ctx->creloc); for (int i = 0; i < ctx->creloc; i++) { ctx->bo[i]->reloc = NULL; ctx->bo[i]->last_flush = 0; diff --git a/src/gallium/winsys/r600/drm/r600_priv.h b/src/gallium/winsys/r600/drm/r600_priv.h index ea2cf347785..a693a5b5ab8 100644 --- a/src/gallium/winsys/r600/drm/r600_priv.h +++ b/src/gallium/winsys/r600/drm/r600_priv.h @@ -64,9 +64,9 @@ struct radeon_bo { unsigned map_count; void *data; struct list_head fencedlist; + unsigned fence; + struct r600_context *ctx; boolean shared; - int64_t last_busy; - boolean set_busy; struct r600_reloc *reloc; unsigned reloc_id; unsigned last_flush; @@ -103,6 +103,7 @@ struct pb_buffer *radeon_bo_pb_create_buffer_from_handle(struct pb_manager *_mgr uint32_t handle); /* r600_hw_context.c */ +int r600_context_init_fence(struct r600_context *ctx); void r600_context_bo_reloc(struct r600_context *ctx, u32 *pm4, struct r600_bo *rbo); void r600_context_bo_flush(struct r600_context *ctx, unsigned flush_flags, unsigned flush_mask, struct r600_bo *rbo); diff --git a/src/gallium/winsys/r600/drm/r600d.h b/src/gallium/winsys/r600/drm/r600d.h index ccc9ffaf8e3..d91f7737af3 100644 --- a/src/gallium/winsys/r600/drm/r600d.h +++ b/src/gallium/winsys/r600/drm/r600d.h @@ -91,6 +91,7 @@ #define PKT3_SET_CTL_CONST 0x6F #define PKT3_SURFACE_BASE_UPDATE 0x73 +#define EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT 0x14 #define EVENT_TYPE_ZPASS_DONE 0x15 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT 0x16 diff --git a/src/gallium/winsys/r600/drm/radeon_bo.c b/src/gallium/winsys/r600/drm/radeon_bo.c index 42a23f0ab8f..836d5d77e09 100644 --- a/src/gallium/winsys/r600/drm/radeon_bo.c +++ b/src/gallium/winsys/r600/drm/radeon_bo.c @@ -128,7 +128,6 @@ struct radeon_bo *radeon_bo(struct radeon *radeon, unsigned handle, return bo; } - static void radeon_bo_destroy(struct radeon *radeon, struct radeon_bo *bo) { struct drm_gem_close args; @@ -158,9 +157,15 @@ int radeon_bo_wait(struct radeon *radeon, struct radeon_bo *bo) struct drm_radeon_gem_wait_idle args; int ret; - if (LIST_IS_EMPTY(&bo->fencedlist) && !bo->shared) + if (!bo->fence && !bo->shared) return 0; + if (bo->fence <= *bo->ctx->cfence) { + LIST_DELINIT(&bo->fencedlist); + bo->fence = 0; + return 0; + } + /* Zero out args to make valgrind happy */ memset(&args, 0, sizeof(args)); args.handle = bo->handle; @@ -171,22 +176,20 @@ int radeon_bo_wait(struct radeon *radeon, struct radeon_bo *bo) return ret; } -#define BO_BUSY_BACKOFF 10000 - int radeon_bo_busy(struct radeon *radeon, struct radeon_bo *bo, uint32_t *domain) { struct drm_radeon_gem_busy args; int ret; - int64_t now; - now = os_time_get(); - if (LIST_IS_EMPTY(&bo->fencedlist) && !bo->shared) - return 0; - - if (bo->set_busy && (now - bo->last_busy < BO_BUSY_BACKOFF)) - return -EBUSY; - - bo->set_busy = FALSE; + if (!bo->shared) { + if (!bo->fence) + return 0; + if (bo->fence <= *bo->ctx->cfence) { + LIST_DELINIT(&bo->fencedlist); + bo->fence = 0; + return 0; + } + } memset(&args, 0, sizeof(args)); args.handle = bo->handle; @@ -195,37 +198,6 @@ int radeon_bo_busy(struct radeon *radeon, struct radeon_bo *bo, uint32_t *domain ret = drmCommandWriteRead(radeon->fd, DRM_RADEON_GEM_BUSY, &args, sizeof(args)); - if (ret == 0) { - struct radeon_bo *entry, *tent; - LIST_FOR_EACH_ENTRY_SAFE(entry, tent, &bo->fencedlist, fencedlist) { - LIST_DELINIT(&entry->fencedlist); - } - } else { - bo->set_busy = TRUE; - bo->last_busy = now; - } *domain = args.domain; return ret; } - -int radeon_bo_fencelist(struct radeon *radeon, struct radeon_bo **bolist, - uint32_t num_bo) -{ - struct radeon_bo *first; - int i; - - first = bolist[0]; - - if (!LIST_IS_EMPTY(&first->fencedlist)) - LIST_DELINIT(&first->fencedlist); - - LIST_INITHEAD(&first->fencedlist); - - for (i = 1; i < num_bo; i++) { - if (!LIST_IS_EMPTY(&bolist[i]->fencedlist)) - LIST_DELINIT(&bolist[i]->fencedlist); - - LIST_ADDTAIL(&bolist[i]->fencedlist, &first->fencedlist); - } - return 0; -} From 9528fc2107d4cae39bc932c1943ffc57ebc92499 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Wed, 6 Oct 2010 09:21:16 +1000 Subject: [PATCH 009/540] r600g: add evergreen stencil support. this sets the stencil up for evergreen properly. --- src/gallium/drivers/r600/eg_state_inlines.h | 8 ++++++++ src/gallium/drivers/r600/evergreen_state.c | 18 ++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/r600/eg_state_inlines.h b/src/gallium/drivers/r600/eg_state_inlines.h index 81094904485..d7a880f5d3a 100644 --- a/src/gallium/drivers/r600/eg_state_inlines.h +++ b/src/gallium/drivers/r600/eg_state_inlines.h @@ -276,6 +276,14 @@ static inline uint32_t r600_translate_dbformat(enum pipe_format format) } } +static inline uint32_t r600_translate_stencilformat(enum pipe_format format) +{ + if (format == PIPE_FORMAT_Z24_UNORM_S8_USCALED) + return 1; + else + return 0; +} + static inline uint32_t r600_translate_colorswap(enum pipe_format format) { switch (format) { diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 147d4f372e6..603a8a7b915 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -795,7 +795,7 @@ static void evergreen_db(struct r600_pipe_context *rctx, struct r600_pipe_state struct r600_resource_texture *rtex; struct r600_resource *rbuffer; unsigned level; - unsigned pitch, slice, format; + unsigned pitch, slice, format, stencil_format; if (state->zsbuf == NULL) return; @@ -811,13 +811,27 @@ static void evergreen_db(struct r600_pipe_context *rctx, struct r600_pipe_state pitch = (rtex->pitch[level] / rtex->bpt) / 8 - 1; slice = (rtex->pitch[level] / rtex->bpt) * state->zsbuf->height / 64 - 1; format = r600_translate_dbformat(state->zsbuf->texture->format); + stencil_format = r600_translate_stencilformat(state->zsbuf->texture->format); r600_pipe_state_add_reg(rstate, R_028048_DB_Z_READ_BASE, (state->zsbuf->offset + r600_bo_offset(rbuffer->bo)) >> 8, 0xFFFFFFFF, rbuffer->bo); r600_pipe_state_add_reg(rstate, R_028050_DB_Z_WRITE_BASE, (state->zsbuf->offset + r600_bo_offset(rbuffer->bo)) >> 8, 0xFFFFFFFF, rbuffer->bo); -// r600_pipe_state_add_reg(rstate, R_028014_DB_HTILE_DATA_BASE, state->zsbuf->offset >> 8, 0xFFFFFFFF, rbuffer->bo); + + if (stencil_format) { + uint32_t stencil_offset; + + stencil_offset = ((state->zsbuf->height * rtex->pitch[level]) + 255) & ~255; + r600_pipe_state_add_reg(rstate, R_02804C_DB_STENCIL_READ_BASE, + (state->zsbuf->offset + stencil_offset + r600_bo_offset(rbuffer->bo)) >> 8, 0xFFFFFFFF, rbuffer->bo); + r600_pipe_state_add_reg(rstate, R_028054_DB_STENCIL_WRITE_BASE, + (state->zsbuf->offset + stencil_offset + r600_bo_offset(rbuffer->bo)) >> 8, 0xFFFFFFFF, rbuffer->bo); + } + r600_pipe_state_add_reg(rstate, R_028008_DB_DEPTH_VIEW, 0x00000000, 0xFFFFFFFF, NULL); + r600_pipe_state_add_reg(rstate, R_028044_DB_STENCIL_INFO, + S_028044_FORMAT(stencil_format), 0xFFFFFFFF, rbuffer->bo); + r600_pipe_state_add_reg(rstate, R_028040_DB_Z_INFO, S_028040_ARRAY_MODE(rtex->array_mode) | S_028040_FORMAT(format), 0xFFFFFFFF, rbuffer->bo); From 5661e51c01cec8d8f4a1400300ced8f8f07eaff3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Fonseca?= Date: Mon, 4 Oct 2010 17:05:03 +0100 Subject: [PATCH 010/540] retrace: Handle clear_render_target and clear_depth_stencil. --- src/gallium/tests/python/retrace/interpreter.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/gallium/tests/python/retrace/interpreter.py b/src/gallium/tests/python/retrace/interpreter.py index 16132abb06d..954a701a53f 100755 --- a/src/gallium/tests/python/retrace/interpreter.py +++ b/src/gallium/tests/python/retrace/interpreter.py @@ -610,6 +610,15 @@ class Context(Object): _rgba[i] = rgba[i] self.real.clear(buffers, _rgba, depth, stencil) + def clear_render_target(self, dst, rgba, dstx, dsty, width, height): + _rgba = gallium.FloatArray(4) + for i in range(4): + _rgba[i] = rgba[i] + self.real.clear_render_target(dst, _rgba, dstx, dsty, width, height) + + def clear_depth_stencil(self, dst, clear_flags, depth, stencil, dstx, dsty, width, height): + self.real.clear_depth_stencil(dst, clear_flags, depth, stencil, dstx, dsty, width, height) + def _present(self): self.real.flush() From 591e1bc34f5a5dd065614deae41b59682f59ac08 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Sun, 3 Oct 2010 11:39:02 +0100 Subject: [PATCH 011/540] llvmpipe: make debug_fs_variant respect variant->nr_samplers --- src/gallium/drivers/llvmpipe/lp_state_fs.c | 48 +++++++++++----------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index f0a15e11b9b..5a561f9abd1 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -782,31 +782,29 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key) debug_printf("blend.alpha_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE)); } debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask); - for (i = 0; i < PIPE_MAX_SAMPLERS; ++i) { - if (key->sampler[i].format) { - debug_printf("sampler[%u] = \n", i); - debug_printf(" .format = %s\n", - util_format_name(key->sampler[i].format)); - debug_printf(" .target = %s\n", - util_dump_tex_target(key->sampler[i].target, TRUE)); - debug_printf(" .pot = %u %u %u\n", - key->sampler[i].pot_width, - key->sampler[i].pot_height, - key->sampler[i].pot_depth); - debug_printf(" .wrap = %s %s %s\n", - util_dump_tex_wrap(key->sampler[i].wrap_s, TRUE), - util_dump_tex_wrap(key->sampler[i].wrap_t, TRUE), - util_dump_tex_wrap(key->sampler[i].wrap_r, TRUE)); - debug_printf(" .min_img_filter = %s\n", - util_dump_tex_filter(key->sampler[i].min_img_filter, TRUE)); - debug_printf(" .min_mip_filter = %s\n", - util_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE)); - debug_printf(" .mag_img_filter = %s\n", - util_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE)); - if (key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE) - debug_printf(" .compare_func = %s\n", util_dump_func(key->sampler[i].compare_func, TRUE)); - debug_printf(" .normalized_coords = %u\n", key->sampler[i].normalized_coords); - } + for (i = 0; i < key->nr_samplers; ++i) { + debug_printf("sampler[%u] = \n", i); + debug_printf(" .format = %s\n", + util_format_name(key->sampler[i].format)); + debug_printf(" .target = %s\n", + util_dump_tex_target(key->sampler[i].target, TRUE)); + debug_printf(" .pot = %u %u %u\n", + key->sampler[i].pot_width, + key->sampler[i].pot_height, + key->sampler[i].pot_depth); + debug_printf(" .wrap = %s %s %s\n", + util_dump_tex_wrap(key->sampler[i].wrap_s, TRUE), + util_dump_tex_wrap(key->sampler[i].wrap_t, TRUE), + util_dump_tex_wrap(key->sampler[i].wrap_r, TRUE)); + debug_printf(" .min_img_filter = %s\n", + util_dump_tex_filter(key->sampler[i].min_img_filter, TRUE)); + debug_printf(" .min_mip_filter = %s\n", + util_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE)); + debug_printf(" .mag_img_filter = %s\n", + util_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE)); + if (key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE) + debug_printf(" .compare_func = %s\n", util_dump_func(key->sampler[i].compare_func, TRUE)); + debug_printf(" .normalized_coords = %u\n", key->sampler[i].normalized_coords); } } From 446dbb921762710d678486f3f5a6dfdf318fe34c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Fonseca?= Date: Tue, 5 Oct 2010 10:50:02 +0100 Subject: [PATCH 012/540] llvmpipe: Dump a few missing shader key flags. --- src/gallium/drivers/llvmpipe/lp_state_fs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index 5a561f9abd1..e50768445ff 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -746,6 +746,9 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key) debug_printf("fs variant %p:\n", (void *) key); + if (key->flatshade) { + debug_printf("flatshade = 1\n"); + } for (i = 0; i < key->nr_cbufs; ++i) { debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i])); } @@ -770,6 +773,10 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key) debug_printf("alpha.func = %s\n", util_dump_func(key->alpha.func, TRUE)); } + if (key->occlusion_count) { + debug_printf("occlusion_count = 1\n"); + } + if (key->blend.logicop_enable) { debug_printf("blend.logicop_func = %s\n", util_dump_logicop(key->blend.logicop_func, TRUE)); } From e74955eba3fc22fcf6e9111a4e5bbc095d34d357 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Fonseca?= Date: Sun, 26 Sep 2010 11:22:20 +0100 Subject: [PATCH 013/540] llvmpipe: Fix perspective interpolation for point sprites. Once a fragment is generated with LP_INTERP_PERSPECTIVE set for an input, it will do a divide by w for that input. Therefore it's not OK to treat LP_INTERP_PERSPECTIVE as LP_INTERP_LINEAR or vice-versa, even if the attribute is known to not vary. A better strategy would be to take the primitive in consideration when generating the fragment shader key, and therefore avoid the per-fragment perspective divide. --- src/gallium/drivers/llvmpipe/lp_setup_point.c | 71 ++++++++++++++----- 1 file changed, 54 insertions(+), 17 deletions(-) diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c index 3b217f95447..c91e85f9159 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_point.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c @@ -64,12 +64,37 @@ constant_coef(struct lp_setup_context *setup, } +static void +point_persp_coeff(struct lp_setup_context *setup, + struct lp_rast_triangle *point, + const struct point_info *info, + unsigned slot, + unsigned i) +{ + /* + * Fragment shader expects pre-multiplied w for LP_INTERP_PERSPECTIVE. A + * better stratergy would be to take the primitive in consideration when + * generating the fragment shader key, and therefore avoid the per-fragment + * perspective divide. + */ + + float w0 = info->v0[0][3]; + + assert(i < 4); + + point->inputs.a0[slot][i] = info->v0[slot][i]*w0; + point->inputs.dadx[slot][i] = 0.0f; + point->inputs.dady[slot][i] = 0.0f; +} + + /** * Setup automatic texcoord coefficients (for sprite rendering). * \param slot the vertex attribute slot to setup * \param i the attribute channel in [0,3] * \param sprite_coord_origin one of PIPE_SPRITE_COORD_x - * \param perspective_proj will the TEX instruction do a divide by Q? + * \param perspective does the shader expects pre-multiplied w, i.e., + * LP_INTERP_PERSPECTIVE is specified in the shader key */ static void texcoord_coef(struct lp_setup_context *setup, @@ -78,7 +103,7 @@ texcoord_coef(struct lp_setup_context *setup, unsigned slot, unsigned i, unsigned sprite_coord_origin, - boolean perspective_proj) + boolean perspective) { assert(i < 4); @@ -92,7 +117,7 @@ texcoord_coef(struct lp_setup_context *setup, point->inputs.dady[slot][0] = dady; point->inputs.a0[slot][0] = 0.5 - (dadx * x0 + dady * y0); - if (!perspective_proj) { + if (perspective) { /* Divide coefficients by vertex.w here. * * It would be clearer to always multiply by w0 above and @@ -119,7 +144,7 @@ texcoord_coef(struct lp_setup_context *setup, point->inputs.dady[slot][1] = dady; point->inputs.a0[slot][1] = 0.5 - (dadx * x0 + dady * y0); - if (!perspective_proj) { + if (perspective) { float w0 = info->v0[0][3]; point->inputs.dadx[slot][1] *= w0; point->inputs.dady[slot][1] *= w0; @@ -193,11 +218,17 @@ setup_point_coefficients( struct lp_setup_context *setup, /* setup interpolation for all the remaining attributes: */ for (slot = 0; slot < setup->fs.nr_inputs; slot++) { + enum lp_interp interp = setup->fs.input[slot].interp; + boolean perspective = !!(interp == LP_INTERP_PERSPECTIVE); unsigned vert_attr = setup->fs.input[slot].src_index; unsigned usage_mask = setup->fs.input[slot].usage_mask; unsigned i; + + if (perspective & usage_mask) { + fragcoord_usage_mask |= TGSI_WRITEMASK_W; + } - switch (setup->fs.input[slot].interp) { + switch (interp) { case LP_INTERP_POSITION: /* * The generated pixel interpolators will pick up the coeffs from @@ -210,32 +241,38 @@ setup_point_coefficients( struct lp_setup_context *setup, case LP_INTERP_LINEAR: /* Sprite tex coords may use linear interpolation someday */ /* fall-through */ - case LP_INTERP_PERSPECTIVE: /* check if the sprite coord flag is set for this attribute. * If so, set it up so it up so x and y vary from 0 to 1. */ if (shader->info.input_semantic_name[slot] == TGSI_SEMANTIC_GENERIC) { - const int index = shader->info.input_semantic_index[slot]; + unsigned semantic_index = shader->info.input_semantic_index[slot]; /* Note that sprite_coord enable is a bitfield of * PIPE_MAX_SHADER_OUTPUTS bits. */ - if (index < PIPE_MAX_SHADER_OUTPUTS && - (setup->sprite_coord_enable & (1 << index))) { - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) + if (semantic_index < PIPE_MAX_SHADER_OUTPUTS && + (setup->sprite_coord_enable & (1 << semantic_index))) { + for (i = 0; i < NUM_CHANNELS; i++) { + if (usage_mask & (1 << i)) { texcoord_coef(setup, point, info, slot + 1, i, setup->sprite_coord_origin, - (usage_mask & TGSI_WRITEMASK_W)); - fragcoord_usage_mask |= TGSI_WRITEMASK_W; - break; + perspective); + } + } + break; } } - /* FALLTHROUGH */ + /* fall-through */ case LP_INTERP_CONSTANT: for (i = 0; i < NUM_CHANNELS; i++) { - if (usage_mask & (1 << i)) - constant_coef(setup, point, slot+1, info->v0[vert_attr][i], i); + if (usage_mask & (1 << i)) { + if (perspective) { + point_persp_coeff(setup, point, info, slot+1, i); + } + else { + constant_coef(setup, point, slot+1, info->v0[vert_attr][i], i); + } + } } break; From 06472ad7e835813ef7c9bf8a5cd8b62a25fa9cc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Fonseca?= Date: Wed, 6 Oct 2010 09:40:51 +0100 Subject: [PATCH 014/540] llvmpipe: Fix sprite coord perspective interpolation of Q. Q coordinate's coefficients also need to be multiplied by w, otherwise it will have 1/w, causing problems with TXP. --- src/gallium/drivers/llvmpipe/lp_setup_point.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c index c91e85f9159..1295aeecd87 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_point.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c @@ -105,6 +105,8 @@ texcoord_coef(struct lp_setup_context *setup, unsigned sprite_coord_origin, boolean perspective) { + float w0 = info->v0[0][3]; + assert(i < 4); if (i == 0) { @@ -118,13 +120,6 @@ texcoord_coef(struct lp_setup_context *setup, point->inputs.a0[slot][0] = 0.5 - (dadx * x0 + dady * y0); if (perspective) { - /* Divide coefficients by vertex.w here. - * - * It would be clearer to always multiply by w0 above and - * then divide it out for perspective projection here, but - * doing it this way involves less algebra. - */ - float w0 = info->v0[0][3]; point->inputs.dadx[slot][0] *= w0; point->inputs.dady[slot][0] *= w0; point->inputs.a0[slot][0] *= w0; @@ -145,7 +140,6 @@ texcoord_coef(struct lp_setup_context *setup, point->inputs.a0[slot][1] = 0.5 - (dadx * x0 + dady * y0); if (perspective) { - float w0 = info->v0[0][3]; point->inputs.dadx[slot][1] *= w0; point->inputs.dady[slot][1] *= w0; point->inputs.a0[slot][1] *= w0; @@ -157,7 +151,7 @@ texcoord_coef(struct lp_setup_context *setup, point->inputs.dady[slot][2] = 0.0f; } else { - point->inputs.a0[slot][3] = 1.0f; + point->inputs.a0[slot][3] = perspective ? w0 : 1.0f; point->inputs.dadx[slot][3] = 0.0f; point->inputs.dady[slot][3] = 0.0f; } From 1644bb0f40800169a3402f08b8f8a2758e90efee Mon Sep 17 00:00:00 2001 From: Jerome Glisse Date: Wed, 6 Oct 2010 09:40:27 -0400 Subject: [PATCH 015/540] r600g: avoid segfault due to unintialized list pointer Signed-off-by: Jerome Glisse --- src/gallium/winsys/r600/drm/evergreen_hw_context.c | 6 +++--- src/gallium/winsys/r600/drm/r600_hw_context.c | 10 ++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/gallium/winsys/r600/drm/evergreen_hw_context.c b/src/gallium/winsys/r600/drm/evergreen_hw_context.c index 2093a2d09c1..9617035c934 100644 --- a/src/gallium/winsys/r600/drm/evergreen_hw_context.c +++ b/src/gallium/winsys/r600/drm/evergreen_hw_context.c @@ -640,7 +640,7 @@ static inline void evergreen_context_pipe_state_set_resource(struct r600_context block->status &= ~(R600_BLOCK_STATUS_ENABLED | R600_BLOCK_STATUS_DIRTY); r600_bo_reference(ctx->radeon, &block->reloc[1].bo, NULL); r600_bo_reference(ctx->radeon , &block->reloc[2].bo, NULL); - LIST_DEL(&block->list); + LIST_DELINIT(&block->list); return; } block->reg[0] = state->regs[0].value; @@ -695,7 +695,7 @@ static inline void evergreen_context_pipe_state_set_sampler(struct r600_context block = range->blocks[CTX_BLOCK_ID(ctx, offset)]; if (state == NULL) { block->status &= ~(R600_BLOCK_STATUS_ENABLED | R600_BLOCK_STATUS_DIRTY); - LIST_DEL(&block->list); + LIST_DELINIT(&block->list); return; } block->reg[0] = state->regs[0].value; @@ -719,7 +719,7 @@ static inline void evergreen_context_pipe_state_set_sampler_border(struct r600_c block = range->blocks[CTX_BLOCK_ID(ctx, fake_offset)]; if (state == NULL) { block->status &= ~(R600_BLOCK_STATUS_ENABLED | R600_BLOCK_STATUS_DIRTY); - LIST_DEL(&block->list); + LIST_DELINIT(&block->list); return; } if (state->nregs <= 3) { diff --git a/src/gallium/winsys/r600/drm/r600_hw_context.c b/src/gallium/winsys/r600/drm/r600_hw_context.c index 7d81d734571..72bfb357b33 100644 --- a/src/gallium/winsys/r600/drm/r600_hw_context.c +++ b/src/gallium/winsys/r600/drm/r600_hw_context.c @@ -125,6 +125,8 @@ int r600_context_add_block(struct r600_context *ctx, const struct r600_reg *reg, block->reg = &block->pm4[block->pm4_ndwords]; block->pm4_ndwords += n; block->nreg = n; + LIST_INITHEAD(&block->list); + for (j = 0; j < n; j++) { if (reg[i+j].need_bo) { block->nbo++; @@ -829,7 +831,7 @@ static inline void r600_context_pipe_state_set_resource(struct r600_context *ctx block->status &= ~(R600_BLOCK_STATUS_ENABLED | R600_BLOCK_STATUS_DIRTY); r600_bo_reference(ctx->radeon, &block->reloc[1].bo, NULL); r600_bo_reference(ctx->radeon , &block->reloc[2].bo, NULL); - LIST_DEL(&block->list); + LIST_DELINIT(&block->list); return; } block->reg[0] = state->regs[0].value; @@ -883,7 +885,7 @@ static inline void r600_context_pipe_state_set_sampler(struct r600_context *ctx, block = range->blocks[CTX_BLOCK_ID(ctx, offset)]; if (state == NULL) { block->status &= ~(R600_BLOCK_STATUS_ENABLED | R600_BLOCK_STATUS_DIRTY); - LIST_DEL(&block->list); + LIST_DELINIT(&block->list); return; } block->reg[0] = state->regs[0].value; @@ -906,7 +908,7 @@ static inline void r600_context_pipe_state_set_sampler_border(struct r600_contex block = range->blocks[CTX_BLOCK_ID(ctx, offset)]; if (state == NULL) { block->status &= ~(R600_BLOCK_STATUS_ENABLED | R600_BLOCK_STATUS_DIRTY); - LIST_DEL(&block->list); + LIST_DELINIT(&block->list); return; } if (state->nregs <= 3) { @@ -1314,7 +1316,7 @@ struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query) { r600_bo_reference(ctx->radeon, &query->buffer, NULL); - LIST_DEL(&query->list); + LIST_DELINIT(&query->list); free(query); } From dcd0261affc293b75d231e612091ec7b1076fff6 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Tue, 5 Oct 2010 10:20:16 -0700 Subject: [PATCH 016/540] i965: Enable the constant propagation code. A debug disable had slipped in. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index f42c4696410..0c9f17f8fdc 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -2548,8 +2548,6 @@ fs_visitor::propagate_constants() { bool progress = false; - return false; - foreach_iter(exec_list_iterator, iter, this->instructions) { fs_inst *inst = (fs_inst *)iter.get(); From 634abbf7b2e6ea21db30aafc0de9472ee31d4173 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Tue, 5 Oct 2010 10:25:22 -0700 Subject: [PATCH 017/540] i965: Also do constant propagation for the second operand of CMP. We could do the first operand as well by flipping the comparison, but this covered several CMPs in code I was looking at. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 0c9f17f8fdc..914141a3978 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -2607,6 +2607,11 @@ fs_visitor::propagate_constants() scan_inst->src[1] = inst->src[0]; } break; + case BRW_OPCODE_CMP: + if (i == 1) { + scan_inst->src[i] = inst->src[0]; + progress = true; + } } } From 3fabd218a0ffe1aa362440d957cf9135955045a3 Mon Sep 17 00:00:00 2001 From: Jerome Glisse Date: Wed, 6 Oct 2010 12:56:53 -0400 Subject: [PATCH 018/540] r600g: fix dirty state handling Avoid having object ending up in dead list of dirty object. Signed-off-by: Jerome Glisse --- src/gallium/winsys/r600/drm/evergreen_hw_context.c | 5 ++--- src/gallium/winsys/r600/drm/r600_hw_context.c | 8 ++++---- src/gallium/winsys/r600/drm/r600_priv.h | 1 + 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/gallium/winsys/r600/drm/evergreen_hw_context.c b/src/gallium/winsys/r600/drm/evergreen_hw_context.c index 9617035c934..9b39c0c4f01 100644 --- a/src/gallium/winsys/r600/drm/evergreen_hw_context.c +++ b/src/gallium/winsys/r600/drm/evergreen_hw_context.c @@ -762,7 +762,7 @@ void evergreen_context_draw(struct r600_context *ctx, const struct r600_draw *dr struct r600_bo *cb[12]; struct r600_bo *db; unsigned ndwords = 9, flush; - struct r600_block *dirty_block; + struct r600_block *dirty_block, *next_block; if (draw->indices) { ndwords = 13; @@ -817,10 +817,9 @@ void evergreen_context_draw(struct r600_context *ctx, const struct r600_draw *dr } /* enough room to copy packet */ - LIST_FOR_EACH_ENTRY(dirty_block,&ctx->dirty,list) { + LIST_FOR_EACH_ENTRY_SAFE(dirty_block, next_block, &ctx->dirty,list) { r600_context_block_emit_dirty(ctx, dirty_block); } - LIST_INITHEAD(&ctx->dirty); /* draw packet */ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_INDEX_TYPE, 0); diff --git a/src/gallium/winsys/r600/drm/r600_hw_context.c b/src/gallium/winsys/r600/drm/r600_hw_context.c index 72bfb357b33..86bddccbbbb 100644 --- a/src/gallium/winsys/r600/drm/r600_hw_context.c +++ b/src/gallium/winsys/r600/drm/r600_hw_context.c @@ -967,7 +967,7 @@ void r600_context_draw(struct r600_context *ctx, const struct r600_draw *draw) struct r600_bo *cb[8]; struct r600_bo *db; unsigned ndwords = 9; - struct r600_block *dirty_block; + struct r600_block *dirty_block, *next_block; if (draw->indices) { ndwords = 13; @@ -1020,10 +1020,9 @@ void r600_context_draw(struct r600_context *ctx, const struct r600_draw *draw) } /* enough room to copy packet */ - LIST_FOR_EACH_ENTRY(dirty_block,&ctx->dirty,list) { + LIST_FOR_EACH_ENTRY_SAFE(dirty_block, next_block, &ctx->dirty,list) { r600_context_block_emit_dirty(ctx, dirty_block); } - LIST_INITHEAD(&ctx->dirty); /* draw packet */ ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_INDEX_TYPE, 0); @@ -1135,8 +1134,9 @@ void r600_context_flush(struct r600_context *ctx) */ for (int i = 0; i < ctx->nblocks; i++) { if (ctx->blocks[i]->status & R600_BLOCK_STATUS_ENABLED) { - if(!(ctx->blocks[i]->status & R600_BLOCK_STATUS_DIRTY)) + if(!(ctx->blocks[i]->status & R600_BLOCK_STATUS_DIRTY)) { LIST_ADDTAIL(&ctx->blocks[i]->list,&ctx->dirty); + } ctx->pm4_dirty_cdwords += ctx->blocks[i]->pm4_ndwords + ctx->blocks[i]->pm4_flush_ndwords; ctx->blocks[i]->status |= R600_BLOCK_STATUS_DIRTY; } diff --git a/src/gallium/winsys/r600/drm/r600_priv.h b/src/gallium/winsys/r600/drm/r600_priv.h index a693a5b5ab8..e3868d3cb9a 100644 --- a/src/gallium/winsys/r600/drm/r600_priv.h +++ b/src/gallium/winsys/r600/drm/r600_priv.h @@ -162,6 +162,7 @@ static inline void r600_context_block_emit_dirty(struct r600_context *ctx, struc memcpy(&ctx->pm4[ctx->pm4_cdwords], block->pm4, block->pm4_ndwords * 4); ctx->pm4_cdwords += block->pm4_ndwords; block->status ^= R600_BLOCK_STATUS_DIRTY; + LIST_DELINIT(&block->list); } static inline int radeon_bo_map(struct radeon *radeon, struct radeon_bo *bo) From 3c97c00e3810d31c3aa26173eb9fdef91b3e7c87 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 6 Oct 2010 09:57:55 -0700 Subject: [PATCH 019/540] i965: Add back gen6 headerless FB writes to the new FS backend. It's not that hard to detect when we need the header. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 47 ++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 914141a3978..a380eb44525 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -324,6 +324,7 @@ public: this->sampler = 0; this->target = 0; this->eot = false; + this->header_present = false; this->shadow_compare = false; } @@ -376,6 +377,7 @@ public: int sampler; int target; /**< MRT target. */ bool eot; + bool header_present; bool shadow_compare; /** @{ @@ -420,7 +422,10 @@ public: this->virtual_grf_array_size = 0; this->virtual_grf_def = NULL; this->virtual_grf_use = NULL; + + this->kill_emitted = false; } + ~fs_visitor() { talloc_free(this->mem_ctx); @@ -503,6 +508,7 @@ public: ir_variable *frag_color, *frag_data, *frag_depth; int first_non_payload_grf; int urb_setup[FRAG_ATTRIB_MAX]; + bool kill_emitted; /** @{ debug annotation info */ const char *current_annotation; @@ -1509,6 +1515,7 @@ fs_visitor::visit(ir_discard *ir) assert(ir->condition == NULL); /* FINISHME */ emit(fs_inst(FS_OPCODE_DISCARD, temp, temp)); + kill_emitted = true; } void @@ -1843,10 +1850,19 @@ void fs_visitor::emit_fb_writes() { this->current_annotation = "FB write header"; + GLboolean header_present = GL_TRUE; int nr = 0; - /* m0, m1 header */ - nr += 2; + if (intel->gen >= 6 && + !this->kill_emitted && + c->key.nr_color_regions == 1) { + header_present = false; + } + + if (header_present) { + /* m0, m1 header */ + nr += 2; + } if (c->key.aa_dest_stencil_reg) { emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), @@ -1911,6 +1927,7 @@ fs_visitor::emit_fb_writes() reg_undef, reg_undef)); inst->mlen = nr; inst->eot = true; + inst->header_present = header_present; } this->current_annotation = NULL; @@ -1929,18 +1946,22 @@ fs_visitor::generate_fb_write(fs_inst *inst) brw_set_mask_control(p, BRW_MASK_DISABLE); brw_set_compression_control(p, BRW_COMPRESSION_NONE); - if (intel->gen >= 6) { - brw_MOV(p, - brw_message_reg(0), - brw_vec8_grf(0, 0)); - implied_header = brw_null_reg(); - } else { - implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); - } + if (inst->header_present) { + if (intel->gen >= 6) { + brw_MOV(p, + brw_message_reg(0), + brw_vec8_grf(0, 0)); + implied_header = brw_null_reg(); + } else { + implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW); + } - brw_MOV(p, - brw_message_reg(1), - brw_vec8_grf(1, 0)); + brw_MOV(p, + brw_message_reg(1), + brw_vec8_grf(1, 0)); + } else { + implied_header = brw_null_reg(); + } brw_pop_insn_state(p); From f7cb28fad9855020e9fbd1481df03bb09346d4be Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Mon, 4 Oct 2010 15:09:18 -0700 Subject: [PATCH 020/540] i965: Gen6 no longer has the IFF instruction; always use IF. --- src/mesa/drivers/dri/i965/brw_eu_emit.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index 59541353214..fbc6894d054 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -899,9 +899,11 @@ void brw_ENDIF(struct brw_compile *p, * instruction respectively. */ if (patch_insn->header.opcode == BRW_OPCODE_IF) { - /* Automagically turn it into an IFF: - */ - patch_insn->header.opcode = BRW_OPCODE_IFF; + if (intel->gen < 6) { + /* Automagically turn it into an IFF: + */ + patch_insn->header.opcode = BRW_OPCODE_IFF; + } patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1); patch_insn->bits3.if_else.pop_count = 0; patch_insn->bits3.if_else.pad0 = 0; From feca6609390d4642418cf7aab878e654964510c4 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Mon, 4 Oct 2010 15:08:03 -0700 Subject: [PATCH 021/540] i965: Fix up IF/ELSE/ENDIF for gen6. The jump delta is now in the part of the instruction where the destination fields used to be, and the src args are ignored (or not, for the new non-predicated IF that we don't use yet). --- src/mesa/drivers/dri/i965/brw_disasm.c | 5 ++ src/mesa/drivers/dri/i965/brw_eu_emit.c | 86 ++++++++++++++++++------- src/mesa/drivers/dri/i965/brw_structs.h | 12 ++++ 3 files changed, 79 insertions(+), 24 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c index 3a93b253777..be27f9226c3 100644 --- a/src/mesa/drivers/dri/i965/brw_disasm.c +++ b/src/mesa/drivers/dri/i965/brw_disasm.c @@ -892,7 +892,12 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen) if (opcode[inst->header.opcode].ndst > 0) { pad (file, 16); err |= dest (file, inst); + } else if (gen >= 6 && (inst->header.opcode == BRW_OPCODE_IF || + inst->header.opcode == BRW_OPCODE_ELSE || + inst->header.opcode == BRW_OPCODE_ENDIF)) { + format (file, " %d", inst->bits1.branch_gen6.jump_count); } + if (opcode[inst->header.opcode].nsrc > 0) { pad (file, 32); err |= src0 (file, inst); diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index fbc6894d054..419b40ba84b 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -787,6 +787,7 @@ struct brw_instruction *brw_JMPI(struct brw_compile *p, */ struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size) { + struct intel_context *intel = &p->brw->intel; struct brw_instruction *insn; if (p->single_program_flow) { @@ -800,9 +801,15 @@ struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size) /* Override the defaults for this instruction: */ - brw_set_dest(insn, brw_ip_reg()); - brw_set_src0(insn, brw_ip_reg()); - brw_set_src1(insn, brw_imm_d(0x0)); + if (intel->gen < 6) { + brw_set_dest(insn, brw_ip_reg()); + brw_set_src0(insn, brw_ip_reg()); + brw_set_src1(insn, brw_imm_d(0x0)); + } else { + brw_set_dest(insn, brw_imm_w(0)); + brw_set_src0(insn, brw_null_reg()); + brw_set_src1(insn, brw_null_reg()); + } insn->header.execution_size = execute_size; insn->header.compression_control = BRW_COMPRESSION_NONE; @@ -835,9 +842,15 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p, insn = next_insn(p, BRW_OPCODE_ELSE); } - brw_set_dest(insn, brw_ip_reg()); - brw_set_src0(insn, brw_ip_reg()); - brw_set_src1(insn, brw_imm_d(0x0)); + if (intel->gen < 6) { + brw_set_dest(insn, brw_ip_reg()); + brw_set_src0(insn, brw_ip_reg()); + brw_set_src1(insn, brw_imm_d(0x0)); + } else { + brw_set_dest(insn, brw_imm_w(0)); + brw_set_src0(insn, brw_null_reg()); + brw_set_src1(insn, brw_null_reg()); + } insn->header.compression_control = BRW_COMPRESSION_NONE; insn->header.execution_size = if_insn->header.execution_size; @@ -854,9 +867,13 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p, } else { assert(if_insn->header.opcode == BRW_OPCODE_IF); - if_insn->bits3.if_else.jump_count = br * (insn - if_insn); - if_insn->bits3.if_else.pop_count = 0; - if_insn->bits3.if_else.pad0 = 0; + if (intel->gen < 6) { + if_insn->bits3.if_else.jump_count = br * (insn - if_insn); + if_insn->bits3.if_else.pop_count = 0; + if_insn->bits3.if_else.pad0 = 0; + } else { + if_insn->bits1.branch_gen6.jump_count = br * (insn - if_insn + 1); + } } return insn; @@ -884,9 +901,15 @@ void brw_ENDIF(struct brw_compile *p, } else { struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF); - brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); - brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); - brw_set_src1(insn, brw_imm_d(0x0)); + if (intel->gen < 6) { + brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); + brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); + brw_set_src1(insn, brw_imm_d(0x0)); + } else { + brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_W)); + brw_set_src0(insn, brw_null_reg()); + brw_set_src1(insn, brw_null_reg()); + } insn->header.compression_control = BRW_COMPRESSION_NONE; insn->header.execution_size = patch_insn->header.execution_size; @@ -900,26 +923,41 @@ void brw_ENDIF(struct brw_compile *p, */ if (patch_insn->header.opcode == BRW_OPCODE_IF) { if (intel->gen < 6) { - /* Automagically turn it into an IFF: + /* Turn it into an IFF, which means no mask stack operations for + * all-false and jumping past the ENDIF. */ patch_insn->header.opcode = BRW_OPCODE_IFF; + patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1); + patch_insn->bits3.if_else.pop_count = 0; + patch_insn->bits3.if_else.pad0 = 0; + } else { + /* As of gen6, there is no IFF and IF must point to the ENDIF. */ + patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn); } - patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1); - patch_insn->bits3.if_else.pop_count = 0; - patch_insn->bits3.if_else.pad0 = 0; - } else if (patch_insn->header.opcode == BRW_OPCODE_ELSE) { - patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1); - patch_insn->bits3.if_else.pop_count = 1; - patch_insn->bits3.if_else.pad0 = 0; } else { - assert(0); + assert(patch_insn->header.opcode == BRW_OPCODE_ELSE); + if (intel->gen < 6) { + /* BRW_OPCODE_ELSE pre-gen6 should point just past the + * matching ENDIF. + */ + patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1); + patch_insn->bits3.if_else.pop_count = 1; + patch_insn->bits3.if_else.pad0 = 0; + } else { + /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */ + patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn); + } } /* Also pop item off the stack in the endif instruction: */ - insn->bits3.if_else.jump_count = 0; - insn->bits3.if_else.pop_count = 1; - insn->bits3.if_else.pad0 = 0; + if (intel->gen < 6) { + insn->bits3.if_else.jump_count = 0; + insn->bits3.if_else.pop_count = 1; + insn->bits3.if_else.pad0 = 0; + } else { + insn->bits1.branch_gen6.jump_count = 2; + } } } diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h index 7b919872c40..8ce9af9c4fe 100644 --- a/src/mesa/drivers/dri/i965/brw_structs.h +++ b/src/mesa/drivers/dri/i965/brw_structs.h @@ -1381,6 +1381,18 @@ struct brw_instruction GLuint dest_horiz_stride:2; GLuint dest_address_mode:1; } ia16; + + struct { + GLuint dest_reg_file:2; + GLuint dest_reg_type:3; + GLuint src0_reg_file:2; + GLuint src0_reg_type:3; + GLuint src1_reg_file:2; + GLuint src1_reg_type:3; + GLuint pad:1; + + GLint jump_count:16; + } branch_gen6; } bits1; From df3505b19341424a605be703a27f50ea585ad71f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Fonseca?= Date: Wed, 6 Oct 2010 12:09:32 +0100 Subject: [PATCH 022/540] gallivm: Take the type signedness in consideration in round/ceil/floor. --- src/gallium/auxiliary/gallivm/lp_bld_arit.c | 99 ++++++++++++--------- 1 file changed, 55 insertions(+), 44 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index e65c13e64b5..3f9c250ad57 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -1210,7 +1210,7 @@ lp_build_iround(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); + LLVMTypeRef int_vec_type = bld->int_vec_type; LLVMValueRef res; assert(type.floating); @@ -1222,20 +1222,24 @@ lp_build_iround(struct lp_build_context *bld, res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); } else { - LLVMTypeRef vec_type = lp_build_vec_type(type); - LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); - LLVMValueRef sign; LLVMValueRef half; - /* get sign bit */ - sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); - sign = LLVMBuildAnd(bld->builder, sign, mask, ""); - - /* sign * 0.5 */ half = lp_build_const_vec(type, 0.5); - half = LLVMBuildBitCast(bld->builder, half, int_vec_type, ""); - half = LLVMBuildOr(bld->builder, sign, half, ""); - half = LLVMBuildBitCast(bld->builder, half, vec_type, ""); + + if (type.sign) { + LLVMTypeRef vec_type = bld->vec_type; + LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); + LLVMValueRef sign; + + /* get sign bit */ + sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + sign = LLVMBuildAnd(bld->builder, sign, mask, ""); + + /* sign * 0.5 */ + half = LLVMBuildBitCast(bld->builder, half, int_vec_type, ""); + half = LLVMBuildOr(bld->builder, sign, half, ""); + half = LLVMBuildBitCast(bld->builder, half, vec_type, ""); + } res = LLVMBuildFAdd(bld->builder, a, half, ""); } @@ -1256,7 +1260,7 @@ lp_build_ifloor(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); + LLVMTypeRef int_vec_type = bld->int_vec_type; LLVMValueRef res; assert(type.floating); @@ -1267,27 +1271,31 @@ lp_build_ifloor(struct lp_build_context *bld, res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); } else { - /* Take the sign bit and add it to 1 constant */ - LLVMTypeRef vec_type = lp_build_vec_type(type); - unsigned mantissa = lp_mantissa(type); - LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); - LLVMValueRef sign; - LLVMValueRef offset; + res = a; - /* sign = a < 0 ? ~0 : 0 */ - sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); - sign = LLVMBuildAnd(bld->builder, sign, mask, ""); - sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign"); + if (type.sign) { + /* Take the sign bit and add it to 1 constant */ + LLVMTypeRef vec_type = bld->vec_type; + unsigned mantissa = lp_mantissa(type); + LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); + LLVMValueRef sign; + LLVMValueRef offset; - /* offset = -0.99999(9)f */ - offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); - offset = LLVMConstBitCast(offset, int_vec_type); + /* sign = a < 0 ? ~0 : 0 */ + sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + sign = LLVMBuildAnd(bld->builder, sign, mask, ""); + sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign"); - /* offset = a < 0 ? offset : 0.0f */ - offset = LLVMBuildAnd(bld->builder, offset, sign, ""); - offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset"); + /* offset = -0.99999(9)f */ + offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); + offset = LLVMConstBitCast(offset, int_vec_type); - res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res"); + /* offset = a < 0 ? offset : 0.0f */ + offset = LLVMBuildAnd(bld->builder, offset, sign, ""); + offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset"); + + res = LLVMBuildFAdd(bld->builder, res, offset, "ifloor.res"); + } } /* round to nearest (toward zero) */ @@ -1307,7 +1315,7 @@ lp_build_iceil(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); + LLVMTypeRef int_vec_type = bld->int_vec_type; LLVMValueRef res; assert(type.floating); @@ -1318,25 +1326,28 @@ lp_build_iceil(struct lp_build_context *bld, res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); } else { - LLVMTypeRef vec_type = lp_build_vec_type(type); + LLVMTypeRef vec_type = bld->vec_type; unsigned mantissa = lp_mantissa(type); - LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); - LLVMValueRef sign; LLVMValueRef offset; - /* sign = a < 0 ? 0 : ~0 */ - sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); - sign = LLVMBuildAnd(bld->builder, sign, mask, ""); - sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign"); - sign = LLVMBuildNot(bld->builder, sign, "iceil.not"); - /* offset = 0.99999(9)f */ offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); - offset = LLVMConstBitCast(offset, int_vec_type); - /* offset = a < 0 ? 0.0 : offset */ - offset = LLVMBuildAnd(bld->builder, offset, sign, ""); - offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset"); + if (type.sign) { + LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); + LLVMValueRef sign; + + /* sign = a < 0 ? 0 : ~0 */ + sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + sign = LLVMBuildAnd(bld->builder, sign, mask, ""); + sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign"); + sign = LLVMBuildNot(bld->builder, sign, "iceil.not"); + + /* offset = a < 0 ? 0.0 : offset */ + offset = LLVMConstBitCast(offset, int_vec_type); + offset = LLVMBuildAnd(bld->builder, offset, sign, ""); + offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset"); + } res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res"); } From 4648846bd6c376877f024ccf300ceac5b0b3dcd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Fonseca?= Date: Wed, 6 Oct 2010 14:06:14 +0100 Subject: [PATCH 023/540] gallivm: Use a faster (and less accurate) log2 in lod computation. --- src/gallium/auxiliary/gallivm/lp_bld_arit.c | 44 +++++++++++++++++++ src/gallium/auxiliary/gallivm/lp_bld_arit.h | 5 +++ src/gallium/auxiliary/gallivm/lp_bld_sample.c | 4 ++ 3 files changed, 53 insertions(+) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 3f9c250ad57..ff94f498acf 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -2286,3 +2286,47 @@ lp_build_log2(struct lp_build_context *bld, lp_build_log2_approx(bld, x, NULL, NULL, &res); return res; } + + +/** + * Faster (and less accurate) log2. + * + * log2(x) = floor(log2(x)) + frac(x) + * + * See http://www.flipcode.com/archives/Fast_log_Function.shtml + */ +LLVMValueRef +lp_build_fast_log2(struct lp_build_context *bld, + LLVMValueRef x) +{ + const struct lp_type type = bld->type; + LLVMTypeRef vec_type = bld->vec_type; + LLVMTypeRef int_vec_type = bld->int_vec_type; + + unsigned mantissa = lp_mantissa(type); + LLVMValueRef mantmask = lp_build_const_int_vec(type, (1ULL << mantissa) - 1); + LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type); + + LLVMValueRef ipart; + LLVMValueRef fpart; + + assert(lp_check_value(bld->type, x)); + + assert(type.floating); + + x = LLVMBuildBitCast(bld->builder, x, int_vec_type, ""); + + /* ipart = floor(log2(x)) - 1 */ + ipart = LLVMBuildLShr(bld->builder, x, lp_build_const_int_vec(type, mantissa), ""); + ipart = LLVMBuildAnd(bld->builder, ipart, lp_build_const_int_vec(type, 255), ""); + ipart = LLVMBuildSub(bld->builder, ipart, lp_build_const_int_vec(type, 128), ""); + ipart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, ""); + + /* fpart = 1.0 + frac(x) */ + fpart = LLVMBuildAnd(bld->builder, x, mantmask, ""); + fpart = LLVMBuildOr(bld->builder, fpart, one, ""); + fpart = LLVMBuildBitCast(bld->builder, fpart, vec_type, ""); + + /* floor(log2(x)) + frac(x) */ + return LLVMBuildFAdd(bld->builder, ipart, fpart, ""); +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h index 31efa9921ce..3ed4fec2333 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h @@ -212,6 +212,11 @@ LLVMValueRef lp_build_log2(struct lp_build_context *bld, LLVMValueRef a); +LLVMValueRef +lp_build_fast_log2(struct lp_build_context *bld, + LLVMValueRef a); + + void lp_build_exp2_approx(struct lp_build_context *bld, LLVMValueRef x, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c index aee94c1b866..9dee653eee8 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -243,7 +243,11 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, } /* compute lod = log2(rho) */ +#if 0 lod = lp_build_log2(float_bld, rho); +#else + lod = lp_build_fast_log2(float_bld, rho); +#endif /* add shader lod bias */ if (lod_bias) { From 012d57737b1b4e4263aa3414abe433195ff8a713 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Fonseca?= Date: Wed, 6 Oct 2010 17:44:05 +0100 Subject: [PATCH 024/540] gallivm: Fast implementation of iround(log2(x)) Not tested yet, but should be correct. --- src/gallium/auxiliary/gallivm/lp_bld_arit.c | 35 +++++++++++++++++++++ src/gallium/auxiliary/gallivm/lp_bld_arit.h | 4 +++ 2 files changed, 39 insertions(+) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index ff94f498acf..15b74410188 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -2330,3 +2330,38 @@ lp_build_fast_log2(struct lp_build_context *bld, /* floor(log2(x)) + frac(x) */ return LLVMBuildFAdd(bld->builder, ipart, fpart, ""); } + + +/** + * Fast implementation of iround(log2(x)). + * + * Not an approximation -- it should give accurate results all the time. + */ +LLVMValueRef +lp_build_ilog2(struct lp_build_context *bld, + LLVMValueRef x) +{ + const struct lp_type type = bld->type; + LLVMTypeRef int_vec_type = bld->int_vec_type; + + unsigned mantissa = lp_mantissa(type); + LLVMValueRef sqrt2 = lp_build_const_vec(type, 1.4142135623730951); + + LLVMValueRef ipart; + + assert(lp_check_value(bld->type, x)); + + assert(type.floating); + + /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */ + x = LLVMBuildFMul(bld->builder, x, sqrt2, ""); + + x = LLVMBuildBitCast(bld->builder, x, int_vec_type, ""); + + /* ipart = floor(log2(x) + 0.5) */ + ipart = LLVMBuildLShr(bld->builder, x, lp_build_const_int_vec(type, mantissa), ""); + ipart = LLVMBuildAnd(bld->builder, ipart, lp_build_const_int_vec(type, 255), ""); + ipart = LLVMBuildSub(bld->builder, ipart, lp_build_const_int_vec(type, 127), ""); + + return ipart; +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h index 3ed4fec2333..f36197479f0 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h @@ -216,6 +216,10 @@ LLVMValueRef lp_build_fast_log2(struct lp_build_context *bld, LLVMValueRef a); +LLVMValueRef +lp_build_ilog2(struct lp_build_context *bld, + LLVMValueRef x); + void lp_build_exp2_approx(struct lp_build_context *bld, From af05f6157668b3c5e6fd73c3d743b11e619b9067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Fonseca?= Date: Wed, 6 Oct 2010 18:31:36 +0100 Subject: [PATCH 025/540] gallivm: Combined ifloor & fract helper. The only way to ensure we don't do redundant FP <-> SI conversions. --- src/gallium/auxiliary/gallivm/lp_bld_arit.c | 42 +++++++++++++++++++ src/gallium/auxiliary/gallivm/lp_bld_arit.h | 6 +++ src/gallium/auxiliary/gallivm/lp_bld_sample.c | 4 +- .../auxiliary/gallivm/lp_bld_sample_soa.c | 41 +++++++----------- 4 files changed, 65 insertions(+), 28 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 15b74410188..64c468c14d4 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -1359,6 +1359,48 @@ lp_build_iceil(struct lp_build_context *bld, } +/** + * Combined ifloor() & fract(). + * + * Preferred to calling the functions separately, as it will ensure that the + * stratergy (floor() vs ifloor()) that results in less redundant work is used. + */ +void +lp_build_ifloor_fract(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef *out_ipart, + LLVMValueRef *out_fpart) +{ + + + const struct lp_type type = bld->type; + LLVMValueRef ipart; + + assert(type.floating); + assert(lp_check_value(type, a)); + + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { + /* + * floor() is easier. + */ + + ipart = lp_build_floor(bld, a); + *out_fpart = LLVMBuildFSub(bld->builder, a, ipart, "fpart"); + *out_ipart = LLVMBuildFPToSI(bld->builder, ipart, bld->int_vec_type, "ipart"); + } + else { + /* + * ifloor() is easier. + */ + + *out_ipart = lp_build_ifloor(bld, a); + ipart = LLVMBuildSIToFP(bld->builder, *out_ipart, bld->vec_type, "ipart"); + *out_fpart = LLVMBuildFSub(bld->builder, a, ipart, "fpart"); + } +} + + LLVMValueRef lp_build_sqrt(struct lp_build_context *bld, LLVMValueRef a) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h index f36197479f0..8424384f8f7 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h @@ -171,6 +171,12 @@ LLVMValueRef lp_build_itrunc(struct lp_build_context *bld, LLVMValueRef a); +void +lp_build_ifloor_fract(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef *out_ipart, + LLVMValueRef *out_fpart); + LLVMValueRef lp_build_sqrt(struct lp_build_context *bld, LLVMValueRef a); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c index 9dee653eee8..acd99741f13 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -319,7 +319,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld, bld->builder, unit); /* convert float lod to integer */ - level = lp_build_ifloor(float_bld, lod); + lp_build_ifloor_fract(float_bld, lod, &level, weight_out); /* compute level 0 and clamp to legal range of levels */ *level0_out = lp_build_clamp(int_bld, level, @@ -330,8 +330,6 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld, *level1_out = lp_build_clamp(int_bld, level, int_bld->zero, last_level); - - *weight_out = lp_build_fract(float_bld, lod); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 36a77d3aff0..d464147371d 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -253,11 +253,9 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, /* mul by size and subtract 0.5 */ coord = lp_build_mul(coord_bld, coord, length_f); coord = lp_build_sub(coord_bld, coord, half); - /* convert to int */ - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(uint_coord_bld, coord0, uint_coord_bld->one); - /* compute lerp weight */ - weight = lp_build_fract(coord_bld, coord); /* repeat wrap */ if (is_pot) { coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, ""); @@ -284,8 +282,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_sub(coord_bld, coord, half); - weight = lp_build_fract(coord_bld, coord); - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); break; @@ -304,10 +302,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, max = lp_build_sub(coord_bld, length_f, min); coord = lp_build_clamp(coord_bld, coord, min, max); } - /* compute lerp weight */ - weight = lp_build_fract(coord_bld, coord); - /* coord0 = floor(coord); */ - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); /* coord0 = max(coord0, 0) */ coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero); @@ -327,10 +323,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, max = lp_build_sub(coord_bld, length_f, min); coord = lp_build_clamp(coord_bld, coord, min, max); coord = lp_build_sub(coord_bld, coord, half); - /* compute lerp weight */ - weight = lp_build_fract(coord_bld, coord); - /* convert to int */ - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); } break; @@ -343,11 +337,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_mul(coord_bld, coord, length_f); coord = lp_build_sub(coord_bld, coord, half); - /* compute lerp weight */ - weight = lp_build_fract(coord_bld, coord); - - /* convert to int coords */ - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); /* coord0 = max(coord0, 0) */ @@ -369,8 +360,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_sub(coord_bld, coord, half); - weight = lp_build_fract(coord_bld, coord); - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); break; @@ -392,8 +383,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_sub(coord_bld, coord, half); - weight = lp_build_fract(coord_bld, coord); - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); } break; @@ -416,8 +407,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_sub(coord_bld, coord, half); - weight = lp_build_fract(coord_bld, coord); - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); } break; From 5849a6ab6412eddd4552329178e6edb0bea92977 Mon Sep 17 00:00:00 2001 From: Keith Whitwell Date: Thu, 30 Sep 2010 16:43:56 +0100 Subject: [PATCH 026/540] gallivm: don't apply zero lod_bias --- src/gallium/auxiliary/gallivm/lp_bld_sample.c | 7 ++++++- src/gallium/auxiliary/gallivm/lp_bld_sample.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c index acd99741f13..2227a062d08 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -132,6 +132,10 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE; } + if (sampler->lod_bias != 0.0) { + state->lod_bias_non_zero = 1; + } + /* If min_lod == max_lod we can greatly simplify mipmap selection. * This is a case that occurs during automatic mipmap generation. */ @@ -258,7 +262,8 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, } /* add sampler lod bias */ - lod = LLVMBuildFAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias"); + if (bld->static_state->lod_bias_non_zero) + lod = LLVMBuildFAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias"); /* clamp lod */ lod = lp_build_clamp(float_bld, lod, min_lod, max_lod); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h index 4d2eeaa5eb4..bb1c8c8dcee 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h @@ -83,6 +83,7 @@ struct lp_sampler_static_state unsigned compare_func:3; unsigned normalized_coords:1; unsigned min_max_lod_equal:1; /**< min_lod == max_lod ? */ + unsigned lod_bias_non_zero:1; }; From 1c32583581ef5aee59901d9dd8e56cc1a125f0d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Fonseca?= Date: Wed, 6 Oct 2010 14:53:19 +0100 Subject: [PATCH 027/540] gallivm: Only apply min/max_lod when necessary. --- src/gallium/auxiliary/gallivm/lp_bld_sample.c | 51 +++++++++++++------ src/gallium/auxiliary/gallivm/lp_bld_sample.h | 2 + src/gallium/drivers/llvmpipe/lp_state_fs.c | 4 ++ 3 files changed, 42 insertions(+), 15 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c index 2227a062d08..c1c98bf8596 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -126,21 +126,32 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, state->wrap_r = sampler->wrap_r; state->min_img_filter = sampler->min_img_filter; state->mag_img_filter = sampler->mag_img_filter; - if (view->last_level) { + + if (view->last_level && sampler->max_lod > 0.0f) { state->min_mip_filter = sampler->min_mip_filter; } else { state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE; } - if (sampler->lod_bias != 0.0) { - state->lod_bias_non_zero = 1; - } + if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) { + if (sampler->lod_bias != 0.0f) { + state->lod_bias_non_zero = 1; + } - /* If min_lod == max_lod we can greatly simplify mipmap selection. - * This is a case that occurs during automatic mipmap generation. - */ - if (sampler->min_lod == sampler->max_lod) { - state->min_max_lod_equal = 1; + /* If min_lod == max_lod we can greatly simplify mipmap selection. + * This is a case that occurs during automatic mipmap generation. + */ + if (sampler->min_lod == sampler->max_lod) { + state->min_max_lod_equal = 1; + } else { + if (sampler->min_lod > 0.0f) { + state->apply_min_lod = 1; + } + + if (sampler->max_lod < (float)view->last_level) { + state->apply_max_lod = 1; + } + } } state->compare_mode = sampler->compare_mode; @@ -181,21 +192,19 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, LLVMValueRef depth) { - LLVMValueRef min_lod = - bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit); - if (bld->static_state->min_max_lod_equal) { /* User is forcing sampling from a particular mipmap level. * This is hit during mipmap generation. */ + LLVMValueRef min_lod = + bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit); + return min_lod; } else { struct lp_build_context *float_bld = &bld->float_bld; LLVMValueRef sampler_lod_bias = bld->dynamic_state->lod_bias(bld->dynamic_state, bld->builder, unit); - LLVMValueRef max_lod = - bld->dynamic_state->max_lod(bld->dynamic_state, bld->builder, unit); LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0); LLVMValueRef lod; @@ -265,8 +274,20 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, if (bld->static_state->lod_bias_non_zero) lod = LLVMBuildFAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias"); + /* clamp lod */ - lod = lp_build_clamp(float_bld, lod, min_lod, max_lod); + if (bld->static_state->apply_max_lod) { + LLVMValueRef max_lod = + bld->dynamic_state->max_lod(bld->dynamic_state, bld->builder, unit); + + lod = lp_build_min(float_bld, lod, max_lod); + } + if (bld->static_state->apply_min_lod) { + LLVMValueRef min_lod = + bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit); + + lod = lp_build_max(float_bld, lod, min_lod); + } return lod; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h index bb1c8c8dcee..bb83ede9314 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h @@ -84,6 +84,8 @@ struct lp_sampler_static_state unsigned normalized_coords:1; unsigned min_max_lod_equal:1; /**< min_lod == max_lod ? */ unsigned lod_bias_non_zero:1; + unsigned apply_min_lod:1; /**< min_lod > 0 ? */ + unsigned apply_max_lod:1; /**< max_lod < last_level ? */ }; diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index e50768445ff..3ce8be5a0a9 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -812,6 +812,10 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key) if (key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE) debug_printf(" .compare_func = %s\n", util_dump_func(key->sampler[i].compare_func, TRUE)); debug_printf(" .normalized_coords = %u\n", key->sampler[i].normalized_coords); + debug_printf(" .min_max_lod_equal = %u\n", key->sampler[i].min_max_lod_equal); + debug_printf(" .lod_bias_non_zero = %u\n", key->sampler[i].lod_bias_non_zero); + debug_printf(" .apply_min_lod = %u\n", key->sampler[i].apply_min_lod); + debug_printf(" .apply_max_lod = %u\n", key->sampler[i].apply_max_lod); } } From 87dd859b342b844add906358810445da21b6b092 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Fonseca?= Date: Wed, 6 Oct 2010 18:44:51 +0100 Subject: [PATCH 028/540] gallivm: Compute lod as integer whenever possible. More accurate/faster results for PIPE_TEX_MIPFILTER_NEAREST. Less FP <-> SI conversion overall. --- src/gallium/auxiliary/gallivm/lp_bld_sample.c | 174 ++++++++++++------ src/gallium/auxiliary/gallivm/lp_bld_sample.h | 12 +- .../auxiliary/gallivm/lp_bld_sample_aos.c | 40 ++-- .../auxiliary/gallivm/lp_bld_sample_soa.c | 31 ++-- 4 files changed, 160 insertions(+), 97 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c index c1c98bf8596..3287cf7c376 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -167,6 +167,73 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, } +/** + * Generate code to compute coordinate gradient (rho). + * \param ddx partial derivatives of (s, t, r, q) with respect to X + * \param ddy partial derivatives of (s, t, r, q) with respect to Y + * \param width scalar int texture width + * \param height scalar int texture height + * \param depth scalar int texture depth + * + * XXX: The resulting rho is scalar, so we ignore all but the first element of + * derivatives that are passed by the shader. + */ +static LLVMValueRef +lp_build_rho(struct lp_build_sample_context *bld, + const LLVMValueRef ddx[4], + const LLVMValueRef ddy[4], + LLVMValueRef width, + LLVMValueRef height, + LLVMValueRef depth) +{ + struct lp_build_context *float_bld = &bld->float_bld; + const int dims = texture_dims(bld->static_state->target); + LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0); + LLVMValueRef dsdx, dsdy; + LLVMValueRef dtdx = NULL, dtdy = NULL, drdx = NULL, drdy = NULL; + LLVMValueRef rho; + + dsdx = LLVMBuildExtractElement(bld->builder, ddx[0], index0, "dsdx"); + dsdx = lp_build_abs(float_bld, dsdx); + dsdy = LLVMBuildExtractElement(bld->builder, ddy[0], index0, "dsdy"); + dsdy = lp_build_abs(float_bld, dsdy); + if (dims > 1) { + dtdx = LLVMBuildExtractElement(bld->builder, ddx[1], index0, "dtdx"); + dtdx = lp_build_abs(float_bld, dtdx); + dtdy = LLVMBuildExtractElement(bld->builder, ddy[1], index0, "dtdy"); + dtdy = lp_build_abs(float_bld, dtdy); + if (dims > 2) { + drdx = LLVMBuildExtractElement(bld->builder, ddx[2], index0, "drdx"); + drdx = lp_build_abs(float_bld, drdx); + drdy = LLVMBuildExtractElement(bld->builder, ddy[2], index0, "drdy"); + drdy = lp_build_abs(float_bld, drdy); + } + } + + /* Compute rho = max of all partial derivatives scaled by texture size. + * XXX this could be vectorized somewhat + */ + rho = LLVMBuildFMul(bld->builder, + lp_build_max(float_bld, dsdx, dsdy), + lp_build_int_to_float(float_bld, width), ""); + if (dims > 1) { + LLVMValueRef max; + max = LLVMBuildFMul(bld->builder, + lp_build_max(float_bld, dtdx, dtdy), + lp_build_int_to_float(float_bld, height), ""); + rho = lp_build_max(float_bld, rho, max); + if (dims > 2) { + max = LLVMBuildFMul(bld->builder, + lp_build_max(float_bld, drdx, drdy), + lp_build_int_to_float(float_bld, depth), ""); + rho = lp_build_max(float_bld, rho, max); + } + } + + return rho; +} + + /** * Generate code to compute texture level of detail (lambda). * \param ddx partial derivatives of (s, t, r, q) with respect to X @@ -180,7 +247,7 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, * XXX: The resulting lod is scalar, so ignore all but the first element of * derivatives, lod_bias, etc that are passed by the shader. */ -LLVMValueRef +void lp_build_lod_selector(struct lp_build_sample_context *bld, unsigned unit, const LLVMValueRef ddx[4], @@ -189,9 +256,18 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, LLVMValueRef explicit_lod, /* optional */ LLVMValueRef width, LLVMValueRef height, - LLVMValueRef depth) + LLVMValueRef depth, + unsigned mip_filter, + LLVMValueRef *out_lod_ipart, + LLVMValueRef *out_lod_fpart) { + struct lp_build_context *float_bld = &bld->float_bld; + LLVMValueRef lod; + + *out_lod_ipart = bld->int_bld.zero; + *out_lod_fpart = bld->float_bld.zero; + if (bld->static_state->min_max_lod_equal) { /* User is forcing sampling from a particular mipmap level. * This is hit during mipmap generation. @@ -199,68 +275,40 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, LLVMValueRef min_lod = bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit); - return min_lod; + lod = min_lod; } else { - struct lp_build_context *float_bld = &bld->float_bld; LLVMValueRef sampler_lod_bias = bld->dynamic_state->lod_bias(bld->dynamic_state, bld->builder, unit); LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0); - LLVMValueRef lod; if (explicit_lod) { lod = LLVMBuildExtractElement(bld->builder, explicit_lod, index0, ""); } else { - const int dims = texture_dims(bld->static_state->target); - LLVMValueRef dsdx, dsdy; - LLVMValueRef dtdx = NULL, dtdy = NULL, drdx = NULL, drdy = NULL; LLVMValueRef rho; - dsdx = LLVMBuildExtractElement(bld->builder, ddx[0], index0, "dsdx"); - dsdx = lp_build_abs(float_bld, dsdx); - dsdy = LLVMBuildExtractElement(bld->builder, ddy[0], index0, "dsdy"); - dsdy = lp_build_abs(float_bld, dsdy); - if (dims > 1) { - dtdx = LLVMBuildExtractElement(bld->builder, ddx[1], index0, "dtdx"); - dtdx = lp_build_abs(float_bld, dtdx); - dtdy = LLVMBuildExtractElement(bld->builder, ddy[1], index0, "dtdy"); - dtdy = lp_build_abs(float_bld, dtdy); - if (dims > 2) { - drdx = LLVMBuildExtractElement(bld->builder, ddx[2], index0, "drdx"); - drdx = lp_build_abs(float_bld, drdx); - drdy = LLVMBuildExtractElement(bld->builder, ddy[2], index0, "drdy"); - drdy = lp_build_abs(float_bld, drdy); - } - } - - /* Compute rho = max of all partial derivatives scaled by texture size. - * XXX this could be vectorized somewhat - */ - rho = LLVMBuildFMul(bld->builder, - lp_build_max(float_bld, dsdx, dsdy), - lp_build_int_to_float(float_bld, width), ""); - if (dims > 1) { - LLVMValueRef max; - max = LLVMBuildFMul(bld->builder, - lp_build_max(float_bld, dtdx, dtdy), - lp_build_int_to_float(float_bld, height), ""); - rho = lp_build_max(float_bld, rho, max); - if (dims > 2) { - max = LLVMBuildFMul(bld->builder, - lp_build_max(float_bld, drdx, drdy), - lp_build_int_to_float(float_bld, depth), ""); - rho = lp_build_max(float_bld, rho, max); - } - } + rho = lp_build_rho(bld, ddx, ddy, width, height, depth); /* compute lod = log2(rho) */ -#if 0 - lod = lp_build_log2(float_bld, rho); -#else - lod = lp_build_fast_log2(float_bld, rho); -#endif + if ((mip_filter == PIPE_TEX_MIPFILTER_NONE || + mip_filter == PIPE_TEX_MIPFILTER_NEAREST) && + !lod_bias && + !bld->static_state->lod_bias_non_zero && + !bld->static_state->apply_max_lod && + !bld->static_state->apply_min_lod) { + *out_lod_ipart = lp_build_ilog2(float_bld, rho); + *out_lod_fpart = bld->float_bld.zero; + return; + } + + if (0) { + lod = lp_build_log2(float_bld, rho); + } + else { + lod = lp_build_fast_log2(float_bld, rho); + } /* add shader lod bias */ if (lod_bias) { @@ -288,9 +336,20 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, lod = lp_build_max(float_bld, lod, min_lod); } - - return lod; } + + if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { + LLVMValueRef ipart = lp_build_ifloor(float_bld, lod); + lp_build_name(ipart, "lod_ipart"); + *out_lod_ipart = ipart; + ipart = LLVMBuildSIToFP(bld->builder, ipart, float_bld->vec_type, ""); + *out_lod_fpart = LLVMBuildFSub(bld->builder, lod, ipart, "lod_fpart"); + } + else { + *out_lod_ipart = lp_build_iround(float_bld, lod); + } + + return; } @@ -304,10 +363,9 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, void lp_build_nearest_mip_level(struct lp_build_sample_context *bld, unsigned unit, - LLVMValueRef lod, + LLVMValueRef lod_ipart, LLVMValueRef *level_out) { - struct lp_build_context *float_bld = &bld->float_bld; struct lp_build_context *int_bld = &bld->int_bld; LLVMValueRef last_level, level; @@ -317,7 +375,7 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld, bld->builder, unit); /* convert float lod to integer */ - level = lp_build_iround(float_bld, lod); + level = lod_ipart; /* clamp level to legal range of levels */ *level_out = lp_build_clamp(int_bld, level, zero, last_level); @@ -332,12 +390,10 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld, void lp_build_linear_mip_levels(struct lp_build_sample_context *bld, unsigned unit, - LLVMValueRef lod, + LLVMValueRef lod_ipart, LLVMValueRef *level0_out, - LLVMValueRef *level1_out, - LLVMValueRef *weight_out) + LLVMValueRef *level1_out) { - struct lp_build_context *float_bld = &bld->float_bld; struct lp_build_context *int_bld = &bld->int_bld; LLVMValueRef last_level, level; @@ -345,7 +401,7 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld, bld->builder, unit); /* convert float lod to integer */ - lp_build_ifloor_fract(float_bld, lod, &level, weight_out); + level = lod_ipart; /* compute level 0 and clamp to legal range of levels */ *level0_out = lp_build_clamp(int_bld, level, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h index bb83ede9314..b019c3fa5e7 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h @@ -274,7 +274,7 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, const struct pipe_sampler_state *sampler); -LLVMValueRef +void lp_build_lod_selector(struct lp_build_sample_context *bld, unsigned unit, const LLVMValueRef ddx[4], @@ -283,7 +283,10 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, LLVMValueRef explicit_lod, /* optional */ LLVMValueRef width, LLVMValueRef height, - LLVMValueRef depth); + LLVMValueRef depth, + unsigned mip_filter, + LLVMValueRef *out_lod_ipart, + LLVMValueRef *out_lod_fpart); void lp_build_nearest_mip_level(struct lp_build_sample_context *bld, @@ -294,10 +297,9 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld, void lp_build_linear_mip_levels(struct lp_build_sample_context *bld, unsigned unit, - LLVMValueRef lod, + LLVMValueRef lod_ipart, LLVMValueRef *level0_out, - LLVMValueRef *level1_out, - LLVMValueRef *weight_out); + LLVMValueRef *level1_out); LLVMValueRef lp_build_get_mipmap_level(struct lp_build_sample_context *bld, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c index 49a6eed615f..8a55681166d 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c @@ -882,13 +882,13 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, LLVMValueRef data_array, LLVMValueRef texel_out[4]) { - struct lp_build_context *float_bld = &bld->float_bld; + struct lp_build_context *int_bld = &bld->int_bld; LLVMBuilderRef builder = bld->builder; const unsigned mip_filter = bld->static_state->min_mip_filter; const unsigned min_filter = bld->static_state->min_img_filter; const unsigned mag_filter = bld->static_state->mag_img_filter; const int dims = texture_dims(bld->static_state->target); - LLVMValueRef lod = NULL, lod_fpart = NULL; + LLVMValueRef lod_ipart = NULL, lod_fpart = NULL; LLVMValueRef ilevel0, ilevel1 = NULL; LLVMValueRef width0_vec = NULL, height0_vec = NULL, depth0_vec = NULL; LLVMValueRef width1_vec = NULL, height1_vec = NULL, depth1_vec = NULL; @@ -936,7 +936,6 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, ddy = face_ddy; } - /* * Compute the level of detail (float). */ @@ -945,9 +944,13 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, /* Need to compute lod either to choose mipmap levels or to * distinguish between minification/magnification with one mipmap level. */ - lod = lp_build_lod_selector(bld, unit, ddx, ddy, - lod_bias, explicit_lod, - width, height, depth); + lp_build_lod_selector(bld, unit, ddx, ddy, + lod_bias, explicit_lod, + width, height, depth, + mip_filter, + &lod_ipart, &lod_fpart); + } else { + lod_ipart = LLVMConstInt(LLVMInt32Type(), 0, 0); } /* @@ -966,30 +969,29 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, * We should be able to set ilevel0 = const(0) but that causes * bad x86 code to be emitted. */ - lod = lp_build_const_elem(bld->coord_bld.type, 0.0); - lp_build_nearest_mip_level(bld, unit, lod, &ilevel0); + assert(lod_ipart); + lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); } else { ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0); } break; case PIPE_TEX_MIPFILTER_NEAREST: - assert(lod); - lp_build_nearest_mip_level(bld, unit, lod, &ilevel0); + assert(lod_ipart); + lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); break; case PIPE_TEX_MIPFILTER_LINEAR: { LLVMValueRef f256 = LLVMConstReal(LLVMFloatType(), 256.0); - LLVMValueRef i255 = lp_build_const_int32(255); + LLVMTypeRef i32_type = LLVMIntType(32); LLVMTypeRef i16_type = LLVMIntType(16); - assert(lod); + assert(lod_fpart); + + lp_build_linear_mip_levels(bld, unit, lod_ipart, &ilevel0, &ilevel1); - lp_build_linear_mip_levels(bld, unit, lod, &ilevel0, &ilevel1, - &lod_fpart); lod_fpart = LLVMBuildFMul(builder, lod_fpart, f256, ""); - lod_fpart = lp_build_ifloor(&bld->float_bld, lod_fpart); - lod_fpart = LLVMBuildAnd(builder, lod_fpart, i255, ""); + lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32_type, ""); lod_fpart = LLVMBuildTrunc(builder, lod_fpart, i16_type, ""); lod_fpart = lp_build_broadcast_scalar(&h16, lod_fpart); @@ -1049,9 +1051,9 @@ lp_build_sample_aos(struct lp_build_sample_context *bld, lp_build_flow_scope_declare(flow_ctx, &packed_lo); lp_build_flow_scope_declare(flow_ctx, &packed_hi); - /* minify = lod > 0.0 */ - minify = LLVMBuildFCmp(builder, LLVMRealUGE, - lod, float_bld->zero, ""); + /* minify = lod >= 0.0 */ + minify = LLVMBuildICmp(builder, LLVMIntSGE, + lod_ipart, int_bld->zero, ""); lp_build_if(&if_ctx, flow_ctx, builder, minify); { diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index d464147371d..4f9bf6763e6 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -884,12 +884,12 @@ lp_build_sample_general(struct lp_build_sample_context *bld, LLVMValueRef data_array, LLVMValueRef *colors_out) { - struct lp_build_context *float_bld = &bld->float_bld; + struct lp_build_context *int_bld = &bld->int_bld; const unsigned mip_filter = bld->static_state->min_mip_filter; const unsigned min_filter = bld->static_state->min_img_filter; const unsigned mag_filter = bld->static_state->mag_img_filter; const int dims = texture_dims(bld->static_state->target); - LLVMValueRef lod = NULL, lod_fpart = NULL; + LLVMValueRef lod_ipart = NULL, lod_fpart = NULL; LLVMValueRef ilevel0, ilevel1 = NULL; LLVMValueRef width0_vec = NULL, height0_vec = NULL, depth0_vec = NULL; LLVMValueRef width1_vec = NULL, height1_vec = NULL, depth1_vec = NULL; @@ -935,9 +935,13 @@ lp_build_sample_general(struct lp_build_sample_context *bld, /* Need to compute lod either to choose mipmap levels or to * distinguish between minification/magnification with one mipmap level. */ - lod = lp_build_lod_selector(bld, unit, ddx, ddy, - lod_bias, explicit_lod, - width, height, depth); + lp_build_lod_selector(bld, unit, ddx, ddy, + lod_bias, explicit_lod, + width, height, depth, + mip_filter, + &lod_ipart, &lod_fpart); + } else { + lod_ipart = LLVMConstInt(LLVMInt32Type(), 0, 0); } /* @@ -950,22 +954,21 @@ lp_build_sample_general(struct lp_build_sample_context *bld, * We should be able to set ilevel0 = const(0) but that causes * bad x86 code to be emitted. */ - lod = lp_build_const_elem(bld->coord_bld.type, 0.0); - lp_build_nearest_mip_level(bld, unit, lod, &ilevel0); + assert(lod_ipart); + lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); } else { ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0); } } else { - assert(lod); + assert(lod_ipart); if (mip_filter == PIPE_TEX_MIPFILTER_NEAREST) { - lp_build_nearest_mip_level(bld, unit, lod, &ilevel0); + lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); } else { assert(mip_filter == PIPE_TEX_MIPFILTER_LINEAR); - lp_build_linear_mip_levels(bld, unit, lod, &ilevel0, &ilevel1, - &lod_fpart); + lp_build_linear_mip_levels(bld, unit, lod_ipart, &ilevel0, &ilevel1); lod_fpart = lp_build_broadcast_scalar(&bld->coord_bld, lod_fpart); } } @@ -1019,9 +1022,9 @@ lp_build_sample_general(struct lp_build_sample_context *bld, lp_build_flow_scope_declare(flow_ctx, &colors_out[2]); lp_build_flow_scope_declare(flow_ctx, &colors_out[3]); - /* minify = lod > 0.0 */ - minify = LLVMBuildFCmp(bld->builder, LLVMRealUGE, - lod, float_bld->zero, ""); + /* minify = lod >= 0.0 */ + minify = LLVMBuildICmp(bld->builder, LLVMIntSGE, + lod_ipart, int_bld->zero, ""); lp_build_if(&if_ctx, flow_ctx, bld->builder, minify); { From 33f88b3492af1f4f7e78c3688d481f1dee189822 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Fonseca?= Date: Wed, 6 Oct 2010 10:09:37 +0100 Subject: [PATCH 029/540] util: Cleanup util_pack_z_stencil and friends. - Handle PIPE_FORMAT_Z32_FLOAT packing correctly. - In the integer version z shouldn't be passed as as double. - Make it clear that the integer versions should only be used for masks. - Make integer type sizes explicit (uint32_t for now, although uint64_t will be necessary later to encode f32_s8_x24). --- src/gallium/auxiliary/util/u_pack_color.h | 50 +++++++++++++---------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h index c90b0fdbc3f..5378f2d782f 100644 --- a/src/gallium/auxiliary/util/u_pack_color.h +++ b/src/gallium/auxiliary/util/u_pack_color.h @@ -434,8 +434,8 @@ util_pack_color(const float rgba[4], enum pipe_format format, union util_color * /* Integer versions of util_pack_z and util_pack_z_stencil - useful for * constructing clear masks. */ -static INLINE uint -util_pack_uint_z(enum pipe_format format, unsigned z) +static INLINE uint32_t +util_pack_mask_z(enum pipe_format format, uint32_t z) { switch (format) { case PIPE_FORMAT_Z16_UNORM: @@ -452,29 +452,32 @@ util_pack_uint_z(enum pipe_format format, unsigned z) case PIPE_FORMAT_S8_USCALED: return 0; default: - debug_print_format("gallium: unhandled format in util_pack_z()", format); + debug_print_format("gallium: unhandled format in util_pack_mask_z()", format); assert(0); return 0; } } -static INLINE uint -util_pack_uint_z_stencil(enum pipe_format format, double z, uint s) +static INLINE uint32_t +util_pack_mask_z_stencil(enum pipe_format format, uint32_t z, uint8_t s) { - unsigned packed = util_pack_uint_z(format, z); - - s &= 0xff; + uint32_t packed = util_pack_mask_z(format, z); switch (format) { case PIPE_FORMAT_Z24_UNORM_S8_USCALED: - return packed | (s << 24); + packed |= (uint32_t)s << 24; + break; case PIPE_FORMAT_S8_USCALED_Z24_UNORM: - return packed | s; + packed |= s; + break; case PIPE_FORMAT_S8_USCALED: - return packed | s; + packed |= s; + break; default: - return packed; + break; } + + return packed; } @@ -482,9 +485,11 @@ util_pack_uint_z_stencil(enum pipe_format format, double z, uint s) /** * Note: it's assumed that z is in [0,1] */ -static INLINE uint +static INLINE uint32_t util_pack_z(enum pipe_format format, double z) { + union fi fui; + if (z == 0.0) return 0; @@ -492,24 +497,25 @@ util_pack_z(enum pipe_format format, double z) case PIPE_FORMAT_Z16_UNORM: if (z == 1.0) return 0xffff; - return (uint) (z * 0xffff); + return (uint32_t) (z * 0xffff); case PIPE_FORMAT_Z32_UNORM: /* special-case to avoid overflow */ if (z == 1.0) return 0xffffffff; - return (uint) (z * 0xffffffff); + return (uint32_t) (z * 0xffffffff); case PIPE_FORMAT_Z32_FLOAT: - return (uint)z; + fui.f = (float)z; + return fui.ui; case PIPE_FORMAT_Z24_UNORM_S8_USCALED: case PIPE_FORMAT_Z24X8_UNORM: if (z == 1.0) return 0xffffff; - return (uint) (z * 0xffffff); + return (uint32_t) (z * 0xffffff); case PIPE_FORMAT_S8_USCALED_Z24_UNORM: case PIPE_FORMAT_X8Z24_UNORM: if (z == 1.0) return 0xffffff00; - return ((uint) (z * 0xffffff)) << 8; + return ((uint32_t) (z * 0xffffff)) << 8; case PIPE_FORMAT_S8_USCALED: /* this case can get it via util_pack_z_stencil() */ return 0; @@ -525,14 +531,14 @@ util_pack_z(enum pipe_format format, double z) * Pack Z and/or stencil values into a 32-bit value described by format. * Note: it's assumed that z is in [0,1] and s in [0,255] */ -static INLINE uint -util_pack_z_stencil(enum pipe_format format, double z, uint s) +static INLINE uint32_t +util_pack_z_stencil(enum pipe_format format, double z, uint8_t s) { - unsigned packed = util_pack_z(format, z); + uint32_t packed = util_pack_z(format, z); switch (format) { case PIPE_FORMAT_Z24_UNORM_S8_USCALED: - packed |= s << 24; + packed |= (uint32_t)s << 24; break; case PIPE_FORMAT_S8_USCALED_Z24_UNORM: packed |= s; From 9fe510ef35a783a244d0d54baa50f959a6b781dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Fonseca?= Date: Wed, 6 Oct 2010 10:11:15 +0100 Subject: [PATCH 030/540] llvmpipe: Cleanup depth-stencil clears. Only cosmetic changes. No actual practical difference. --- src/gallium/drivers/llvmpipe/lp_rast.c | 34 ++++++++++++++++++------- src/gallium/drivers/llvmpipe/lp_rast.h | 4 +-- src/gallium/drivers/llvmpipe/lp_setup.c | 11 +++++--- 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c index d7e6415e139..790d88a7450 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.c +++ b/src/gallium/drivers/llvmpipe/lp_rast.c @@ -211,8 +211,8 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { const struct lp_scene *scene = task->scene; - unsigned clear_value = arg.clear_zstencil.value; - unsigned clear_mask = arg.clear_zstencil.mask; + uint32_t clear_value = arg.clear_zstencil.value; + uint32_t clear_mask = arg.clear_zstencil.mask; const unsigned height = TILE_SIZE / TILE_VECTOR_HEIGHT; const unsigned width = TILE_SIZE * TILE_VECTOR_HEIGHT; const unsigned block_size = scene->zsbuf.blocksize; @@ -220,7 +220,8 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task, uint8_t *dst; unsigned i, j; - LP_DBG(DEBUG_RAST, "%s 0x%x%x\n", __FUNCTION__, clear_value, clear_mask); + LP_DBG(DEBUG_RAST, "%s: value=0x%08x, mask=0x%08x\n", + __FUNCTION__, clear_value, clear_mask); /* * Clear the aera of the swizzled depth/depth buffer matching this tile, in @@ -232,16 +233,31 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task, dst = task->depth_tile; + clear_value &= clear_mask; + switch (block_size) { case 1: + assert(clear_mask == 0xff); memset(dst, (uint8_t) clear_value, height * width); break; case 2: - for (i = 0; i < height; i++) { - uint16_t *row = (uint16_t *)dst; - for (j = 0; j < width; j++) - *row++ = (uint16_t) clear_value; - dst += dst_stride; + if (clear_mask == 0xffff) { + for (i = 0; i < height; i++) { + uint16_t *row = (uint16_t *)dst; + for (j = 0; j < width; j++) + *row++ = (uint16_t) clear_value; + dst += dst_stride; + } + } + else { + for (i = 0; i < height; i++) { + uint16_t *row = (uint16_t *)dst; + for (j = 0; j < width; j++) { + uint16_t tmp = ~clear_mask & *row; + *row++ = clear_value | tmp; + } + dst += dst_stride; + } } break; case 4: @@ -258,7 +274,7 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task, uint32_t *row = (uint32_t *)dst; for (j = 0; j < width; j++) { uint32_t tmp = ~clear_mask & *row; - *row++ = (clear_value & clear_mask) | tmp; + *row++ = clear_value | tmp; } dst += dst_stride; } diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h index c55b97a9d13..0f62377c072 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.h +++ b/src/gallium/drivers/llvmpipe/lp_rast.h @@ -149,8 +149,8 @@ union lp_rast_cmd_arg { const struct lp_rast_state *set_state; uint8_t clear_color[4]; struct { - unsigned value; - unsigned mask; + uint32_t value; + uint32_t mask; } clear_zstencil; struct lp_fence *fence; struct llvmpipe_query *query_obj; diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c index 5ff11a33632..e72ead0def4 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_setup.c @@ -377,16 +377,19 @@ lp_setup_try_clear( struct lp_setup_context *setup, } if (flags & PIPE_CLEAR_DEPTHSTENCIL) { - unsigned zmask = (flags & PIPE_CLEAR_DEPTH) ? ~0 : 0; - unsigned smask = (flags & PIPE_CLEAR_STENCIL) ? ~0 : 0; + uint32_t zmask = (flags & PIPE_CLEAR_DEPTH) ? ~0 : 0; + uint32_t smask = (flags & PIPE_CLEAR_STENCIL) ? ~0 : 0; zsvalue = util_pack_z_stencil(setup->fb.zsbuf->format, depth, stencil); - zsmask = util_pack_uint_z_stencil(setup->fb.zsbuf->format, + + zsmask = util_pack_mask_z_stencil(setup->fb.zsbuf->format, zmask, smask); + + zsvalue &= zsmask; } if (setup->state == SETUP_ACTIVE) { @@ -431,7 +434,7 @@ lp_setup_try_clear( struct lp_setup_context *setup, if (flags & PIPE_CLEAR_COLOR) { memcpy(setup->clear.color.clear_color, &color_arg, - sizeof color_arg); + sizeof setup->clear.color.clear_color); } } From 5eeaf3671e2f913d38187fd1401c4b22a2900d57 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 6 Oct 2010 11:25:05 -0700 Subject: [PATCH 031/540] i965: Fix botch in the header_present case in the new FS. I only set it on the color_regions == 0 case, missing the important case, causing GPU hangs on pre-gen6. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index a380eb44525..6e5ea0782e1 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1920,6 +1920,7 @@ fs_visitor::emit_fb_writes() inst->mlen = nr; if (target == c->key.nr_color_regions - 1) inst->eot = true; + inst->header_present = header_present; } if (c->key.nr_color_regions == 0) { From 5d99b01501128c7179cdd6aa29bc8953d0d81e75 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 6 Oct 2010 10:22:22 -0700 Subject: [PATCH 032/540] i965: Add some clarification of the WECtrl field. --- src/mesa/drivers/dri/i965/brw_defines.h | 21 +++++++++++++++++++-- src/mesa/drivers/dri/i965/brw_disasm.c | 4 ++-- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index f9c12e15558..9633c95ff50 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -501,9 +501,26 @@ #define BRW_MASK_ENABLE 0 #define BRW_MASK_DISABLE 1 -/* Sandybridge is WECtrl (Write enable control) */ +/** @{ + * + * Gen6 has replaced "mask enable/disable" with WECtrl, which is + * effectively the same but much simpler to think about. Now, there + * are two contributors ANDed together to whether channels are + * executed: The predication on the instruction, and the channel write + * enable. + */ +/** + * This is the default value. It means that a channel's write enable is set + * if the per-channel IP is pointing at this instruction. + */ #define BRW_WE_NORMAL 0 -#define BRW_WE_KILL_PRED 1 +/** + * This is used like BRW_MASK_DISABLE, and causes all channels to have + * their write enable set. Note that predication still contributes to + * whether the channel actually gets written. + */ +#define BRW_WE_ALL 1 +/** @} */ #define BRW_OPCODE_MOV 1 #define BRW_OPCODE_SEL 2 diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c index be27f9226c3..12b8f2e4678 100644 --- a/src/mesa/drivers/dri/i965/brw_disasm.c +++ b/src/mesa/drivers/dri/i965/brw_disasm.c @@ -165,8 +165,8 @@ char *accwr[2] = { }; char *wectrl[2] = { - [0] = "WEnormal", - [1] = "WEpredicted" + [0] = "WE_normal", + [1] = "WE_all" }; char *exec_size[8] = { From fe6efc25ed3c1edf26073c4e6b6a3a45c857c1eb Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 6 Oct 2010 11:00:31 -0700 Subject: [PATCH 033/540] i965: Don't do 1/w multiplication in new FS for gen6 Not needed now that we're doing barycentric. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 6e5ea0782e1..7327c3a700e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -810,12 +810,14 @@ fs_visitor::emit_general_interpolation(ir_variable *ir) } attr.reg_offset -= type->vector_elements; - for (unsigned int c = 0; c < type->vector_elements; c++) { - emit(fs_inst(BRW_OPCODE_MUL, - attr, - attr, - this->pixel_w)); - attr.reg_offset++; + if (intel->gen < 6) { + for (unsigned int c = 0; c < type->vector_elements; c++) { + emit(fs_inst(BRW_OPCODE_MUL, + attr, + attr, + this->pixel_w)); + attr.reg_offset++; + } } location++; } From 75270f705f319b0ecf297d1bdd328e52a8a956aa Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 6 Oct 2010 11:04:02 -0700 Subject: [PATCH 034/540] i965: Gen6's sampler messages are the same as Ironlake. This should fix texturing in the new FS backend. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 7327c3a700e..19739901619 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -2054,7 +2054,7 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src) int rlen = 4; uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8; - if (intel->gen == 5) { + if (intel->gen >= 5) { switch (inst->opcode) { case FS_OPCODE_TEX: if (inst->shadow_compare) { From a760b5b509f85991a10400977576afabcedbb3c5 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 6 Oct 2010 11:13:22 -0700 Subject: [PATCH 035/540] i965: Refactor gl_FrontFacing setup out of general variable setup. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 53 ++++++++++++++++------------ 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 19739901619..c5013f05745 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -474,8 +474,9 @@ public: void generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src); void emit_dummy_fs(); - void emit_fragcoord_interpolation(ir_variable *ir); - void emit_general_interpolation(ir_variable *ir); + fs_reg *emit_fragcoord_interpolation(ir_variable *ir); + fs_reg *emit_frontfacing_interpolation(ir_variable *ir); + fs_reg *emit_general_interpolation(ir_variable *ir); void emit_interpolation_setup_gen4(); void emit_interpolation_setup_gen6(); fs_inst *emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate); @@ -721,7 +722,7 @@ fs_visitor::setup_builtin_uniform_values(ir_variable *ir) } } -void +fs_reg * fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) { fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); @@ -761,11 +762,10 @@ fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) /* gl_FragCoord.w: Already set up in emit_interpolation */ emit(fs_inst(BRW_OPCODE_MOV, wpos, this->wpos_w)); - hash_table_insert(this->variable_ht, reg, ir); + return reg; } - -void +fs_reg * fs_visitor::emit_general_interpolation(ir_variable *ir) { fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); @@ -823,7 +823,25 @@ fs_visitor::emit_general_interpolation(ir_variable *ir) } } - hash_table_insert(this->variable_ht, reg, ir); + return reg; +} + +fs_reg * +fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) +{ + fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); + struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); + /* bit 31 is "primitive is back face", so checking < (1 << 31) gives + * us front face + */ + fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, + *reg, + fs_reg(r1_6ud), + fs_reg(1u << 31))); + inst->conditional_mod = BRW_CONDITIONAL_L; + emit(fs_inst(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u))); + + return reg; } void @@ -844,24 +862,15 @@ fs_visitor::visit(ir_variable *ir) if (ir->mode == ir_var_in) { if (!strcmp(ir->name, "gl_FragCoord")) { - emit_fragcoord_interpolation(ir); - return; + reg = emit_fragcoord_interpolation(ir); } else if (!strcmp(ir->name, "gl_FrontFacing")) { - reg = new(this->mem_ctx) fs_reg(this, ir->type); - struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); - /* bit 31 is "primitive is back face", so checking < (1 << 31) gives - * us front face - */ - fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, - *reg, - fs_reg(r1_6ud), - fs_reg(1u << 31))); - inst->conditional_mod = BRW_CONDITIONAL_L; - emit(fs_inst(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u))); + reg = emit_frontfacing_interpolation(ir); } else { - emit_general_interpolation(ir); - return; + reg = emit_general_interpolation(ir); } + assert(reg); + hash_table_insert(this->variable_ht, reg, ir); + return; } if (ir->mode == ir_var_uniform) { From 1fdc8c007ea66b4c9866bf2c679653a005307fa5 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 6 Oct 2010 11:19:48 -0700 Subject: [PATCH 036/540] i965: Add support for gl_FrontFacing on gen6. Fixes glsl1-gl_FrontFacing var (2) with new FS. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 49 ++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index c5013f05745..1ccf695a597 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -830,16 +830,33 @@ fs_reg * fs_visitor::emit_frontfacing_interpolation(ir_variable *ir) { fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); - struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); - /* bit 31 is "primitive is back face", so checking < (1 << 31) gives - * us front face - */ - fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, - *reg, - fs_reg(r1_6ud), - fs_reg(1u << 31))); - inst->conditional_mod = BRW_CONDITIONAL_L; - emit(fs_inst(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u))); + + /* The frontfacing comes in as a bit in the thread payload. */ + if (intel->gen >= 6) { + emit(fs_inst(BRW_OPCODE_ASR, + *reg, + fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), + fs_reg(15))); + emit(fs_inst(BRW_OPCODE_NOT, + *reg, + *reg)); + emit(fs_inst(BRW_OPCODE_AND, + *reg, + *reg, + fs_reg(1))); + } else { + fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type); + struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD); + /* bit 31 is "primitive is back face", so checking < (1 << 31) gives + * us front face + */ + fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, + *reg, + fs_reg(r1_6ud), + fs_reg(1u << 31))); + inst->conditional_mod = BRW_CONDITIONAL_L; + emit(fs_inst(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u))); + } return reg; } @@ -2818,6 +2835,18 @@ fs_visitor::generate_code() case BRW_OPCODE_XOR: brw_XOR(p, dst, src[0], src[1]); break; + case BRW_OPCODE_NOT: + brw_NOT(p, dst, src[0]); + break; + case BRW_OPCODE_ASR: + brw_ASR(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_SHR: + brw_SHR(p, dst, src[0], src[1]); + break; + case BRW_OPCODE_SHL: + brw_SHL(p, dst, src[0], src[1]); + break; case BRW_OPCODE_CMP: brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); From b380531fd40e0876218b1116502bafea7911bd3d Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 6 Oct 2010 12:10:31 -0700 Subject: [PATCH 037/540] i965: Don't assume that WPOS is always provided on gen6 in the new FS. We sensibly only provide it if the FS asks for it. We could actually skip WPOS unless the FS needed WPOS.zw, but that's something for later. Fixes: glsl-texture2d and probably many others. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 1ccf695a597..b93c27ec8ff 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -2258,8 +2258,7 @@ fs_visitor::calculate_urb_setup() /* Figure out where each of the incoming setup attributes lands. */ if (intel->gen >= 6) { for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) { - if (i == FRAG_ATTRIB_WPOS || - (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i))) { + if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) { urb_setup[i] = urb_next++; } } From d3163912c1f15fcb44beb33c5069799d56e1dc16 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 6 Oct 2010 17:29:29 -0700 Subject: [PATCH 038/540] i965: Fix gen6 pointsize handling to match pre-gen6. Fixes point-line-no-cull. Bug #30532 --- src/mesa/drivers/dri/i965/gen6_sf_state.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c index 6c883381958..b2a6bd04e2f 100644 --- a/src/mesa/drivers/dri/i965/gen6_sf_state.c +++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c @@ -125,7 +125,8 @@ upload_sf_state(struct brw_context *brw) } /* _NEW_POINT */ - if (ctx->Point._Attenuated) + if (!(ctx->VertexProgram.PointSizeEnabled || + ctx->Point._Attenuated)) dw4 |= GEN6_SF_USE_STATE_POINT_WIDTH; dw4 |= U_FIXED(CLAMP(ctx->Point.Size, 0.125, 225.875), 3) << From da495ee8708d655742eff7b0cf1fee8f71ed10e9 Mon Sep 17 00:00:00 2001 From: Chia-I Wu Date: Thu, 7 Oct 2010 12:06:07 +0800 Subject: [PATCH 039/540] targets/egl: Fix linking with libdrm. --- src/gallium/targets/egl/Makefile | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/gallium/targets/egl/Makefile b/src/gallium/targets/egl/Makefile index 47c24cefe5c..38e60dbafbf 100644 --- a/src/gallium/targets/egl/Makefile +++ b/src/gallium/targets/egl/Makefile @@ -24,7 +24,9 @@ common_CPPFLAGS := \ -I$(TOP)/src/gallium/auxiliary \ -I$(TOP)/src/gallium/drivers \ -I$(TOP)/src/gallium/include \ - -I$(TOP)/src/gallium/winsys + -I$(TOP)/src/gallium/winsys \ + $(LIBDRM_CFLAGS) + common_SYS := common_LIBS := \ $(TOP)/src/gallium/drivers/identity/libidentity.a \ @@ -41,11 +43,11 @@ egl_SYS := -lm $(DLOPEN_LIBS) -L$(TOP)/$(LIB_DIR) -lEGL egl_LIBS := $(TOP)/src/gallium/state_trackers/egl/libegl.a ifneq ($(findstring x11, $(EGL_PLATFORMS)),) -egl_SYS += -lX11 -lXext -lXfixes +egl_SYS += -lX11 -lXext -lXfixes $(LIBDRM_LIB) egl_LIBS += $(TOP)/src/gallium/winsys/sw/xlib/libws_xlib.a endif -ifneq ($(findstring kms, $(EGL_PLATFORMS)),) -egl_SYS += -ldrm +ifneq ($(findstring drm, $(EGL_PLATFORMS)),) +egl_SYS += $(LIBDRM_LIB) endif ifneq ($(findstring fbdev, $(EGL_PLATFORMS)),) egl_LIBS += $(TOP)/src/gallium/winsys/sw/fbdev/libfbdev.a From b2c0ef8b51ce86388335e83f2390940cb8fbc12f Mon Sep 17 00:00:00 2001 From: Chia-I Wu Date: Thu, 7 Oct 2010 12:14:38 +0800 Subject: [PATCH 040/540] st/vega: Fix version check in context creation. This fixes a regression since 4531356817ec8383ac35932903773de67af92e37. --- src/gallium/state_trackers/vega/vg_manager.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/state_trackers/vega/vg_manager.c b/src/gallium/state_trackers/vega/vg_manager.c index e7996741d14..232deefa166 100644 --- a/src/gallium/state_trackers/vega/vg_manager.c +++ b/src/gallium/state_trackers/vega/vg_manager.c @@ -352,7 +352,7 @@ vg_api_create_context(struct st_api *stapi, struct st_manager *smapi, return NULL; /* only 1.0 is supported */ - if (attribs->major != 1 || attribs->minor > 0) + if (attribs->major > 1 || (attribs->major == 1 && attribs->minor > 0)) return NULL; pipe = smapi->screen->context_create(smapi->screen, NULL); From 84457701b05ef29126d90c2fe72083278d26bd4f Mon Sep 17 00:00:00 2001 From: Andre Maasikas Date: Wed, 6 Oct 2010 21:14:15 +0300 Subject: [PATCH 041/540] r600g: fix evergreen interpolation setup interp data is stored in gpr0 so first interp overwrote it and subsequent ones got wrong values reserve register 0 so it's not used for attribs. alternative is to interpolate attrib0 last (reverse, as r600c does) --- src/gallium/drivers/r600/r600_shader.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 016e75bd52b..1f79c15a2b5 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -529,6 +529,9 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s if (ctx.type == TGSI_PROCESSOR_VERTEX) { ctx.file_offset[TGSI_FILE_INPUT] = 1; } + if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chiprev == 2) { + ctx.file_offset[TGSI_FILE_INPUT] = 1; + } ctx.file_offset[TGSI_FILE_OUTPUT] = ctx.file_offset[TGSI_FILE_INPUT] + ctx.info.file_count[TGSI_FILE_INPUT]; ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] + From 97eea87bde5d05f247580aeb2963ac2476417bd5 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 7 Oct 2010 15:13:09 +1000 Subject: [PATCH 042/540] r600g: use format from the sampler view not from the texture. we want to use the format from the sampler view which isn't always the same as the texture format when creating sampler views. --- src/gallium/drivers/r600/evergreen_state.c | 6 +++--- src/gallium/drivers/r600/r600_state.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 603a8a7b915..0fd1d399518 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -424,15 +424,15 @@ static struct pipe_sampler_view *evergreen_create_sampler_view(struct pipe_conte swizzle[1] = state->swizzle_g; swizzle[2] = state->swizzle_b; swizzle[3] = state->swizzle_a; - format = r600_translate_texformat(texture->format, + format = r600_translate_texformat(state->format, swizzle, &word4, &yuv_format); if (format == ~0) { format = 0; } - desc = util_format_description(texture->format); + desc = util_format_description(state->format); if (desc == NULL) { - R600_ERR("unknow format %d\n", texture->format); + R600_ERR("unknow format %d\n", state->format); } tmp = (struct r600_resource_texture*)texture; rbuffer = &tmp->resource; diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index b55c3450059..7ceedf6363b 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -626,15 +626,15 @@ static struct pipe_sampler_view *r600_create_sampler_view(struct pipe_context *c swizzle[1] = state->swizzle_g; swizzle[2] = state->swizzle_b; swizzle[3] = state->swizzle_a; - format = r600_translate_texformat(texture->format, + format = r600_translate_texformat(state->format, swizzle, &word4, &yuv_format); if (format == ~0) { format = 0; } - desc = util_format_description(texture->format); + desc = util_format_description(state->format); if (desc == NULL) { - R600_ERR("unknow format %d\n", texture->format); + R600_ERR("unknow format %d\n", state->format); } tmp = (struct r600_resource_texture*)texture; rbuffer = &tmp->resource; From 51f9cc4759c23b74a2e4d9c79b0a5df27d403f54 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 7 Oct 2010 15:32:05 +1000 Subject: [PATCH 043/540] r600g: fix Z export enable bits. we should be checking output array not input to decide. Signed-off-by: Dave Airlie --- src/gallium/drivers/r600/r600_shader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 1f79c15a2b5..366d5d9c351 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -132,7 +132,7 @@ static void r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shade r600_pipe_state_add_reg(rstate, R_028644_SPI_PS_INPUT_CNTL_0 + i * 4, tmp, 0xFFFFFFFF, NULL); } for (i = 0; i < rshader->noutput; i++) { - if (rshader->input[i].name == TGSI_SEMANTIC_POSITION) + if (rshader->output[i].name == TGSI_SEMANTIC_POSITION) r600_pipe_state_add_reg(rstate, R_02880C_DB_SHADER_CONTROL, S_02880C_Z_EXPORT_ENABLE(1), From 321ec1a22468c1f23bbbf7d5c9de53d4687fec0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Fonseca?= Date: Thu, 7 Oct 2010 22:03:59 +0100 Subject: [PATCH 044/540] gallivm: Vectorize the rho computation. --- src/gallium/auxiliary/gallivm/lp_bld_sample.c | 92 +++++++++++-------- src/gallium/auxiliary/gallivm/lp_bld_sample.h | 13 ++- .../auxiliary/gallivm/lp_bld_sample_soa.c | 24 +++++ 3 files changed, 92 insertions(+), 37 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c index 3287cf7c376..a6a64f38c84 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -171,9 +171,6 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, * Generate code to compute coordinate gradient (rho). * \param ddx partial derivatives of (s, t, r, q) with respect to X * \param ddy partial derivatives of (s, t, r, q) with respect to Y - * \param width scalar int texture width - * \param height scalar int texture height - * \param depth scalar int texture depth * * XXX: The resulting rho is scalar, so we ignore all but the first element of * derivatives that are passed by the shader. @@ -181,52 +178,75 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, static LLVMValueRef lp_build_rho(struct lp_build_sample_context *bld, const LLVMValueRef ddx[4], - const LLVMValueRef ddy[4], - LLVMValueRef width, - LLVMValueRef height, - LLVMValueRef depth) + const LLVMValueRef ddy[4]) { + struct lp_build_context *float_size_bld = &bld->float_size_bld; struct lp_build_context *float_bld = &bld->float_bld; const int dims = texture_dims(bld->static_state->target); - LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0); - LLVMValueRef dsdx, dsdy; - LLVMValueRef dtdx = NULL, dtdy = NULL, drdx = NULL, drdy = NULL; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); + LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0); + LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0); + LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy; + LLVMValueRef rho_x, rho_y; + LLVMValueRef rho_vec; + LLVMValueRef float_size; LLVMValueRef rho; dsdx = LLVMBuildExtractElement(bld->builder, ddx[0], index0, "dsdx"); - dsdx = lp_build_abs(float_bld, dsdx); dsdy = LLVMBuildExtractElement(bld->builder, ddy[0], index0, "dsdy"); - dsdy = lp_build_abs(float_bld, dsdy); - if (dims > 1) { + + if (dims <= 1) { + rho_x = dsdx; + rho_y = dsdy; + } + else { + rho_x = float_size_bld->undef; + rho_y = float_size_bld->undef; + + rho_x = LLVMBuildInsertElement(bld->builder, rho_x, dsdx, index0, ""); + rho_y = LLVMBuildInsertElement(bld->builder, rho_y, dsdy, index0, ""); + dtdx = LLVMBuildExtractElement(bld->builder, ddx[1], index0, "dtdx"); - dtdx = lp_build_abs(float_bld, dtdx); dtdy = LLVMBuildExtractElement(bld->builder, ddy[1], index0, "dtdy"); - dtdy = lp_build_abs(float_bld, dtdy); - if (dims > 2) { + + rho_x = LLVMBuildInsertElement(bld->builder, rho_x, dtdx, index1, ""); + rho_y = LLVMBuildInsertElement(bld->builder, rho_y, dtdy, index1, ""); + + if (dims >= 3) { drdx = LLVMBuildExtractElement(bld->builder, ddx[2], index0, "drdx"); - drdx = lp_build_abs(float_bld, drdx); drdy = LLVMBuildExtractElement(bld->builder, ddy[2], index0, "drdy"); - drdy = lp_build_abs(float_bld, drdy); + + rho_x = LLVMBuildInsertElement(bld->builder, rho_x, drdx, index2, ""); + rho_y = LLVMBuildInsertElement(bld->builder, rho_y, drdy, index2, ""); } } - /* Compute rho = max of all partial derivatives scaled by texture size. - * XXX this could be vectorized somewhat - */ - rho = LLVMBuildFMul(bld->builder, - lp_build_max(float_bld, dsdx, dsdy), - lp_build_int_to_float(float_bld, width), ""); - if (dims > 1) { - LLVMValueRef max; - max = LLVMBuildFMul(bld->builder, - lp_build_max(float_bld, dtdx, dtdy), - lp_build_int_to_float(float_bld, height), ""); - rho = lp_build_max(float_bld, rho, max); - if (dims > 2) { - max = LLVMBuildFMul(bld->builder, - lp_build_max(float_bld, drdx, drdy), - lp_build_int_to_float(float_bld, depth), ""); - rho = lp_build_max(float_bld, rho, max); + rho_x = lp_build_abs(float_size_bld, rho_x); + rho_y = lp_build_abs(float_size_bld, rho_y); + + rho_vec = lp_build_max(float_size_bld, rho_x, rho_y); + + float_size = lp_build_int_to_float(float_size_bld, bld->uint_size); + + rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size); + + if (dims <= 1) { + rho = rho_vec; + } + else { + if (dims >= 2) { + LLVMValueRef rho_s, rho_t, rho_r; + + rho_s = LLVMBuildExtractElement(bld->builder, rho_vec, index0, ""); + rho_t = LLVMBuildExtractElement(bld->builder, rho_vec, index1, ""); + + rho = lp_build_max(float_bld, rho_s, rho_t); + + if (dims >= 3) { + rho_r = LLVMBuildExtractElement(bld->builder, rho_vec, index0, ""); + rho = lp_build_max(float_bld, rho, rho_r); + } } } @@ -289,7 +309,7 @@ lp_build_lod_selector(struct lp_build_sample_context *bld, else { LLVMValueRef rho; - rho = lp_build_rho(bld, ddx, ddy, width, height, depth); + rho = lp_build_rho(bld, ddx, ddy); /* compute lod = log2(rho) */ if ((mip_filter == PIPE_TEX_MIPFILTER_NONE || diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h index b019c3fa5e7..1dd78a07a24 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h @@ -202,9 +202,20 @@ struct lp_build_sample_context struct lp_type int_coord_type; struct lp_build_context int_coord_bld; + /** Unsigned integer texture size */ + struct lp_type uint_size_type; + struct lp_build_context uint_size_bld; + + /** Unsigned integer texture size */ + struct lp_type float_size_type; + struct lp_build_context float_size_bld; + /** Output texels type and build context */ struct lp_type texel_type; struct lp_build_context texel_bld; + + /** Unsigned vector with texture width, height, depth */ + LLVMValueRef uint_size; }; @@ -241,7 +252,7 @@ apply_sampler_swizzle(struct lp_build_sample_context *bld, } -static INLINE int +static INLINE unsigned texture_dims(enum pipe_texture_target tex) { switch (tex) { diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 4f9bf6763e6..74362abf455 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -1141,7 +1141,9 @@ lp_build_sample_soa(LLVMBuilderRef builder, LLVMValueRef explicit_lod, /* optional */ LLVMValueRef texel_out[4]) { + unsigned dims = texture_dims(static_state->target); struct lp_build_sample_context bld; + LLVMTypeRef i32t = LLVMInt32Type(); LLVMValueRef width, width_vec; LLVMValueRef height, height_vec; LLVMValueRef depth, depth_vec; @@ -1171,6 +1173,9 @@ lp_build_sample_soa(LLVMBuilderRef builder, bld.coord_type = type; bld.uint_coord_type = lp_uint_type(type); bld.int_coord_type = lp_int_type(type); + bld.float_size_type = lp_type_float(32); + bld.float_size_type.length = dims > 1 ? 4 : 1; + bld.uint_size_type = lp_uint_type(bld.float_size_type); bld.texel_type = type; float_vec_type = lp_type_float_vec(32); @@ -1181,6 +1186,8 @@ lp_build_sample_soa(LLVMBuilderRef builder, lp_build_context_init(&bld.coord_bld, builder, bld.coord_type); lp_build_context_init(&bld.uint_coord_bld, builder, bld.uint_coord_type); lp_build_context_init(&bld.int_coord_bld, builder, bld.int_coord_type); + lp_build_context_init(&bld.uint_size_bld, builder, bld.uint_size_type); + lp_build_context_init(&bld.float_size_bld, builder, bld.float_size_type); lp_build_context_init(&bld.texel_bld, builder, bld.texel_type); /* Get the dynamic state */ @@ -1196,6 +1203,23 @@ lp_build_sample_soa(LLVMBuilderRef builder, t = coords[1]; r = coords[2]; + /* width, height, depth as single uint vector */ + if (dims <= 1) { + bld.uint_size = width; + } + else { + bld.uint_size = LLVMBuildInsertElement(builder, bld.uint_size_bld.undef, + width, LLVMConstInt(i32t, 0, 0), ""); + if (dims >= 2) { + bld.uint_size = LLVMBuildInsertElement(builder, bld.uint_size, + height, LLVMConstInt(i32t, 1, 0), ""); + if (dims >= 3) { + bld.uint_size = LLVMBuildInsertElement(builder, bld.uint_size, + depth, LLVMConstInt(i32t, 2, 0), ""); + } + } + } + /* width, height, depth as uint vectors */ width_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, width); height_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, height); From 1d595c7cd4aefc7baf1942626f53bec8f6699f7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20H=C3=B8gsberg?= Date: Thu, 7 Oct 2010 17:03:53 -0400 Subject: [PATCH 045/540] gles2: Add GL_EXT_texture_format_BGRA8888 support --- src/mesa/drivers/dri/intel/intel_extensions_es2.c | 1 + src/mesa/main/APIspec.xml | 11 ++++++++++- src/mesa/main/extensions.c | 4 ++++ src/mesa/main/mtypes.h | 1 + 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/intel/intel_extensions_es2.c b/src/mesa/drivers/dri/intel/intel_extensions_es2.c index 24f64045ef8..ed5db20e38a 100644 --- a/src/mesa/drivers/dri/intel/intel_extensions_es2.c +++ b/src/mesa/drivers/dri/intel/intel_extensions_es2.c @@ -69,6 +69,7 @@ static const char *es2_extensions[] = { "GL_ARB_depth_texture", "GL_EXT_packed_depth_stencil", "GL_EXT_framebuffer_object", + "GL_EXT_texture_format_BGRA8888", #if FEATURE_OES_EGL_image "GL_OES_EGL_image", diff --git a/src/mesa/main/APIspec.xml b/src/mesa/main/APIspec.xml index 4c5fd59d4fb..4dc0b0d4851 100644 --- a/src/mesa/main/APIspec.xml +++ b/src/mesa/main/APIspec.xml @@ -383,6 +383,7 @@ + @@ -458,11 +459,18 @@ - + + + + + + + +