draw: split off all the extra functionality in the vertex shader

This will at least allow us to make the initial gains needed for decent vertex performance much more quickly, and with higher confidence of getting it right. At some later point we can look again at code-generating all the fetch/cliptest/viewport extras in the same block as the vertex shader. For now, we just need to get some decent baseline performance.
2025-12-25 04:20:08 +01:00 · 2008-04-17 23:44:32 +01:00 · 2008-04-17 23:44:32 +01:00 · a773f06e96
commit a773f06e96
parent 01b6354e72
13 changed files with 635 additions and 118 deletions
--- a/src/gallium/auxiliary/draw/Makefile
+++ b/src/gallium/auxiliary/draw/Makefile
@ -20,8 +20,10 @@ C_SOURCES = \
 	draw_pt_fetch_emit.c \
 	draw_pt_fetch_pipeline.c \
 	draw_pt_fetch_shade_pipeline.c \
-	draw_pt_pipeline.c \
+	draw_pt_fetch.c \
+	draw_pt_post_vs.c \
 	draw_pt_emit.c \
+	draw_pt_pipeline.c \
 	draw_pt_elts.c \
 	draw_prim.c \
 	draw_pstipple.c \
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@ -110,6 +110,12 @@ struct draw_context *draw_create( void )

   tgsi_exec_machine_init(&draw->machine);

+   /* FIXME: give this machine thing a proper constructor:
+    */
+   draw->machine.Inputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16);
+   draw->machine.Outputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16);
+
+
   if (!draw_pt_init( draw ))
      goto fail;

@ -155,8 +161,13 @@ void draw_destroy( struct draw_context *draw )
   if (draw->pipeline.rasterize)
      draw->pipeline.rasterize->destroy( draw->pipeline.rasterize );

+   if (draw->machine.Inputs)
+      align_free(draw->machine.Inputs);
+   if (draw->machine.Outputs)
+      align_free(draw->machine.Outputs);
   tgsi_exec_machine_free_data(&draw->machine);
-   
+
+
   if (draw->vs.vertex_cache)
      align_free( draw->vs.vertex_cache ); /* Frees all the vertices. */

@ -265,6 +276,7 @@ draw_set_vertex_elements(struct draw_context *draw,
   draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );

   memcpy(draw->vertex_element, elements, count * sizeof(elements[0]));
+   draw->nr_vertex_elements = count;
 }


@ -463,15 +475,3 @@ boolean draw_get_edgeflag( struct draw_context *draw,
      return 1;
 }

-
-#if 0
-/* Crufty init function.  Fix me.
- */
-boolean draw_init_machine( struct draw_context *draw )
-{
-   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_MAX_ATTRIBS);
-   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_MAX_ATTRIBS);
-   machine->Inputs = ALIGN16_ASSIGN(inputs);
-   machine->Outputs = ALIGN16_ASSIGN(outputs);
-}
-#endif
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@ -224,6 +224,8 @@ struct draw_context
   unsigned nr_vertex_buffers;

   struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
+   unsigned nr_vertex_elements;
+
   struct draw_vertex_shader *vertex_shader;

   boolean identity_viewport;
--- a/src/gallium/auxiliary/draw/draw_pt.h
+++ b/src/gallium/auxiliary/draw/draw_pt.h
@ -112,6 +112,7 @@ struct draw_pt_middle_end {
 * mode...  
 */
 struct vbuf_render;
+struct vertex_header;


 /* Helper functions.
@ -132,25 +133,25 @@ struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit(struct draw_context *d
 */
 void draw_pt_run_pipeline( struct draw_context *draw,
                           unsigned prim,
-                           char *verts,
-                           unsigned vertex_stride,
+                           struct vertex_header *verts,
                           unsigned vertex_count,
+                           unsigned vertex_stride,
                           const ushort *elts,
                           unsigned count );


-/* HW vertex emit:
+/*******************************************************************************
+ * HW vertex emit:
 */
 struct pt_emit;

 void draw_pt_emit_prepare( struct pt_emit *emit,
-			   unsigned prim,
-			   unsigned opt );
+			   unsigned prim );

 void draw_pt_emit( struct pt_emit *emit,
-		   char *verts,
-		   unsigned stride,
+		   const float (*vertex_data)[4],
 		   unsigned vertex_count,
+		   unsigned stride,
 		   const ushort *elts,
 		   unsigned count );

@ -159,6 +160,42 @@ void draw_pt_emit_destroy( struct pt_emit *emit );
 struct pt_emit *draw_pt_emit_create( struct draw_context *draw );


+/*******************************************************************************
+ * API vertex fetch:
+ */
+
+struct pt_fetch;
+void draw_pt_fetch_prepare( struct pt_fetch *fetch,
+			    boolean emit_header,
+			    unsigned vertex_size );
+
+void draw_pt_fetch_run( struct pt_fetch *fetch,
+			const unsigned *elts,
+			unsigned count,
+			char *verts );
+
+void draw_pt_fetch_destroy( struct pt_fetch *fetch );
+
+struct pt_fetch *draw_pt_fetch_create( struct draw_context *draw );
+
+/*******************************************************************************
+ * Post-VS: cliptest, rhw, viewport
+ */
+struct pt_post_vs;
+
+/* Run the post-VS variant chosen by draw_pt_post_vs_prepare() over
+ * 'count' vertices spaced 'stride' bytes apart.  Parameter order
+ * matches the definition in draw_pt_post_vs.c: count first, then stride.
+ */
+boolean draw_pt_post_vs_run( struct pt_post_vs *pvs,
+			     struct vertex_header *pipeline_verts,
+			     unsigned count,
+			     unsigned stride );
+
+void draw_pt_post_vs_prepare( struct pt_post_vs *pvs,
+			      boolean bypass_clipping,
+			      boolean identity_viewport,
+			      boolean opengl );
+
+struct pt_post_vs *draw_pt_post_vs_create( struct draw_context *draw );
+
+void draw_pt_post_vs_destroy( struct pt_post_vs *pvs );


 #endif
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@ -38,16 +38,11 @@ struct pt_emit {
   struct draw_context *draw;

   struct translate *translate;
-
-   unsigned pipeline_vertex_size;
-   unsigned prim;
-   unsigned opt;
 };


 void draw_pt_emit_prepare( struct pt_emit *emit,
-			   unsigned prim,
-			   unsigned opt )
+			   unsigned prim )
 {
   struct draw_context *draw = emit->draw;
   const struct vertex_info *vinfo;
@ -75,8 +70,7 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
      unsigned emit_sz = 0;
      unsigned src_buffer = 0;
      unsigned output_format;
-      unsigned src_offset = (sizeof(struct vertex_header) + 
-			     vinfo->src_index[i] * 4 * sizeof(float) );
+      unsigned src_offset = (vinfo->src_index[i] * 4 * sizeof(float) );


         
@ -139,9 +133,9 @@ void draw_pt_emit_prepare( struct pt_emit *emit,


 void draw_pt_emit( struct pt_emit *emit,
-		   char *verts,
-		   unsigned stride,
+		   const float (*vertex_data)[4],
 		   unsigned vertex_count,
+		   unsigned stride,
 		   const ushort *elts,
 		   unsigned count )
 {
@ -164,7 +158,7 @@ void draw_pt_emit( struct pt_emit *emit,

   translate->set_buffer(translate, 
 			 0, 
-			 verts,
+			 vertex_data,
 			 stride );

   translate->set_buffer(translate, 
--- a/src/gallium/auxiliary/draw/draw_pt_fetch.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c
@ -0,0 +1,175 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "pipe/p_util.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vbuf.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_pt.h"
+#include "translate/translate.h"
+
+
+/* State for the API vertex fetch stage.
+ */
+struct pt_fetch {
+   /* Owning draw context; set by draw_pt_fetch_create(). */
+   struct draw_context *draw;
+
+   /* Current fetch translator; created or replaced by
+    * draw_pt_fetch_prepare() whenever the translate key changes.
+    */
+   struct translate *translate;
+   
+   /* vertex_size passed to the most recent draw_pt_fetch_prepare();
+    * the same value is used there as the translate output stride.
+    */
+   unsigned vertex_size;
+};
+
+
+
+/* Set up the translate object that fetches API vertex elements from
+ * the bound vertex buffers and writes them out as a contiguous run of
+ * float[4] attributes, the layout vertex_shader->run_linear() expects.
+ *
+ * Every path uses this except pure passthrough
+ * (draw_pt_fetch_emit.c), which has its own version translating
+ * directly to hw vertices.
+ *
+ * fetch        fetch stage state.
+ * emit_header  when set, emit/leave space for a vertex header in
+ *              front of the attributes.
+ * vertex_size  size of one output vertex in bytes; also used as the
+ *              translate output stride.
+ */
+void draw_pt_fetch_prepare( struct pt_fetch *fetch,
+                            boolean emit_header,
+                            unsigned vertex_size )
+{
+   struct draw_context *draw = fetch->draw;
+   struct translate_key key;
+   unsigned out_offset = 0;
+   unsigned nr_elements = 0;
+   unsigned attr;
+   boolean rebuild;
+
+   fetch->vertex_size = vertex_size;
+
+   /* Start from an all-zero key: it is compared bytewise with the
+    * previous key further down.
+    */
+   memset(&key, 0, sizeof(key));
+
+   /* Post-shader vertices carry a vertex header, so emit/leave space
+    * for one when asked to.
+    *
+    * It's worth considering whether the vertex headers should contain
+    * a pointer to the 'data', rather than having it inline.
+    * Something to look at after we've fully switched over to the pt
+    * paths.
+    */
+   if (emit_header) {
+      /* Need to set header->vertex_id = 0xffff somehow: copy the first
+       * word of a template header from an extra buffer slot just past
+       * the real vertex buffers.
+       */
+      key.element[nr_elements].input_format = PIPE_FORMAT_R32_FLOAT;
+      key.element[nr_elements].input_buffer = draw->nr_vertex_buffers;
+      key.element[nr_elements].input_offset = 0;
+      key.element[nr_elements].output_format = PIPE_FORMAT_R32_FLOAT;
+      key.element[nr_elements].output_offset = out_offset;
+      nr_elements++;
+      out_offset += 1 * sizeof(float);
+
+      /* Skip over the clip[] array, leaving it untouched.
+       */
+      out_offset += 4 * sizeof(float);
+   }
+
+   /* One float[4] output slot per API vertex element.
+    */
+   for (attr = 0; attr < draw->nr_vertex_elements; attr++) {
+      key.element[nr_elements].input_format = draw->vertex_element[attr].src_format;
+      key.element[nr_elements].input_buffer = draw->vertex_element[attr].vertex_buffer_index;
+      key.element[nr_elements].input_offset = draw->vertex_element[attr].src_offset;
+      key.element[nr_elements].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+      key.element[nr_elements].output_offset = out_offset;
+
+      nr_elements++;
+      out_offset += 4 * sizeof(float);
+   }
+
+   assert(out_offset <= vertex_size);
+
+   key.nr_elements = nr_elements;
+   key.output_stride = vertex_size;
+
+   /* Don't bother with caching at this stage: keep the existing
+    * translate object only if its key is identical.
+    */
+   rebuild = (!fetch->translate ||
+              memcmp(&fetch->translate->key, &key, sizeof(key)) != 0);
+   if (!rebuild)
+      return;
+
+   if (fetch->translate)
+      fetch->translate->release(fetch->translate);
+
+   fetch->translate = translate_generic_create( &key );
+
+   if (emit_header) {
+      /* Template header, bound with zero stride so every vertex reads
+       * the same bytes.
+       */
+      static struct vertex_header vh = { 0, 0, 0, 0xffff };
+
+      fetch->translate->set_buffer(fetch->translate,
+                                   draw->nr_vertex_buffers,
+                                   &vh,
+                                   0);
+   }
+}
+
+
+
+
+/* Fetch 'count' vertices, selected by 'elts', into 'verts' using the
+ * translate object set up by draw_pt_fetch_prepare().
+ */
+void draw_pt_fetch_run( struct pt_fetch *fetch,
+                        const unsigned *elts,
+                        unsigned count,
+                        char *verts )
+{
+   struct translate *xlate = fetch->translate;
+   struct draw_context *draw = fetch->draw;
+   unsigned buf;
+
+   /* Point the translator at the current user vertex buffers, applying
+    * each buffer's start offset and pitch.
+    */
+   for (buf = 0; buf < draw->nr_vertex_buffers; buf++) {
+      char *base = (char *)draw->user.vbuffer[buf] +
+                   draw->vertex_buffer[buf].buffer_offset;
+
+      xlate->set_buffer(xlate, buf, base, draw->vertex_buffer[buf].pitch);
+   }
+
+   xlate->run_elts(xlate, elts, count, verts);
+}
+
+
+/* Allocate a zeroed fetch stage bound to 'draw'.  Returns NULL if the
+ * allocation fails.
+ */
+struct pt_fetch *draw_pt_fetch_create( struct draw_context *draw )
+{
+   struct pt_fetch *fetch = CALLOC_STRUCT(pt_fetch);
+
+   if (fetch)
+      fetch->draw = draw;
+
+   return fetch;
+}
+
+/* Destroy a fetch stage created by draw_pt_fetch_create().
+ *
+ * Releases the translate object built by draw_pt_fetch_prepare(), if
+ * any, before freeing the stage itself; otherwise it would be leaked.
+ * A NULL 'fetch' is ignored.
+ */
+void draw_pt_fetch_destroy( struct pt_fetch *fetch )
+{
+   if (!fetch)
+      return;
+
+   if (fetch->translate)
+      fetch->translate->release(fetch->translate);
+
+   FREE(fetch);
+}
+
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_pipeline.c
@ -286,9 +286,9 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
    */
   draw_pt_run_pipeline( fpme->draw,
                         fpme->prim,
-                         pipeline_verts,
-                         fpme->pipeline_vertex_size,
+                         (struct vertex_header *)pipeline_verts,
                         fetch_count,
+                         fpme->pipeline_vertex_size,
                         draw_elts,
                         draw_count );
                 
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@ -39,8 +39,11 @@ struct fetch_pipeline_middle_end {
   struct draw_context *draw;

   struct pt_emit *emit;
+   struct pt_fetch *fetch;
+   struct pt_post_vs *post_vs;

-   unsigned pipeline_vertex_size;
+   unsigned vertex_data_offset;
+   unsigned vertex_size;
   unsigned prim;
   unsigned opt;
 };
@ -51,15 +54,43 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
 				    unsigned opt )
 {
   struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
+   struct draw_context *draw = fpme->draw;
+   struct draw_vertex_shader *vs = draw->vertex_shader;
+   unsigned nr = MAX2( vs->info.num_inputs,
+		       vs->info.num_outputs );

   fpme->prim = prim;
   fpme->opt = opt;

-   if (!(opt & PT_PIPELINE)) 
-      draw_pt_emit_prepare( fpme->emit, prim, opt );
+   /* Always leave room for the vertex header whether we need it or
+    * not.  It's hard to get rid of it in particular because of the
+    * viewport code in draw_pt_post_vs.c.  
+    */
+   fpme->vertex_size = sizeof(struct vertex_header) + nr * 4 * sizeof(float);
+
+   
+
+   draw_pt_fetch_prepare( fpme->fetch, 
+			  (opt & (PT_CLIPTEST | PT_PIPELINE)) != 0,
+			  fpme->vertex_size );
+
+   /* XXX: it's not really gl rasterization rules we care about here,
+    * but gl vs dx9 clip spaces.
+    */
+   draw_pt_post_vs_prepare( fpme->post_vs,
+			    draw->rasterizer->bypass_clipping,
+			    draw->identity_viewport,
+			    draw->rasterizer->gl_rasterization_rules );
+			    
+
+   if (!(opt & PT_PIPELINE)) 
+      draw_pt_emit_prepare( fpme->emit, 
+			    prim );
+
+   /* Prepare the shader once here rather than on every call to
+    * fetch_pipeline_run().
+    */
+   vs->prepare(vs, draw);

-   //fpme->pipeline_vertex_size = sizeof(struct vertex_header) + nr * 4 * sizeof(float);
-   fpme->pipeline_vertex_size = MAX_VERTEX_ALLOCATION;
 }


@ -74,44 +105,63 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
   struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
   struct draw_context *draw = fpme->draw;
   struct draw_vertex_shader *shader = draw->vertex_shader;
-   char *pipeline_verts;
-   unsigned pipeline = PT_PIPELINE;
+   unsigned opt = fpme->opt;

-   pipeline_verts = MALLOC(fpme->pipeline_vertex_size *
-			   fetch_count);
+   struct vertex_header *pipeline_verts = 
+      (struct vertex_header *)MALLOC(fpme->vertex_size * fetch_count);

   if (!pipeline_verts) {
      assert(0);
      return;
   }

-
-   /* Shade
+   /* Fetch into our vertex buffer
    */
-   shader->prepare(shader, draw);
+   draw_pt_fetch_run( fpme->fetch,
+		      fetch_elts, 
+		      fetch_count,
+		      (char *)pipeline_verts );

-   if (shader->run(shader, draw, fetch_elts, fetch_count, pipeline_verts,
-		   fpme->pipeline_vertex_size))
+   /* Run the shader, note that this overwrites the data[] parts of
+    * the pipeline verts.  If there is no shader, ie a bypass shader,
+    * then the inputs == outputs, and are already in the correct
+    * place.
+    */
+   if (opt & PT_SHADE)
   {
-      pipeline |= PT_CLIPTEST;
+      shader->run_linear(shader, 
+			 (const float (*)[4])pipeline_verts->data,
+			 (      float (*)[4])pipeline_verts->data,
+			 (const float (*)[4])draw->user.constants,
+			 fetch_count,
+			 fpme->vertex_size,
+			 fpme->vertex_size);
   }

+   if (draw_pt_post_vs_run( fpme->post_vs,
+			    pipeline_verts,
+			    fetch_count,
+			    fpme->vertex_size ))
+   {
+      opt |= PT_PIPELINE;
+   }

   /* Do we need to run the pipeline?
    */
-   if (fpme->opt & pipeline) {
+   if (opt & PT_PIPELINE) {
      draw_pt_run_pipeline( fpme->draw,
                            fpme->prim,
                            pipeline_verts,
-                            fpme->pipeline_vertex_size,
                            fetch_count,
+                            fpme->vertex_size,
                            draw_elts,
                            draw_count );
-   } else {
+   } 
+   else {
      draw_pt_emit( fpme->emit,
-		    pipeline_verts,
-		    fpme->pipeline_vertex_size,
+		    (const float (*)[4])pipeline_verts->data,
 		    fetch_count,
+		    fpme->vertex_size,
 		    draw_elts,
 		    draw_count );
   }
@ -129,6 +179,17 @@ static void fetch_pipeline_finish( struct draw_pt_middle_end *middle )

 static void fetch_pipeline_destroy( struct draw_pt_middle_end *middle )
 {
+   struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
+
+   if (fpme->fetch)
+      draw_pt_fetch_destroy( fpme->fetch );
+
+   if (fpme->emit)
+      draw_pt_emit_destroy( fpme->emit );
+
+   if (fpme->post_vs)
+      draw_pt_post_vs_destroy( fpme->post_vs );
+
   FREE(middle);
 }

@ -146,6 +207,14 @@ struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit( struct draw_context *

   fpme->draw = draw;

+   fpme->fetch = draw_pt_fetch_create( draw );
+   if (!fpme->fetch)
+      goto fail;
+
+   fpme->post_vs = draw_pt_post_vs_create( draw );
+   if (!fpme->post_vs)
+      goto fail;
+
   fpme->emit = draw_pt_emit_create( draw );
   if (!fpme->emit) 
      goto fail;
--- a/src/gallium/auxiliary/draw/draw_pt_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_pipeline.c
@ -117,12 +117,13 @@ void draw_pt_reset_vertex_ids( struct draw_context *draw )
 */
 void draw_pt_run_pipeline( struct draw_context *draw,
                           unsigned prim,
-                           char *verts,
-                           unsigned stride,
+                           struct vertex_header *pipeline_verts,
                           unsigned vertex_count,
+                           unsigned stride,
                           const ushort *elts,
                           unsigned count )
 {
+   char *verts = (char *)pipeline_verts;
   unsigned i;

   draw->pt.pipeline.verts = verts;
--- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c
+++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
@ -0,0 +1,202 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "pipe/p_util.h"
+#include "pipe/p_context.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vbuf.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_pt.h"
+
+/* State for the post-vertex-shader stage (cliptest, rhw, viewport).
+ */
+struct pt_post_vs {
+   /* Owning draw context; set by draw_pt_post_vs_create(). */
+   struct draw_context *draw;
+
+   /* Variant selected by draw_pt_post_vs_prepare() and invoked by
+    * draw_pt_post_vs_run().  Takes the vertex count first, then the
+    * byte stride between vertices.
+    */
+   boolean (*run)( struct pt_post_vs *pvs,
+		struct vertex_header *vertices,
+		unsigned count,
+		unsigned stride );
+};
+
+
+
+/* Build the clip mask for one clip-space position.
+ *
+ * clip   x, y, z, w of the vertex.
+ * plane  clip planes; only entries 6..nr-1 are read here, the first
+ *        six planes being tested with hardwired arithmetic.
+ * nr     total number of planes.
+ *
+ * Returns a mask with one bit set per plane the vertex lies outside.
+ */
+static INLINE unsigned
+compute_clipmask_gl(const float *clip, /*const*/ float plane[][4], unsigned nr)
+{
+   const float x = clip[0];
+   const float y = clip[1];
+   const float z = clip[2];
+   const float w = clip[3];
+   unsigned result = 0x0;
+   unsigned p;
+
+   /* The six hardwired frustum planes come first:
+    */
+   if (-x + w < 0)
+      result |= CLIP_RIGHT_BIT;
+   if ( x + w < 0)
+      result |= CLIP_LEFT_BIT;
+   if (-y + w < 0)
+      result |= CLIP_TOP_BIT;
+   if ( y + w < 0)
+      result |= CLIP_BOTTOM_BIT;
+   if (-z + w < 0)
+      result |= CLIP_FAR_BIT;
+   if ( z + w < 0)
+      result |= CLIP_NEAR_BIT;
+
+   /* Then whatever planes remain beyond those six:
+    */
+   for (p = 6; p < nr; p++) {
+      if (dot4(clip, plane[p]) < 0)
+         result |= (1<<p);
+   }
+
+   return result;
+}
+
+
+/* The common path: cliptest, divide by w, viewport transform.
+ *
+ * An identity viewport is handled here as well, at the cost of a few
+ * wasted instructions.
+ *
+ * For every vertex the position (data[0]) is copied to clip[], the
+ * header fields are filled in and the clip mask is computed.  Only
+ * vertices with an empty clip mask get the divide and viewport
+ * mapping applied in place.
+ *
+ * Returns TRUE if any vertex had a non-zero clip mask.
+ */
+static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs,
+                                             struct vertex_header *vertices,
+                                             unsigned count,
+                                             unsigned stride )
+{
+   struct draw_context *draw = pvs->draw;
+   const float *scale = draw->viewport.scale;
+   const float *trans = draw->viewport.translate;
+   char *cursor = (char *)vertices;
+   unsigned any_clipped = 0;
+   unsigned n;
+
+   for (n = 0; n < count; n++, cursor += stride) {
+      struct vertex_header *vert = (struct vertex_header *)cursor;
+      float *pos = vert->data[0];
+      float rhw;
+
+      vert->clip[0] = pos[0];
+      vert->clip[1] = pos[1];
+      vert->clip[2] = pos[2];
+      vert->clip[3] = pos[3];
+
+      vert->vertex_id = 0xffff;
+      vert->edgeflag = 1;
+      vert->clipmask = compute_clipmask_gl(vert->clip,
+                                           draw->plane,
+                                           draw->nr_planes);
+      any_clipped += vert->clipmask;
+
+      /* Vertices outside a plane keep their clip-space position. */
+      if (vert->clipmask != 0)
+         continue;
+
+      /* divide by w */
+      rhw = 1.0f / pos[3];
+
+      /* Viewport mapping */
+      pos[0] = pos[0] * rhw * scale[0] + trans[0];
+      pos[1] = pos[1] * rhw * scale[1] + trans[1];
+      pos[2] = pos[2] * rhw * scale[2] + trans[2];
+      pos[3] = rhw;
+   }
+
+   return any_clipped != 0;
+}
+
+
+
+/* Variant used when bypass_clipping is set: no cliptest and no rhw
+ * divide, only the viewport scale/translate applied in place to x, y
+ * and z of each vertex position.  Always returns FALSE.
+ */
+static boolean post_vs_viewport( struct pt_post_vs *pvs,
+                                 struct vertex_header *vertices,
+                                 unsigned count,
+                                 unsigned stride )
+{
+   const float *scale = pvs->draw->viewport.scale;
+   const float *trans = pvs->draw->viewport.translate;
+   char *cursor = (char *)vertices;
+   unsigned remaining;
+
+   debug_printf("%s\n", __FUNCTION__);
+
+   for (remaining = count; remaining != 0; remaining--) {
+      float *pos = ((struct vertex_header *)cursor)->data[0];
+      unsigned axis;
+
+      /* Viewport mapping only; w is left alone.
+       */
+      for (axis = 0; axis < 3; axis++)
+         pos[axis] = pos[axis] * scale[axis] + trans[axis];
+
+      cursor += stride;
+   }
+
+   return FALSE;
+}
+
+
+/* Variant used when bypass_clipping is set and the viewport is the
+ * identity: the vertices are left untouched.  Always returns FALSE.
+ */
+static boolean post_vs_none( struct pt_post_vs *pvs,
+                             struct vertex_header *vertices,
+                             unsigned count,
+                             unsigned stride )
+{
+   /* None of the arguments are needed by this variant. */
+   (void) pvs;
+   (void) vertices;
+   (void) count;
+   (void) stride;
+
+   debug_printf("%s\n", __FUNCTION__);
+
+   return FALSE;
+}
+
+/* Apply the variant chosen by draw_pt_post_vs_prepare() to 'count'
+ * vertices spaced 'stride' bytes apart, and pass its result back to
+ * the caller.
+ */
+boolean draw_pt_post_vs_run( struct pt_post_vs *pvs,
+                             struct vertex_header *pipeline_verts,
+                             unsigned count,
+                             unsigned stride )
+{
+   boolean result = pvs->run( pvs, pipeline_verts, count, stride );
+
+   return result;
+}
+
+
+/* Choose the post-VS variant for the current state.
+ *
+ * With clipping enabled the GL cliptest/viewport variant is always
+ * used; the 'opengl' flag is not consulted yet.  With clipping
+ * bypassed, an identity viewport needs no work at all, otherwise only
+ * the viewport transform is applied.
+ */
+void draw_pt_post_vs_prepare( struct pt_post_vs *pvs,
+                              boolean bypass_clipping,
+                              boolean identity_viewport,
+                              boolean opengl )
+{
+   if (!bypass_clipping) {
+      /* Would be conditional on 'opengl' once another clip space is
+       * supported.
+       */
+      pvs->run = post_vs_cliptest_viewport_gl;
+      return;
+   }
+
+   pvs->run = identity_viewport ? post_vs_none : post_vs_viewport;
+}
+
+
+/* Allocate a zeroed post-VS stage bound to 'draw'.  Returns NULL if
+ * the allocation fails.  The run variant is not chosen here; that is
+ * done by draw_pt_post_vs_prepare().
+ */
+struct pt_post_vs *draw_pt_post_vs_create( struct draw_context *draw )
+{
+   struct pt_post_vs *pvs = CALLOC_STRUCT( pt_post_vs );
+
+   if (pvs)
+      pvs->draw = draw;
+
+   return pvs;
+}
+
+/* Free a post-VS stage obtained from draw_pt_post_vs_create().  The
+ * stage holds nothing besides its own allocation.
+ */
+void draw_pt_post_vs_destroy( struct pt_post_vs *pvs )
+{
+   FREE( pvs );
+}
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@ -58,8 +58,10 @@ static void
 vs_exec_prepare( struct draw_vertex_shader *shader,
 		 struct draw_context *draw )
 {
+   struct exec_vertex_shader *evs = exec_vertex_shader(shader);
+
   /* specify the vertex program to interpret/execute */
-   tgsi_exec_machine_bind_shader(&draw->machine,
+   tgsi_exec_machine_bind_shader(evs->machine,
 				 shader->state.tokens,
 				 PIPE_MAX_SAMPLERS,
 				 NULL /*samplers*/ );
@ -84,31 +86,45 @@ vs_exec_run( struct draw_vertex_shader *shader,
 	     void *vOut,
             unsigned vertex_size)
 {
-   struct tgsi_exec_machine *machine = &draw->machine;
+   struct exec_vertex_shader *evs = exec_vertex_shader(shader);
+   struct tgsi_exec_machine *machine = evs->machine;
   unsigned int i, j;
   unsigned int clipped = 0;
-
-   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_MAX_ATTRIBS);
-   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_MAX_ATTRIBS);
+   struct tgsi_exec_vector *outputs = 0;
   const float *scale = draw->viewport.scale;
   const float *trans = draw->viewport.translate;

   assert(shader->info.output_semantic_name[0] == TGSI_SEMANTIC_POSITION);

   machine->Consts = (const float (*)[4]) draw->user.constants;
-   machine->Inputs = ALIGN16_ASSIGN(inputs);
+
   if (draw->rasterizer->bypass_vs) {
      /* outputs are just the inputs */
-      machine->Outputs = machine->Inputs;
+      outputs = machine->Inputs;
   }
   else {
-      machine->Outputs = ALIGN16_ASSIGN(outputs);
+      outputs = machine->Outputs;
   }

   for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
      unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
      draw->vertex_fetch.fetch_func( draw, machine, &elts[i], max_vertices );

+#if 0
+      for (j = 0; j < max_vertices; j++) {
+	 unsigned slot;
+	 debug_printf("%d) Input vert:\n", i + j);
+	 for (slot = 0; slot < shader->info.num_inputs; slot++) {
+	    debug_printf("\t%d: %f %f %f %f\n", slot,
+			 machine->Inputs[slot].xyzw[0].f[j],
+			 machine->Inputs[slot].xyzw[1].f[j],
+			 machine->Inputs[slot].xyzw[2].f[j],
+			 machine->Inputs[slot].xyzw[3].f[j]);
+	 }
+      }
+#endif
+
+
      if (!draw->rasterizer->bypass_vs) {
         /* run interpreter */
         tgsi_exec_machine_run( machine );
@ -127,10 +143,10 @@ vs_exec_run( struct draw_vertex_shader *shader,
          * program as a set of DP4 instructions appended to the
          * user-provided code.
          */
-         x = out->clip[0] = machine->Outputs[0].xyzw[0].f[j];
-         y = out->clip[1] = machine->Outputs[0].xyzw[1].f[j];
-         z = out->clip[2] = machine->Outputs[0].xyzw[2].f[j];
-         w = out->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+         x = out->clip[0] = outputs[0].xyzw[0].f[j];
+         y = out->clip[1] = outputs[0].xyzw[1].f[j];
+         z = out->clip[2] = outputs[0].xyzw[2].f[j];
+         w = out->clip[3] = outputs[0].xyzw[3].f[j];

         if (!draw->rasterizer->bypass_clipping) {
            out->clipmask = compute_clipmask(out->clip, draw->plane,
@ -156,7 +172,8 @@ vs_exec_run( struct draw_vertex_shader *shader,
            out->data[0][2] = z * scale[2] + trans[2];
            out->data[0][3] = w;
         }
-         else {
+         else 
+	 {
            out->data[0][0] = x;
            out->data[0][1] = y;
            out->data[0][2] = z;
@ -167,10 +184,10 @@ vs_exec_run( struct draw_vertex_shader *shader,
          * vertex attrib slots.
          */
         for (slot = 1; slot < draw->num_vs_outputs; slot++) {
-            out->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
-            out->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
-            out->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
-            out->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+            out->data[slot][0] = outputs[slot].xyzw[0].f[j];
+            out->data[slot][1] = outputs[slot].xyzw[1].f[j];
+            out->data[slot][2] = outputs[slot].xyzw[2].f[j];
+            out->data[slot][3] = outputs[slot].xyzw[3].f[j];
         }

 #if 0 /*DEBUG*/
@ -216,12 +233,25 @@ vs_exec_run_linear( struct draw_vertex_shader *shader,
      /* Swizzle inputs.  
       */
      for (j = 0; j < max_vertices; j++) {
+#if 0
+         debug_printf("%d) Input vert:\n", i + j);
+         for (slot = 0; slot < shader->info.num_inputs; slot++) {
+            debug_printf("\t%d: %f %f %f %f\n", slot,
+			 input[slot][0],
+			 input[slot][1],
+			 input[slot][2],
+			 input[slot][3]);
+         }
+#endif
+
         for (slot = 0; slot < shader->info.num_inputs; slot++) {
            machine->Inputs[slot].xyzw[0].f[j] = input[slot][0];
            machine->Inputs[slot].xyzw[1].f[j] = input[slot][1];
            machine->Inputs[slot].xyzw[2].f[j] = input[slot][2];
            machine->Inputs[slot].xyzw[3].f[j] = input[slot][3];
         }
+
+	 input = (const float (*)[4])((const char *)input + input_stride);
      } 

      /* run interpreter */
@ -235,13 +265,23 @@ vs_exec_run_linear( struct draw_vertex_shader *shader,
            output[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
            output[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
            output[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+
         }
+
+#if 0
+	 debug_printf("%d) Post xform vert:\n", i + j);
+	 for (slot = 0; slot < shader->info.num_outputs; slot++) {
+	    debug_printf("\t%d: %f %f %f %f\n", slot,
+			 output[slot][0],
+			 output[slot][1],
+			 output[slot][2],
+			 output[slot][3]);
+         }
+#endif
+
+	 output = (float (*)[4])((char *)output + output_stride);
      } 

-      /* Advance input, output pointers: 
-       */
-      input = (const float (*)[4])((const char *)input + input_stride);
-      output = (float (*)[4])((char *)output + output_stride);
   }
 }

--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@ -47,6 +47,7 @@
 struct draw_llvm_vertex_shader {
   struct draw_vertex_shader base;
   struct gallivm_prog *llvm_prog;
+   struct tgsi_exec_machine *machine;
 };


@ -77,12 +78,9 @@ vs_llvm_run( struct draw_vertex_shader *base,
   struct draw_llvm_vertex_shader *shader =
      (struct draw_llvm_vertex_shader *)base;

-   struct tgsi_exec_machine *machine = &draw->machine;
+   struct tgsi_exec_machine *machine = shader->machine;
   unsigned int j;
   unsigned int clipped = 0;
-
-   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_MAX_ATTRIBS);
-   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_MAX_ATTRIBS);
   const float *scale = draw->viewport.scale;
   const float *trans = draw->viewport.translate;

@ -93,13 +91,12 @@ vs_llvm_run( struct draw_vertex_shader *base,
   /* Consts does not require 16 byte alignment. */
   machine->Consts = (float (*)[4]) draw->user.constants;

-   machine->Inputs = ALIGN16_ASSIGN(inputs);
   if (draw->rasterizer->bypass_vs) {
      /* outputs are just the inputs */
-      machine->Outputs = machine->Inputs;
+      outputs = machine->Inputs;
   }
   else {
-      machine->Outputs = ALIGN16_ASSIGN(outputs);
+      outputs = machine->Outputs;
   }


@ -119,10 +116,10 @@ vs_llvm_run( struct draw_vertex_shader *base,
      unsigned slot;
      float x, y, z, w;

-      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
-      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
-      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
-      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+      x = vOut[j]->clip[0] = outputs[0].xyzw[0].f[j];
+      y = vOut[j]->clip[1] = outputs[0].xyzw[1].f[j];
+      z = vOut[j]->clip[2] = outputs[0].xyzw[2].f[j];
+      w = vOut[j]->clip[3] = outputs[0].xyzw[3].f[j];

      if (!draw->rasterizer->bypass_clipping) {
         vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane,
@ -159,10 +156,10 @@ vs_llvm_run( struct draw_vertex_shader *base,
       * vertex attrib slots.
       */
      for (slot = 1; slot < draw->num_vs_outputs; slot++) {
-         vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
-         vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
-         vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
-         vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+         vOut[j]->data[slot][0] = outputs[slot].xyzw[0].f[j];
+         vOut[j]->data[slot][1] = outputs[slot].xyzw[1].f[j];
+         vOut[j]->data[slot][2] = outputs[slot].xyzw[2].f[j];
+         vOut[j]->data[slot][3] = outputs[slot].xyzw[3].f[j];
      }
   } /* loop over vertices */
   return clipped != 0;
@ -183,7 +180,7 @@ vs_llvm_run_linear( struct draw_vertex_shader *base,
   struct draw_llvm_vertex_shader *shader =
      (struct draw_llvm_vertex_shader *)base;

-   struct tgsi_exec_machine *machine = &draw->machine;
+   struct tgsi_exec_machine *machine = shader->machine;
   unsigned int j;


@ -199,6 +196,8 @@ vs_llvm_run_linear( struct draw_vertex_shader *base,
 	    machine->Inputs[slot].xyzw[2].f[j] = input[slot][2];
 	    machine->Inputs[slot].xyzw[3].f[j] = input[slot][3];
 	 }
+
+	 input = (const float (*)[4])((const char *)input + input_stride);
      } 

      /* run shader */
@ -216,12 +215,9 @@ vs_llvm_run_linear( struct draw_vertex_shader *base,
         output[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
         output[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
         output[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
-      }

-      /* Advance input, output pointers: 
-       */
-      input = (const float (*)[4])((const char *)input + input_stride);
-      output = (float (*)[4])((char *)output + output_stride);
+	 output = (float (*)[4])((char *)output + output_stride);
+      }
   } 
 }

@ -263,6 +259,7 @@ draw_create_vs_llvm(struct draw_context *draw,
   vs->base.run = vs_llvm_run;
   vs->base.run_linear = vs_llvm_run_linear;
   vs->base.delete = vs_llvm_delete;
+   vs->machine = &draw->machine;

   {
      struct gallivm_ir *ir = gallivm_ir_new(GALLIVM_VS);
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@ -91,12 +91,10 @@ vs_sse_run( struct draw_vertex_shader *base,
            unsigned vertex_size )
 {
   struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base;
-   struct tgsi_exec_machine *machine = &draw->machine;
+   struct tgsi_exec_machine *machine = shader->machine;
   unsigned int i, j;
   unsigned int clipped = 0;
-
-   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_MAX_ATTRIBS);
-   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_MAX_ATTRIBS);
+   struct tgsi_exec_vector *outputs = 0;
   const float *scale = draw->viewport.scale;
   const float *trans = draw->viewport.translate;

@ -104,13 +102,13 @@ vs_sse_run( struct draw_vertex_shader *base,

   /* Consts does not require 16 byte alignment. */
   machine->Consts = (const float (*)[4]) draw->user.constants;
-   machine->Inputs = ALIGN16_ASSIGN(inputs);
+
   if (draw->rasterizer->bypass_vs) {
      /* outputs are just the inputs */
-      machine->Outputs = machine->Inputs;
+      outputs = machine->Inputs;
   }
   else {
-      machine->Outputs = ALIGN16_ASSIGN(outputs);
+      outputs = machine->Outputs;
   }

   for (i = 0; i < count; i += SSE_MAX_VERTICES) {
@ -142,10 +140,10 @@ vs_sse_run( struct draw_vertex_shader *base,
         struct vertex_header *out =
            draw_header_from_block(vOut, vertex_size, i + j);

-         x = out->clip[0] = machine->Outputs[0].xyzw[0].f[j];
-         y = out->clip[1] = machine->Outputs[0].xyzw[1].f[j];
-         z = out->clip[2] = machine->Outputs[0].xyzw[2].f[j];
-         w = out->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+         x = out->clip[0] = outputs[0].xyzw[0].f[j];
+         y = out->clip[1] = outputs[0].xyzw[1].f[j];
+         z = out->clip[2] = outputs[0].xyzw[2].f[j];
+         w = out->clip[3] = outputs[0].xyzw[3].f[j];

         if (!draw->rasterizer->bypass_clipping) {
            out->clipmask = compute_clipmask(out->clip, draw->plane,
@ -182,10 +180,10 @@ vs_sse_run( struct draw_vertex_shader *base,
          * vertex attrib slots.
          */
         for (slot = 1; slot < draw->num_vs_outputs; slot++) {
-            out->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
-            out->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
-            out->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
-            out->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+            out->data[slot][0] = outputs[slot].xyzw[0].f[j];
+            out->data[slot][1] = outputs[slot].xyzw[1].f[j];
+            out->data[slot][2] = outputs[slot].xyzw[2].f[j];
+            out->data[slot][3] = outputs[slot].xyzw[3].f[j];
         }
 #if 0 /*DEBUG*/
         printf("%d) Post xform vert:\n", i + j);
@ -233,6 +231,8 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
            machine->Inputs[slot].xyzw[2].f[j] = input[slot][2];
            machine->Inputs[slot].xyzw[3].f[j] = input[slot][3];
         }
+
+	 input = (const float (*)[4])((const char *)input + input_stride);
      } 

      /* run compiled shader
@ -253,12 +253,9 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
            output[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
            output[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
         }
-      } 

-      /* Advance input, output pointers: 
-       */
-      input = (const float (*)[4])((const char *)input + input_stride);
-      output = (float (*)[4])((char *)output + output_stride);
+	 output = (float (*)[4])((char *)output + output_stride);
+      } 
   }
 }

@ -300,6 +297,7 @@ draw_create_vs_sse(struct draw_context *draw,
   vs->base.run = vs_sse_run;
   vs->base.run_linear = vs_sse_run_linear;
   vs->base.delete = vs_sse_delete;
+   vs->machine = &draw->machine;
   
   x86_init_func( &vs->sse2_program );