Merge branch 'const-buffer-changes'

Conflicts: src/mesa/drivers/dri/i965/brw_curbe.c src/mesa/drivers/dri/i965/brw_vs_emit.c src/mesa/drivers/dri/i965/brw_wm_glsl.c
2026-05-08 15:38:09 +02:00 · 2009-05-01 09:37:14 -06:00 · 2009-05-01 09:37:14 -06:00 · b9196c1fa3
commit b9196c1fa3
parent 3f25219c7b dca190e943
19 changed files with 507 additions and 234 deletions
--- a/src/mesa/drivers/dri/i915/i915_context.c
+++ b/src/mesa/drivers/dri/i915/i915_context.c
@ -73,7 +73,7 @@ i915InvalidateState(GLcontext * ctx, GLuint new_state)
         p->params_uptodate = 0;
   }

-   if (new_state & (_NEW_FOG | _NEW_HINT | _NEW_PROGRAM))
+   if (new_state & (_NEW_FOG | _NEW_HINT | _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS))
      i915_update_fog(ctx);
 }

--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@ -245,6 +245,9 @@ struct brw_vs_ouput_sizes {
 };


+/** Number of general purpose registers (VS, WM, etc) */
+#define BRW_MAX_GRF 128
+
 /** Number of texture sampler units */
 #define BRW_MAX_TEX_UNIT 16

@ -450,8 +453,6 @@ struct brw_context

   struct {
      struct brw_state_flags dirty;
-      struct brw_tracked_state **atoms;
-      GLuint nr_atoms;

      GLuint nr_color_regions;
      struct intel_region *color_regions[MAX_DRAW_BUFFERS];
@ -471,7 +472,8 @@ struct brw_context
      int validated_bo_count;
   } state;

-   struct brw_cache cache;
+   struct brw_cache cache;  /** non-surface items */
+   struct brw_cache surface_cache;  /* surface items */
   struct brw_cached_batch_item *cached_batch_items;

   struct {
@ -555,11 +557,6 @@ struct brw_context
      GLuint vs_size;
      GLuint total_size;

-      /* Dynamic tracker which changes to reflect the state referenced
-       * by active fp and vp program parameters:
-       */
-      struct brw_tracked_state tracked_state;
-
      dri_bo *curbe_bo;
      /** Offset within curbe_bo of space for current curbe entry */
      GLuint curbe_offset;
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@ -36,6 +36,7 @@
 #include "main/macros.h"
 #include "main/enums.h"
 #include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
 #include "shader/prog_statevars.h"
 #include "intel_batchbuffer.h"
 #include "intel_regions.h"
@ -188,13 +189,6 @@ static void prepare_constant_buffer(struct brw_context *brw)
   GLfloat *buf;
   GLuint i;

-   /* Update our own dependency flags.  This works because this
-    * function will also be called whenever fp or vp changes.
-    */
-   brw->curbe.tracked_state.dirty.mesa = (_NEW_TRANSFORM|_NEW_PROJECTION);
-   brw->curbe.tracked_state.dirty.mesa |= vp->program.Base.Parameters->StateFlags;
-   brw->curbe.tracked_state.dirty.mesa |= fp->program.Base.Parameters->StateFlags;
-
   if (sz == 0) {
      if (brw->curbe.last_buf) {
 	 free(brw->curbe.last_buf);
@ -363,11 +357,7 @@ update_constant_buffer(struct brw_context *brw,
      }

      if (0) {
-         int i;
-         for (i = 0; i < params->NumParameters; i++) {
-            float *p = params->ParameterValues[i];
-            printf("%d: %f %f %f %f\n", i, p[0], p[1], p[2], p[3]);
-         }
+         _mesa_print_parameter_list(params);
      }
   }
 }
@ -380,7 +370,7 @@ update_vertex_constant_buffer(struct brw_context *brw)
   struct brw_vertex_program *vp =
      (struct brw_vertex_program *) brw->vertex_program;
   if (0) {
-      printf("update VS constants in buffer %p\n", vp->const_buffer);
+      printf("update VS constants in buffer %p  vp = %p\n", vp->const_buffer, vp);
      printf("program %u\n", vp->program.Base.Id);
   }
   if (vp->use_const_buffer)
@ -394,6 +384,10 @@ update_fragment_constant_buffer(struct brw_context *brw)
 {
   struct brw_fragment_program *fp =
      (struct brw_fragment_program *) brw->fragment_program;
+   if (0) {
+      printf("update WM constants in buffer %p\n", fp->const_buffer);
+      printf("program %u\n", fp->program.Base.Id);
+   }
   if (fp->use_const_buffer)
      update_constant_buffer(brw, fp->program.Base.Parameters, fp->const_buffer);
 }
@ -428,7 +422,7 @@ static void emit_constant_buffer(struct brw_context *brw)
 */
 const struct brw_tracked_state brw_constant_buffer = {
   .dirty = {
-      .mesa = (_NEW_TRANSFORM|_NEW_PROJECTION),      /* plus fp and vp flags */
+      .mesa = _NEW_PROGRAM_CONSTANTS,
      .brw  = (BRW_NEW_FRAGMENT_PROGRAM |
 	       BRW_NEW_VERTEX_PROGRAM |
 	       BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@ -135,8 +135,8 @@ dri_bo *brw_search_cache( struct brw_cache *cache,
 			  void *aux_return);
 void brw_state_cache_check_size( struct brw_context *brw );

-void brw_init_cache( struct brw_context *brw );
-void brw_destroy_cache( struct brw_context *brw );
+void brw_init_caches( struct brw_context *brw );
+void brw_destroy_caches( struct brw_context *brw );

 /***********************************************************************
 * brw_state_batch.c
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@ -56,9 +56,9 @@
 * incorrect program is run for the other instance.
 */

+#include "main/imports.h"
 #include "brw_state.h"
 #include "intel_batchbuffer.h"
-#include "main/imports.h"

 /* XXX: Fixme - have to include these to get the sizes of the prog_key
 * structs:
@ -69,8 +69,10 @@
 #include "brw_sf.h"
 #include "brw_gs.h"

-static GLuint hash_key( const void *key, GLuint key_size,
-			dri_bo **reloc_bufs, GLuint nr_reloc_bufs)
+
+static GLuint
+hash_key(const void *key, GLuint key_size,
+         dri_bo **reloc_bufs, GLuint nr_reloc_bufs)
 {
   GLuint *ikey = (GLuint *)key;
   GLuint hash = 0, i;
@ -95,6 +97,7 @@ static GLuint hash_key( const void *key, GLuint key_size,
   return hash;
 }

+
 /**
 * Marks a new buffer as being chosen for the given cache id.
 */
@ -111,6 +114,7 @@ update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id,
   cache->brw->state.dirty.cache |= 1 << cache_id;
 }

+
 static struct brw_cache_item *
 search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
 	     GLuint hash, const void *key, GLuint key_size,
@ -143,7 +147,8 @@ search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
 }


-static void rehash( struct brw_cache *cache )
+static void
+rehash(struct brw_cache *cache)
 {
   struct brw_cache_item **items;
   struct brw_cache_item *c, *next;
@ -164,15 +169,17 @@ static void rehash( struct brw_cache *cache )
   cache->size = size;
 }

+
 /**
 * Returns the buffer object matching cache_id and key, or NULL.
 */
-dri_bo *brw_search_cache( struct brw_cache *cache,
-			  enum brw_cache_id cache_id,
-			  const void *key,
-			  GLuint key_size,
-			  dri_bo **reloc_bufs, GLuint nr_reloc_bufs,
-			  void *aux_return )
+dri_bo *
+brw_search_cache(struct brw_cache *cache,
+                 enum brw_cache_id cache_id,
+                 const void *key,
+                 GLuint key_size,
+                 dri_bo **reloc_bufs, GLuint nr_reloc_bufs,
+                 void *aux_return)
 {
   struct brw_cache_item *item;
   GLuint hash = hash_key(key, key_size, reloc_bufs, nr_reloc_bufs);
@ -192,6 +199,7 @@ dri_bo *brw_search_cache( struct brw_cache *cache,
   return item->bo;
 }

+
 dri_bo *
 brw_upload_cache( struct brw_cache *cache,
 		  enum brw_cache_id cache_id,
@ -265,7 +273,9 @@ brw_upload_cache( struct brw_cache *cache,
   return bo;
 }

-/* This doesn't really work with aux data.  Use search/upload instead
+
+/**
+ * This doesn't really work with aux data.  Use search/upload instead
 */
 dri_bo *
 brw_cache_data_sz(struct brw_cache *cache,
@ -296,6 +306,7 @@ brw_cache_data_sz(struct brw_cache *cache,
   return bo;
 }

+
 /**
 * Wrapper around brw_cache_data_sz using the cache_id's canonical key size.
 *
@ -319,21 +330,22 @@ enum pool_type {
   DW_GENERAL_STATE
 };

-static void
-brw_init_cache_id( struct brw_context *brw,
-		const char *name,
-		enum brw_cache_id id,
-		GLuint key_size,
-		GLuint aux_size)
-{
-   struct brw_cache *cache = &brw->cache;

+static void
+brw_init_cache_id(struct brw_cache *cache,
+                  const char *name,
+                  enum brw_cache_id id,
+                  GLuint key_size,
+                  GLuint aux_size)
+{
   cache->name[id] = strdup(name);
   cache->key_size[id] = key_size;
   cache->aux_size[id] = aux_size;
 }

-void brw_init_cache( struct brw_context *brw )
+
+static void
+brw_init_non_surface_cache(struct brw_context *brw)
 {
   struct brw_cache *cache = &brw->cache;

@ -342,114 +354,136 @@ void brw_init_cache( struct brw_context *brw )
   cache->size = 7;
   cache->n_items = 0;
   cache->items = (struct brw_cache_item **)
-      _mesa_calloc(cache->size * 
-		   sizeof(struct brw_cache_item));
+      _mesa_calloc(cache->size * sizeof(struct brw_cache_item));

-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CC_VP",
 		     BRW_CC_VP,
 		     sizeof(struct brw_cc_viewport),
 		     0);

-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CC_UNIT",
 		     BRW_CC_UNIT,
 		     sizeof(struct brw_cc_unit_state),
 		     0);

-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "WM_PROG",
 		     BRW_WM_PROG,
 		     sizeof(struct brw_wm_prog_key),
 		     sizeof(struct brw_wm_prog_data));

-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SAMPLER_DEFAULT_COLOR",
 		     BRW_SAMPLER_DEFAULT_COLOR,
 		     sizeof(struct brw_sampler_default_color),
 		     0);

-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SAMPLER",
 		     BRW_SAMPLER,
 		     0,		/* variable key/data size */
 		     0);

-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "WM_UNIT",
 		     BRW_WM_UNIT,
 		     sizeof(struct brw_wm_unit_state),
 		     0);

-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SF_PROG",
 		     BRW_SF_PROG,
 		     sizeof(struct brw_sf_prog_key),
 		     sizeof(struct brw_sf_prog_data));

-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SF_VP",
 		     BRW_SF_VP,
 		     sizeof(struct brw_sf_viewport),
 		     0);

-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SF_UNIT",
 		     BRW_SF_UNIT,
 		     sizeof(struct brw_sf_unit_state),
 		     0);

-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "VS_UNIT",
 		     BRW_VS_UNIT,
 		     sizeof(struct brw_vs_unit_state),
 		     0);

-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "VS_PROG",
 		     BRW_VS_PROG,
 		     sizeof(struct brw_vs_prog_key),
 		     sizeof(struct brw_vs_prog_data));

-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CLIP_UNIT",
 		     BRW_CLIP_UNIT,
 		     sizeof(struct brw_clip_unit_state),
 		     0);

-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CLIP_PROG",
 		     BRW_CLIP_PROG,
 		     sizeof(struct brw_clip_prog_key),
 		     sizeof(struct brw_clip_prog_data));

-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "GS_UNIT",
 		     BRW_GS_UNIT,
 		     sizeof(struct brw_gs_unit_state),
 		     0);

-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "GS_PROG",
 		     BRW_GS_PROG,
 		     sizeof(struct brw_gs_prog_key),
 		     sizeof(struct brw_gs_prog_data));
+}

-   brw_init_cache_id(brw,
+
+static void
+brw_init_surface_cache(struct brw_context *brw)
+{
+   struct brw_cache *cache = &brw->surface_cache;
+
+   cache->brw = brw;
+
+   cache->size = 7;
+   cache->n_items = 0;
+   cache->items = (struct brw_cache_item **)
+      _mesa_calloc(cache->size * sizeof(struct brw_cache_item));
+
+   brw_init_cache_id(cache,
 		     "SS_SURFACE",
 		     BRW_SS_SURFACE,
 		     sizeof(struct brw_surface_state),
 		     0);

-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SS_SURF_BIND",
 		     BRW_SS_SURF_BIND,
 		     0,
 		     0);
 }

+/**
+ * Set up both state caches of the context: brw->cache through
+ * brw_init_non_surface_cache() and brw->surface_cache through
+ * brw_init_surface_cache().  Called from brw_init_state().
+ */
+void
+brw_init_caches(struct brw_context *brw)
+{
+   brw_init_non_surface_cache(brw);
+   brw_init_surface_cache(brw);
+}
+
+
 static void
-brw_clear_cache( struct brw_context *brw )
+brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
 {
   struct brw_cache_item *c, *next;
   GLuint i;
@ -457,8 +491,8 @@ brw_clear_cache( struct brw_context *brw )
   if (INTEL_DEBUG & DEBUG_STATE)
      _mesa_printf("%s\n", __FUNCTION__);

-   for (i = 0; i < brw->cache.size; i++) {
-      for (c = brw->cache.items[i]; c; c = next) {
+   for (i = 0; i < cache->size; i++) {
+      for (c = cache->items[i]; c; c = next) {
 	 int j;

 	 next = c->next;
@ -468,10 +502,10 @@ brw_clear_cache( struct brw_context *brw )
 	 free((void *)c->key);
 	 free(c);
      }
-      brw->cache.items[i] = NULL;
+      cache->items[i] = NULL;
   }

-   brw->cache.n_items = 0;
+   cache->n_items = 0;

   if (brw->curbe.last_buf) {
      _mesa_free(brw->curbe.last_buf);
@ -483,25 +517,40 @@ brw_clear_cache( struct brw_context *brw )
   brw->state.dirty.cache |= ~0;
 }

-void brw_state_cache_check_size( struct brw_context *brw )
+
+void
+brw_state_cache_check_size(struct brw_context *brw)
 {
   /* un-tuned guess.  We've got around 20 state objects for a total of around
    * 32k, so 1000 of them is around 1.5MB.
    */
   if (brw->cache.n_items > 1000)
-      brw_clear_cache(brw);
+      brw_clear_cache(brw, &brw->cache);
+
+   if (brw->surface_cache.n_items > 1000)
+      brw_clear_cache(brw, &brw->surface_cache);
 }

-void brw_destroy_cache( struct brw_context *brw )
+
+static void
+brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
 {
   GLuint i;

-   brw_clear_cache(brw);
+   brw_clear_cache(brw, cache);
   for (i = 0; i < BRW_MAX_CACHE; i++) {
-      dri_bo_unreference(brw->cache.last_bo[i]);
-      free(brw->cache.name[i]);
+      dri_bo_unreference(cache->last_bo[i]);
+      free(cache->name[i]);
   }
-   free(brw->cache.items);
-   brw->cache.items = NULL;
-   brw->cache.size = 0;
+   free(cache->items);
+   cache->items = NULL;
+   cache->size = 0;
+}
+
+/**
+ * Tear down both state caches of the context by running
+ * brw_destroy_cache() on brw->cache and then on brw->surface_cache.
+ * Called from brw_destroy_state().
+ */
+void
+brw_destroy_caches(struct brw_context *brw)
+{
+   brw_destroy_cache(brw, &brw->cache);
+   brw_destroy_cache(brw, &brw->surface_cache);
+}
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@ -59,7 +59,6 @@ const struct brw_tracked_state *atoms[] =
   &brw_curbe_offsets,
   &brw_recalculate_urb_fence,

-
   &brw_cc_vp,
   &brw_cc_unit,

@ -88,54 +87,26 @@ const struct brw_tracked_state *atoms[] =

   &brw_line_stipple,
   &brw_aa_line_parameters,
-   /* Ordering of the commands below is documented as fixed.  
-    */
-#if 0
-   &brw_pipelined_state_pointers,
-   &brw_urb_fence,
-   &brw_constant_buffer_state,
-#else
+
   &brw_psp_urb_cbs,
-#endif

   &brw_drawing_rect,
   &brw_indices,
   &brw_vertices,

-   NULL,			/* brw_constant_buffer */
+   &brw_constant_buffer
 };


 void brw_init_state( struct brw_context *brw )
 {
-   GLuint i;
-
-   brw_init_cache(brw);
-
-   brw->state.atoms = _mesa_malloc(sizeof(atoms));
-   brw->state.nr_atoms = sizeof(atoms)/sizeof(*atoms);
-   _mesa_memcpy(brw->state.atoms, atoms, sizeof(atoms));
-
-   /* Patch in a pointer to the dynamic state atom:
-    */
-   for (i = 0; i < brw->state.nr_atoms; i++)
-      if (brw->state.atoms[i] == NULL)
-	 brw->state.atoms[i] = &brw->curbe.tracked_state;
-
-   _mesa_memcpy(&brw->curbe.tracked_state, 
-		&brw_constant_buffer,
-		sizeof(brw_constant_buffer));
+   brw_init_caches(brw);
 }


 void brw_destroy_state( struct brw_context *brw )
 {
-   if (brw->state.atoms) {
-      _mesa_free(brw->state.atoms);
-      brw->state.atoms = NULL;
-   }
-
-   brw_destroy_cache(brw);
+   brw_destroy_caches(brw);
   brw_destroy_batch_cache(brw);
 }

@ -218,6 +189,7 @@ static struct dirty_bit_map mesa_bits[] = {
   DEFINE_BIT(_NEW_MULTISAMPLE),
   DEFINE_BIT(_NEW_TRACK_MATRIX),
   DEFINE_BIT(_NEW_PROGRAM),
+   DEFINE_BIT(_NEW_PROGRAM_CONSTANTS),
   {0, 0, 0}
 };

@ -336,7 +308,7 @@ void brw_validate_state( struct brw_context *brw )

   /* do prepare stage for all atoms */
   for (i = 0; i < Elements(atoms); i++) {
-      const struct brw_tracked_state *atom = brw->state.atoms[i];
+      const struct brw_tracked_state *atom = atoms[i];

      if (brw->intel.Fallback)
         break;
@ -367,8 +339,8 @@ void brw_upload_state(struct brw_context *brw)
      _mesa_memset(&examined, 0, sizeof(examined));
      prev = *state;

-      for (i = 0; i < brw->state.nr_atoms; i++) {	 
-	 const struct brw_tracked_state *atom = brw->state.atoms[i];
+      for (i = 0; i < Elements(atoms); i++) {	 
+	 const struct brw_tracked_state *atom = atoms[i];
 	 struct brw_state_flags generated;

 	 assert(atom->dirty.mesa ||
@ -397,7 +369,7 @@ void brw_upload_state(struct brw_context *brw)
   }
   else {
      for (i = 0; i < Elements(atoms); i++) {	 
-	 const struct brw_tracked_state *atom = brw->state.atoms[i];
+	 const struct brw_tracked_state *atom = atoms[i];

 	 if (brw->intel.Fallback)
 	    break;
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@ -69,13 +69,18 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 {
   GLuint i, reg = 0, mrf;

-#if 0
-   if (c->vp->program.Base.Parameters->NumParameters >= 6)
-      c->vp->use_const_buffer = 1;
+   /* Determine whether to use a real constant buffer or use a block
+    * of GRF registers for constants.  The latter is faster but only
+    * works if everything fits in the GRF.
+    * XXX this heuristic/check may need some fine tuning...
+    */
+   if (c->vp->program.Base.Parameters->NumParameters +
+       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
+      c->vp->use_const_buffer = GL_TRUE;
   else
-#endif
      c->vp->use_const_buffer = GL_FALSE;
-   /*printf("use_const_buffer = %d\n", c->use_const_buffer);*/
+
+   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@ -240,15 +240,18 @@ struct brw_wm_compile {
   GLuint max_wm_grf;
   GLuint last_scratch;

+   GLuint cur_inst;  /**< index of current instruction */
+
   /** Mapping from Mesa registers to hardware registers */
   struct {
      GLboolean inited;
      struct brw_reg reg;
   } wm_regs[PROGRAM_PAYLOAD+1][256][4];

+   GLboolean used_grf[BRW_WM_MAX_GRF];
+   GLuint first_free_grf;
   struct brw_reg stack;
   struct brw_reg emit_mask_reg;
-   GLuint reg_index;  /**< Index of next free GRF register */
   GLuint tmp_regs[BRW_WM_MAX_GRF];
   GLuint tmp_index;
   GLuint tmp_max;
--- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
@ -1,5 +1,7 @@
 #include "main/macros.h"
 #include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+#include "shader/prog_optimize.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_wm.h"
@ -42,6 +44,76 @@ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
 }


+
+static void
+reclaim_temps(struct brw_wm_compile *c);
+
+
+/**
+ * Mark GRF register as used.
+ * Sets c->used_grf[r] to GL_TRUE; r is not range-checked here and
+ * c->first_free_grf is left untouched.
+ */
+static void
+prealloc_grf(struct brw_wm_compile *c, int r)
+{
+   c->used_grf[r] = GL_TRUE;
+}
+
+
+/**
+ * Mark given GRF register as not in use.
+ * Also lowers c->first_free_grf to r when r is below it, so the next
+ * alloc_grf() scan starts no later than the register just released.
+ * Releasing a register that is not marked used is tolerated (the
+ * assertion below is disabled).
+ */
+static void
+release_grf(struct brw_wm_compile *c, int r)
+{
+   /*assert(c->used_grf[r]);*/
+   c->used_grf[r] = GL_FALSE;
+   c->first_free_grf = MIN2(c->first_free_grf, r);
+}
+
+
+/**
+ * Return index of a free GRF, mark it as used.
+ *
+ * Scans used_grf[] upward from c->first_free_grf.  If that finds nothing,
+ * reclaim_temps() is called and the scan is repeated from register 0.
+ * If every register is still marked used, the hard-coded index 60 is
+ * returned, so the result is never negative.
+ * NOTE(review): get_reg() tests the result for < 0, which cannot happen
+ * with this fallback -- confirm which behaviour is intended.
+ */
+static int
+alloc_grf(struct brw_wm_compile *c)
+{
+   GLuint r;
+   for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
+      if (!c->used_grf[r]) {
+         c->used_grf[r] = GL_TRUE;
+         c->first_free_grf = r + 1;  /* a guess: r + 1 is only where the next scan starts */
+         return r;
+      }
+   }
+
+   /* no free temps, try to reclaim some; the search then restarts at
+    * GRF 0 so that any register released by reclaim_temps() is seen */
+   reclaim_temps(c);
+   c->first_free_grf = 0;
+
+   /* try alloc again */
+   for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
+      if (!c->used_grf[r]) {
+         c->used_grf[r] = GL_TRUE;
+         c->first_free_grf = r + 1;  /* a guess: r + 1 is only where the next scan starts */
+         return r;
+      }
+   }
+
+   for (r = 0; r < BRW_WM_MAX_GRF; r++) {
+      assert(c->used_grf[r]);
+   }
+   /*printf("Really out of temp regs!\n");*/
+   return 60;
+}
+
+
+/**
+ * Return number of GRF registers used.
+ * The result is the index of the highest register marked in used_grf[]
+ * plus one, or 0 when none is marked; unmarked registers below that
+ * index are therefore included in the count.
+ */
+static int
+num_grf_used(const struct brw_wm_compile *c)
+{
+   int r;
+   for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
+      if (c->used_grf[r])
+         return r + 1;
+   return 0;
+}
+
+
+
 /**
 * Record the mapping of a Mesa register to a hardware register.
 */
@ -68,11 +140,18 @@ static int get_scalar_dst_index(const struct prog_instruction *inst)
 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 {
    struct brw_reg reg;
-    if(c->tmp_index == c->tmp_max)
-	c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
-    
+
+    /* if we need to allocate another temp, grow the tmp_regs[] array */
+    if (c->tmp_index == c->tmp_max) {
+       c->tmp_regs[ c->tmp_max++ ] = alloc_grf(c);
+    }
+
+    /* form the GRF register */
    reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
+    /*printf("alloc_temp %d\n", reg.nr);*/
+    assert(reg.nr < BRW_WM_MAX_GRF);
    return reg;
+
 }

 /**
@ -130,35 +209,26 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component,
 	    return brw_null_reg();
    }

+    assert(index < 256);
    /* see if we've already allocated a HW register for this Mesa register */
    if (c->wm_regs[file][index][component].inited) {
-	/* yes, re-use */
-	reg = c->wm_regs[file][index][component].reg;
+       /* yes, re-use */
+       reg = c->wm_regs[file][index][component].reg;
    }
    else {
 	/* no, allocate new register */
-	reg = brw_vec8_grf(c->reg_index, 0);
+       int grf = alloc_grf(c);
+       if (grf < 0) {
+          /* totally out of temps */
+          grf = 70; /* XXX !!!! */
+       }
+
+       reg = brw_vec8_grf(grf, 0);
+       /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
+
+       set_reg(c, file, index, component, reg);
    }

-    /* if this is a new register allocation, record it in the table */
-    if (!c->wm_regs[file][index][component].inited) {
-	set_reg(c, file, index, component, reg);
-	c->reg_index++;
-    }
-
-    if (c->reg_index >= BRW_WM_MAX_GRF - 12) {
-	/* ran out of temporary registers! */
-#if 1
-        /* This is a big hack for now.
-         * Return bad register index, just don't hang the GPU.
-         */
-        _mesa_fprintf(stderr, "out of regs %d\n", c->reg_index);
-        c->reg_index = BRW_WM_MAX_GRF - 13;
-#else
-	return brw_null_reg();
-#endif
-    }
- 
    if (neg & (1 << component)) {
 	reg = negate(reg);
    }
@ -168,6 +238,46 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component,
 }


+
+/**
+ * This is called if we run out of GRF registers.  Examine the live intervals
+ * of temp regs in the program and free those which won't be used again.
+ *
+ * Intervals come from _mesa_find_temp_intervals() over the program's
+ * c->nr_fp_insns instructions.  A temp is treated as dead when its interval
+ * end is not -1 and lies before c->cur_inst.  Only registers recorded in
+ * c->wm_regs[PROGRAM_TEMPORARY] are released; their table entries are
+ * marked uninitialized so a later get_reg() allocates afresh.
+ * NOTE(review): -1 presumably marks a temp with no recorded interval --
+ * confirm against _mesa_find_temp_intervals().
+ * NOTE(review): wm_regs is declared with 256 entries per register file;
+ * confirm MAX_PROGRAM_TEMPS does not exceed that.
+ */
+static void
+reclaim_temps(struct brw_wm_compile *c)
+{
+   GLint intBegin[MAX_PROGRAM_TEMPS];
+   GLint intEnd[MAX_PROGRAM_TEMPS];
+   int index;
+
+   /*printf("Reclaim temps:\n");*/
+
+   _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
+                             intBegin, intEnd);
+
+   for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
+      if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
+         /* program temp[index] can be freed: its last use precedes the
+          * instruction currently being emitted */
+         int component;
+         /*printf("  temp[%d] is dead\n", index);*/
+         for (component = 0; component < 4; component++) {
+            if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
+               int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
+               release_grf(c, r);
+               /*
+               printf("  Reclaim temp %d, reg %d at inst %d\n",
+                      index, r, c->cur_inst);
+               */
+               c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
+            }
+         }
+      }
+   }
+}
+
+
+
+
 /**
 * Preallocate registers.  This sets up the Mesa to hardware register
 * mapping for certain registers, such as constants (uniforms/state vars)
@ -179,6 +289,10 @@ static void prealloc_reg(struct brw_wm_compile *c)
    struct brw_reg reg;
    int nr_interp_regs = 0;
    GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
+    GLuint reg_index = 0;
+
+    memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
+    c->first_free_grf = 0;

    for (i = 0; i < 4; i++) {
        if (i < c->key.nr_depth_regs) 
@ -187,14 +301,20 @@ static void prealloc_reg(struct brw_wm_compile *c)
            reg = brw_vec8_grf(0, 0);
 	set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
    }
-    c->reg_index += 2 * c->key.nr_depth_regs;
+    reg_index += 2 * c->key.nr_depth_regs;

    /* constants */
    {
-        const int nr_params = c->fp->program.Base.Parameters->NumParameters;
+        const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
+        const GLuint nr_temps = c->fp->program.Base.NumTemporaries;

        /* use a real constant buffer, or just use a section of the GRF? */
-        c->fp->use_const_buffer = GL_FALSE; /* (nr_params > 8);*/
+        /* XXX this heuristic may need adjustment... */
+        if ((nr_params + nr_temps) * 4 + reg_index > 80)
+           c->fp->use_const_buffer = GL_TRUE;
+        else
+           c->fp->use_const_buffer = GL_FALSE;
+        /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/

        if (c->fp->use_const_buffer) {
           /* We'll use a real constant buffer and fetch constants from
@ -216,7 +336,7 @@ static void prealloc_reg(struct brw_wm_compile *c)
           for (i = 0; i < nr_params; i++) {
              /* loop over XYZW channels */
              for (j = 0; j < 4; j++, index++) {
-                 reg = brw_vec1_grf(c->reg_index + index / 8, index % 8);
+                 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
                 /* Save pointer to parameter/constant value.
                  * Constants will be copied in prepare_constant_buffer()
                  */
@ -226,7 +346,7 @@ static void prealloc_reg(struct brw_wm_compile *c)
           }
           /* number of constant regs used (each reg is float[8]) */
           c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
-           c->reg_index += c->nr_creg;
+           reg_index += c->nr_creg;
        }
    }

@ -234,20 +354,24 @@ static void prealloc_reg(struct brw_wm_compile *c)
    for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
 	if (inputs & (1<<i)) {
 	    nr_interp_regs++;
-	    reg = brw_vec8_grf(c->reg_index, 0);
+	    reg = brw_vec8_grf(reg_index, 0);
 	    for (j = 0; j < 4; j++)
 		set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
-	    c->reg_index += 2;
+	    reg_index += 2;
 	}
    }

    c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
    c->prog_data.urb_read_length = nr_interp_regs * 2;
    c->prog_data.curb_read_length = c->nr_creg;
-    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
-    c->reg_index++;
-    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
-    c->reg_index += 2;
+    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
+    reg_index++;
+    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
+    reg_index += 2;
+
+    /* mark GRF regs [0..reg_index-1] as in-use */
+    for (i = 0; i < reg_index; i++)
+       prealloc_grf(c, i);

    /* An instruction may reference up to three constants.
     * They'll be found in these registers.
@ -256,7 +380,7 @@ static void prealloc_reg(struct brw_wm_compile *c)
    if (c->fp->use_const_buffer) {
       for (i = 0; i < 3; i++) {
          c->current_const[i].index = -1;
-          c->current_const[i].reg = alloc_tmp(c);
+          c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
       }
    }
 #if 0
@ -2595,7 +2719,6 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
    struct brw_compile *p = &c->func;
    struct brw_indirect stack_index = brw_indirect(0, 0);

-    c->reg_index = 0;
    prealloc_reg(c);
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
@ -2603,6 +2726,8 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
    for (i = 0; i < c->nr_fp_insns; i++) {
        const struct prog_instruction *inst = &c->prog_instructions[i];

+        c->cur_inst = i;
+
 #if 0
        _mesa_printf("Inst %d: ", i);
        _mesa_print_instruction(inst);
@ -2833,17 +2958,13 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		_mesa_printf("unsupported IR in fragment shader %d\n",
 			inst->Opcode);
 	}
+
 	if (inst->CondUpdate)
 	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 	else
 	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
    }
    post_wm_emit(c);
-
-    if (c->reg_index >= BRW_WM_MAX_GRF) {
-        _mesa_problem(NULL, "Ran out of registers in brw_wm_emit_glsl()");
-        /* XXX we need to do some proper error recovery here */
-    }
 }


@ -2867,6 +2988,6 @@ void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
        brw_wm_print_program(c, "brw_wm_glsl_emit done");
    }

-    c->prog_data.total_grf = c->reg_index;
+    c->prog_data.total_grf = num_grf_used(c);
    c->prog_data.total_scratch = 0;
 }
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@ -268,7 +268,7 @@ brw_create_texture_surface( struct brw_context *brw,
      surf.ss0.cube_neg_z = 1;
   }

-   bo = brw_upload_cache(&brw->cache, BRW_SS_SURFACE,
+   bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
 			 key, sizeof(*key),
 			 &key->bo, key->bo ? 1 : 0,
 			 &surf, sizeof(surf),
@ -321,10 +321,11 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
   key.tiling = intelObj->mt->region->tiling;

   dri_bo_unreference(brw->wm.surf_bo[surf]);
-   brw->wm.surf_bo[surf] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
-                                         &key, sizeof(key),
-                                         &key.bo, key.bo ? 1 : 0,
-                                         NULL);
+   brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
+                                            BRW_SS_SURFACE,
+                                            &key, sizeof(key),
+                                            &key.bo, key.bo ? 1 : 0,
+                                            NULL);
   if (brw->wm.surf_bo[surf] == NULL) {
      brw->wm.surf_bo[surf] = brw_create_texture_surface(brw, &key);
   }
@ -362,7 +363,7 @@ brw_create_constant_surface( struct brw_context *brw,
   surf.ss3.pitch = (key->pitch * key->cpp) - 1; /* ignored?? */
   brw_set_surface_tiling(&surf, key->tiling); /* tiling now allowed */
 
-   bo = brw_upload_cache(&brw->cache, BRW_SS_SURFACE,
+   bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
 			 key, sizeof(*key),
 			 &key->bo, key->bo ? 1 : 0,
 			 &surf, sizeof(surf),
@ -427,7 +428,8 @@ brw_update_wm_constant_surface( GLcontext *ctx,
   */

   dri_bo_unreference(brw->wm.surf_bo[surf]);
-   brw->wm.surf_bo[surf] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
+   brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
+                                            BRW_SS_SURFACE,
                                            &key, sizeof(key),
                                            &key.bo, key.bo ? 1 : 0,
                                            NULL);
@ -456,17 +458,14 @@ brw_update_vs_constant_surface( GLcontext *ctx,

   assert(surf == 0);

-   /* free old const buffer if too small */
-   if (const_buffer && const_buffer->size < size) {
-      dri_bo_unreference(const_buffer);
-      const_buffer = NULL;
-   }
+   /* We always create a new VS constant buffer so that several can be
+    * in flight at a time.  Drop our reference to the old one first...
+    */
+   dri_bo_unreference(const_buffer);

-   /* alloc new buffer if needed */
-   if (!const_buffer) {
-      const_buffer =
-         drm_intel_bo_alloc(intel->bufmgr, "vp_const_buffer", size, 64);
-   }
+   /* alloc new buffer */
+   const_buffer =
+      drm_intel_bo_alloc(intel->bufmgr, "vp_const_buffer", size, 64);

   memset(&key, 0, sizeof(key));

@ -487,7 +486,8 @@ brw_update_vs_constant_surface( GLcontext *ctx,
   */

   dri_bo_unreference(brw->vs.surf_bo[surf]);
-   brw->vs.surf_bo[surf] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
+   brw->vs.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
+                                            BRW_SS_SURFACE,
                                            &key, sizeof(key),
                                            &key.bo, key.bo ? 1 : 0,
                                            NULL);
@ -569,10 +569,11 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
   dri_bo_unreference(brw->wm.surf_bo[unit]);
   brw->wm.surf_bo[unit] = NULL;
   if (cached) 
-       brw->wm.surf_bo[unit] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
-	       &key, sizeof(key),
-	       &region_bo, 1,
-	       NULL);
+       brw->wm.surf_bo[unit] = brw_search_cache(&brw->surface_cache,
+                                                BRW_SS_SURFACE,
+                                                &key, sizeof(key),
+                                                &region_bo, 1,
+                                                NULL);

   if (brw->wm.surf_bo[unit] == NULL) {
      struct brw_surface_state surf;
@ -598,7 +599,8 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
      surf.ss0.writedisable_alpha = !key.color_mask[3];

      /* Key size will never match key size for textures, so we're safe. */
-      brw->wm.surf_bo[unit] = brw_upload_cache(&brw->cache, BRW_SS_SURFACE,
+      brw->wm.surf_bo[unit] = brw_upload_cache(&brw->surface_cache,
+                                               BRW_SS_SURFACE,
                                               &key, sizeof(key),
 					       &region_bo, 1,
 					       &surf, sizeof(surf),
@ -630,7 +632,7 @@ brw_wm_get_binding_table(struct brw_context *brw)

   assert(brw->wm.nr_surfaces <= BRW_WM_MAX_SURF);

-   bind_bo = brw_search_cache(&brw->cache, BRW_SS_SURF_BIND,
+   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
 			      NULL, 0,
 			      brw->wm.surf_bo, brw->wm.nr_surfaces,
 			      NULL);
@ -646,7 +648,7 @@ brw_wm_get_binding_table(struct brw_context *brw)
         else
            data[i] = 0;

-      bind_bo = brw_upload_cache( &brw->cache, BRW_SS_SURF_BIND,
+      bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
 				  NULL, 0,
 				  brw->wm.surf_bo, brw->wm.nr_surfaces,
 				  data, data_size,
@ -746,7 +748,7 @@ brw_vs_get_binding_table(struct brw_context *brw)

   assert(brw->vs.nr_surfaces <= BRW_VS_MAX_SURF);

-   bind_bo = brw_search_cache(&brw->cache, BRW_SS_SURF_BIND,
+   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
 			      NULL, 0,
 			      brw->vs.surf_bo, brw->vs.nr_surfaces,
 			      NULL);
@ -762,7 +764,7 @@ brw_vs_get_binding_table(struct brw_context *brw)
         else
            data[i] = 0;

-      bind_bo = brw_upload_cache( &brw->cache, BRW_SS_SURF_BIND,
+      bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
 				  NULL, 0,
 				  brw->vs.surf_bo, brw->vs.nr_surfaces,
 				  data, data_size,
@ -787,8 +789,7 @@ brw_vs_get_binding_table(struct brw_context *brw)


 /**
- * Vertex shader surfaces.  Just constant buffer for now.  Could add vertex 
- * shader textures in the future.
+ * Vertex shader surfaces (constant buffer).
 */
 static void prepare_vs_surfaces(struct brw_context *brw )
 {
@ -824,8 +825,12 @@ prepare_surfaces(struct brw_context *brw)

 const struct brw_tracked_state brw_wm_surfaces = {
   .dirty = {
-      .mesa = _NEW_COLOR | _NEW_TEXTURE | _NEW_BUFFERS | _NEW_PROGRAM,
-      .brw = BRW_NEW_CONTEXT,
+      .mesa = (_NEW_COLOR |
+               _NEW_TEXTURE |
+               _NEW_BUFFERS |
+               _NEW_PROGRAM |
+               _NEW_PROGRAM_CONSTANTS),
+      .brw = (BRW_NEW_CONTEXT),
      .cache = 0
   },
   .prepare = prepare_surfaces,
--- a/src/mesa/drivers/dri/r200/r200_state.c
+++ b/src/mesa/drivers/dri/r200/r200_state.c
@ -2484,7 +2484,7 @@ void r200ValidateState( GLcontext *ctx )
     r200UpdateDrawBuffer(ctx);
   }

-   if (new_state & (_NEW_TEXTURE | _NEW_PROGRAM)) {
+   if (new_state & (_NEW_TEXTURE | _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS)) {
      r200UpdateTextureState( ctx );
      new_state |= rmesa->NewGLState; /* may add TEXTURE_MATRIX */
      r200UpdateLocalViewer( ctx );
@ -2523,6 +2523,7 @@ void r200ValidateState( GLcontext *ctx )
   }

   if (new_state & (_NEW_PROGRAM|
+                    _NEW_PROGRAM_CONSTANTS |
   /* need to test for pretty much anything due to possible parameter bindings */
 	_NEW_MODELVIEW|_NEW_PROJECTION|_NEW_TRANSFORM|
 	_NEW_LIGHT|_NEW_TEXTURE|_NEW_TEXTURE_MATRIX|
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@ -470,7 +470,8 @@ void r300TranslateFragmentShader(r300ContextPtr r300,
 			fp->translated = GL_TRUE;
 		if (fp->error || (RADEON_DEBUG & DEBUG_PIXEL))
 			r300FragmentProgramDump(fp, &fp->code);
-		r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
+		r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM |
+                                          _NEW_PROGRAM_CONSTANTS);
 	}

 	update_params(r300, fp);
--- a/src/mesa/drivers/dri/r300/r300_state.c
+++ b/src/mesa/drivers/dri/r300/r300_state.c
@ -1109,7 +1109,7 @@ void r300UpdateStateParameters(GLcontext * ctx, GLuint new_state)
 	struct gl_program_parameter_list *paramList;
 	GLuint i;

-	if (!(new_state & (_NEW_BUFFERS | _NEW_PROGRAM)))
+	if (!(new_state & (_NEW_BUFFERS | _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS)))
 		return;

 	fp = (struct r300_fragment_program *)ctx->FragmentProgram._Current;
@ -2357,11 +2357,12 @@ void r300UpdateShaders(r300ContextPtr rmesa)
 			hw_tcl_on = future_hw_tcl_on = 0;
 			r300ResetHwState(rmesa);

-			r300UpdateStateParameters(ctx, _NEW_PROGRAM);
+			r300UpdateStateParameters(ctx, _NEW_PROGRAM |
+                                                  _NEW_PROGRAM_CONSTANTS);
 			return;
 		}
 	}
-	r300UpdateStateParameters(ctx, _NEW_PROGRAM);
+	r300UpdateStateParameters(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
 }

 static const GLfloat *get_fragmentprogram_constant(GLcontext *ctx,
--- a/src/mesa/drivers/dri/r300/r500_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r500_fragprog.c
@ -501,7 +501,8 @@ void r500TranslateFragmentShader(r300ContextPtr r300,

 		_mesa_reference_program(r300->radeon.glCtx, &compiler.program, 0);

-		r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
+		r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM |
+                                          _NEW_PROGRAM_CONSTANTS);

 		if (RADEON_DEBUG & DEBUG_PIXEL) {
 			if (fp->translated) {
--- a/src/mesa/shader/arbprogram.c
+++ b/src/mesa/shader/arbprogram.c
@ -74,8 +74,6 @@ _mesa_BindProgram(GLenum target, GLuint id)
   GET_CURRENT_CONTEXT(ctx);
   ASSERT_OUTSIDE_BEGIN_END(ctx);

-   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
-
   /* Error-check target and get curProg */
   if ((target == GL_VERTEX_PROGRAM_ARB) && /* == GL_VERTEX_PROGRAM_NV */
        (ctx->Extensions.NV_vertex_program ||
@ -132,6 +130,9 @@ _mesa_BindProgram(GLenum target, GLuint id)
      return;
   }

+   /* signal new program (and its new constants) */
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+
   /* bind newProg */
   if (target == GL_VERTEX_PROGRAM_ARB) { /* == GL_VERTEX_PROGRAM_NV */
      _mesa_reference_vertprog(ctx, &ctx->VertexProgram.Current,
@ -489,7 +490,7 @@ _mesa_ProgramEnvParameter4fARB(GLenum target, GLuint index,
   GET_CURRENT_CONTEXT(ctx);
   ASSERT_OUTSIDE_BEGIN_END(ctx);

-   FLUSH_VERTICES(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);

   if (target == GL_FRAGMENT_PROGRAM_ARB
       && ctx->Extensions.ARB_fragment_program) {
@ -537,7 +538,7 @@ _mesa_ProgramEnvParameters4fvEXT(GLenum target, GLuint index, GLsizei count,
   GLfloat * dest;
   ASSERT_OUTSIDE_BEGIN_END(ctx);

-   FLUSH_VERTICES(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);

   if (count <= 0) {
      _mesa_error(ctx, GL_INVALID_VALUE, "glProgramEnvParameters4fv(count)");
@ -631,7 +632,7 @@ _mesa_ProgramLocalParameter4fARB(GLenum target, GLuint index,
   struct gl_program *prog;
   ASSERT_OUTSIDE_BEGIN_END(ctx);

-   FLUSH_VERTICES(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);

   if ((target == GL_FRAGMENT_PROGRAM_NV
        && ctx->Extensions.NV_fragment_program) ||
@ -685,7 +686,7 @@ _mesa_ProgramLocalParameters4fvEXT(GLenum target, GLuint index, GLsizei count,
   GLint i;
   ASSERT_OUTSIDE_BEGIN_END(ctx);

-   FLUSH_VERTICES(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);

   if (count <= 0) {
      _mesa_error(ctx, GL_INVALID_VALUE, "glProgramLocalParameters4fv(count)");
--- a/src/mesa/shader/nvprogram.c
+++ b/src/mesa/shader/nvprogram.c
@ -706,7 +706,7 @@ _mesa_ProgramNamedParameter4fNV(GLuint id, GLsizei len, const GLubyte *name,
   GET_CURRENT_CONTEXT(ctx);
   ASSERT_OUTSIDE_BEGIN_END(ctx);

-   FLUSH_VERTICES(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);

   prog = _mesa_lookup_program(ctx, id);
   if (!prog || prog->Target != GL_FRAGMENT_PROGRAM_NV) {
--- a/src/mesa/shader/prog_optimize.c
+++ b/src/mesa/shader/prog_optimize.c
@ -547,15 +547,13 @@ update_interval(GLint intBegin[], GLint intEnd[], GLuint index, GLuint ic)


 /**
- * Find the live intervals for each temporary register in the program.
- * For register R, the interval [A,B] indicates that R is referenced
- * from instruction A through instruction B.
- * Special consideration is needed for loops and subroutines.
- * \return GL_TRUE if success, GL_FALSE if we cannot proceed for some reason
+ * Find first/last instruction that references each temporary register.
 */
-static GLboolean
-find_live_intervals(struct gl_program *prog,
-                    struct interval_list *liveIntervals)
+GLboolean
+_mesa_find_temp_intervals(const struct prog_instruction *instructions,
+                          GLuint numInstructions,
+                          GLint intBegin[MAX_PROGRAM_TEMPS],
+                          GLint intEnd[MAX_PROGRAM_TEMPS])
 {
   struct loop_info
   {
@ -563,26 +561,15 @@ find_live_intervals(struct gl_program *prog,
   };
   struct loop_info loopStack[MAX_LOOP_NESTING];
   GLuint loopStackDepth = 0;
-   GLint intBegin[MAX_PROGRAM_TEMPS], intEnd[MAX_PROGRAM_TEMPS];
   GLuint i;

-   /*
-    * Note: we'll return GL_FALSE below if we find relative indexing
-    * into the TEMP register file.  We can't handle that yet.
-    * We also give up on subroutines for now.
-    */
-
-   if (dbg) {
-      _mesa_printf("Optimize: Begin find intervals\n");
-   }
-
   for (i = 0; i < MAX_PROGRAM_TEMPS; i++){
      intBegin[i] = intEnd[i] = -1;
   }

   /* Scan instructions looking for temporary registers */
-   for (i = 0; i < prog->NumInstructions; i++) {
-      const struct prog_instruction *inst = prog->Instructions + i;
+   for (i = 0; i < numInstructions; i++) {
+      const struct prog_instruction *inst = instructions + i;
      if (inst->Opcode == OPCODE_BGNLOOP) {
         loopStack[loopStackDepth].Start = i;
         loopStack[loopStackDepth].End = inst->BranchTarget;
@ -595,7 +582,7 @@ find_live_intervals(struct gl_program *prog,
         return GL_FALSE;
      }
      else {
-         const GLuint numSrc = _mesa_num_inst_src_regs(inst->Opcode);
+         const GLuint numSrc = 3;/*_mesa_num_inst_src_regs(inst->Opcode);*/
         GLuint j;
         for (j = 0; j < numSrc; j++) {
            if (inst->SrcReg[j].File == PROGRAM_TEMPORARY) {
@ -624,6 +611,39 @@ find_live_intervals(struct gl_program *prog,
      }
   }

+   return GL_TRUE;
+}
+
+
+/**
+ * Find the live intervals for each temporary register in the program.
+ * For register R, the interval [A,B] indicates that R is referenced
+ * from instruction A through instruction B.
+ * Special consideration is needed for loops and subroutines.
+ * \return GL_TRUE if success, GL_FALSE if we cannot proceed for some reason
+ */
+static GLboolean
+find_live_intervals(struct gl_program *prog,
+                    struct interval_list *liveIntervals)
+{
+   GLint intBegin[MAX_PROGRAM_TEMPS], intEnd[MAX_PROGRAM_TEMPS];
+   GLuint i;
+
+   /*
+    * Note: _mesa_find_temp_intervals() returns GL_FALSE if it finds
+    * relative indexing into the TEMP register file.  We can't handle
+    * that yet.  We also give up on subroutines for now.
+    */
+
+   if (dbg) {
+      _mesa_printf("Optimize: Begin find intervals\n");
+   }
+
+   /* build intermediate arrays */
+   if (!_mesa_find_temp_intervals(prog->Instructions, prog->NumInstructions,
+                                  intBegin, intEnd))
+      return GL_FALSE;
+
   /* Build live intervals list from intermediate arrays */
   liveIntervals->Num = 0;
   for (i = 0; i < MAX_PROGRAM_TEMPS; i++) {
@ -794,6 +814,96 @@ _mesa_reallocate_registers(struct gl_program *prog)



+
+
+
+
+#if 0
+static void
+_mesa_find_temporary_live_intervals(struct gl_program *prog,
+                                    GLint firstInst[MAX_PROGRAM_TEMPS],
+                                    GLint lastInst[MAX_PROGRAM_TEMPS])
+{
+   GLuint i;
+
+   for (i = 0; i < MAX_PROGRAM_TEMPS; i++) {
+      firstInst[i] = lastInst[i] = -1;
+   }
+
+   struct loop_info loopStack[MAX_LOOP_NESTING];
+   GLuint loopStackDepth = 0;
+   GLint intBegin[MAX_PROGRAM_TEMPS], intEnd[MAX_PROGRAM_TEMPS];
+   GLuint i;
+
+   /*
+    * Note: we'll return GL_FALSE below if we find relative indexing
+    * into the TEMP register file.  We can't handle that yet.
+    * We also give up on subroutines for now.
+    */
+
+   if (dbg) {
+      _mesa_printf("Optimize: Begin find intervals\n");
+   }
+
+   for (i = 0; i < MAX_PROGRAM_TEMPS; i++){
+      intBegin[i] = intEnd[i] = -1;
+   }
+
+   /* Scan instructions looking for temporary registers */
+   for (i = 0; i < prog->NumInstructions; i++) {
+      const struct prog_instruction *inst = prog->Instructions + i;
+      if (inst->Opcode == OPCODE_BGNLOOP) {
+         loopStack[loopStackDepth].Start = i;
+         loopStack[loopStackDepth].End = inst->BranchTarget;
+         loopStackDepth++;
+      }
+      else if (inst->Opcode == OPCODE_ENDLOOP) {
+         loopStackDepth--;
+      }
+      else if (inst->Opcode == OPCODE_CAL) {
+         return GL_FALSE;
+      }
+      else {
+         const GLuint numSrc = _mesa_num_inst_src_regs(inst->Opcode);
+         GLuint j;
+         for (j = 0; j < numSrc; j++) {
+            if (inst->SrcReg[j].File == PROGRAM_TEMPORARY) {
+               const GLuint index = inst->SrcReg[j].Index;
+               if (inst->SrcReg[j].RelAddr)
+                  return GL_FALSE;
+               update_interval(intBegin, intEnd, index, i);
+               if (loopStackDepth > 0) {
+                  /* extend temp register's interval to end of loop */
+                  GLuint loopEnd = loopStack[loopStackDepth - 1].End;
+                  update_interval(intBegin, intEnd, index, loopEnd);
+               }
+            }
+         }
+         if (inst->DstReg.File == PROGRAM_TEMPORARY) {
+            const GLuint index = inst->DstReg.Index;
+            if (inst->DstReg.RelAddr)
+               return GL_FALSE;
+            update_interval(intBegin, intEnd, index, i);
+            if (loopStackDepth > 0) {
+               /* extend temp register's interval to end of loop */
+               GLuint loopEnd = loopStack[loopStackDepth - 1].End;
+               update_interval(intBegin, intEnd, index, loopEnd);
+            }
+         }
+      }
+   }
+
+
+
+
+#endif
+
+
+
+
+
+
+
 /**
 * Apply optimizations to the given program to eliminate unnecessary
 * instructions, temp regs, etc.
--- a/src/mesa/shader/prog_optimize.h
+++ b/src/mesa/shader/prog_optimize.h
@ -25,7 +25,19 @@
 #ifndef PROG_OPT_H
 #define PROG_OPT_H

+
+#include "main/config.h"
+
+
 struct gl_program;
+struct prog_instruction;
+
+
+extern GLboolean
+_mesa_find_temp_intervals(const struct prog_instruction *instructions,
+                          GLuint numInstructions,
+                          GLint intBegin[MAX_PROGRAM_TEMPS],
+                          GLint intEnd[MAX_PROGRAM_TEMPS]);

 extern void
 _mesa_optimize_program(GLcontext *ctx, struct gl_program *program);
--- a/src/mesa/shader/shader_api.c
+++ b/src/mesa/shader/shader_api.c
@ -1487,7 +1487,7 @@ _mesa_use_program(GLcontext *ctx, GLuint program)
      return;
   }

-   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);

   if (program) {
      shProg = _mesa_lookup_shader_program_err(ctx, program, "glUseProgram");
@ -1789,7 +1789,7 @@ _mesa_uniform(GLcontext *ctx, GLint location, GLsizei count,
      return;
   }

-   FLUSH_VERTICES(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);

   uniform = &shProg->Uniforms->Uniforms[location];

@ -1929,7 +1929,7 @@ _mesa_uniform_matrix(GLcontext *ctx, GLint cols, GLint rows,
      return;
   }

-   FLUSH_VERTICES(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM_CONSTANTS);

   uniform = &shProg->Uniforms->Uniforms[location];