i965/fs: Add initial support for 16-wide dispatch on gen6.

At this point it doesn't do uniforms, which would have to be laid out the
same between the 8-wide and 16-wide programs.  Other than that, it
supports everything but flow control, which was the thing that forced us
to choose 8-wide for general GLSL support.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
This commit is contained in:
Eric Anholt 2011-03-11 19:19:01 -08:00
parent 76b7a0c1af
commit 662f1b48bd
6 changed files with 233 additions and 104 deletions

View file

@ -204,13 +204,16 @@ struct brw_wm_prog_data {
GLuint urb_read_length;
GLuint first_curbe_grf;
GLuint first_curbe_grf_16;
GLuint total_grf;
GLuint total_grf_16;
GLuint total_scratch;
GLuint nr_params; /**< number of float params/constants */
GLuint nr_pull_params;
GLboolean error;
int dispatch_width;
uint32_t prog_offset_16;
/* Pointer to tracked values (only valid once
* _mesa_load_state_parameters has been called at runtime).

View file

@ -194,6 +194,32 @@ fs_visitor::fail(const char *format, ...)
}
}
/* Enter a region where emitted instructions must run 8-wide: while the
 * stack is non-empty, emit() marks each new instruction
 * force_uncompressed, and generate_code() then selects
 * BRW_COMPRESSION_NONE for it.
 */
void
fs_visitor::push_force_uncompressed()
{
   ++force_uncompressed_stack;
}
/* Leave a region opened by push_force_uncompressed().  Pops must pair
 * with pushes; the counter may never go negative.
 */
void
fs_visitor::pop_force_uncompressed()
{
   const int remaining = --force_uncompressed_stack;
   assert(remaining >= 0);
}
/* Enter a region where emitted instructions operate on the second half
 * of a 16-wide execution: while the stack is non-empty, emit() marks
 * each new instruction force_sechalf, and generate_code() then selects
 * BRW_COMPRESSION_2NDHALF for it.
 */
void
fs_visitor::push_force_sechalf()
{
   ++force_sechalf_stack;
}
/* Leave a region opened by push_force_sechalf().  Pops must pair with
 * pushes; the counter may never go negative.
 */
void
fs_visitor::pop_force_sechalf()
{
   const int remaining = --force_sechalf_stack;
   assert(remaining >= 0);
}
/**
* Returns how many MRFs an FS opcode will write over.
*
@ -1738,6 +1764,10 @@ fs_visitor::visit(ir_if *ir)
{
fs_inst *inst;
if (c->dispatch_width == 16) {
fail("Can't support (non-uniform) control flow on 16-wide\n");
}
/* Don't point the annotation at the if statement, because then it plus
* the then and else blocks get printed.
*/
@ -1778,6 +1808,10 @@ fs_visitor::visit(ir_loop *ir)
{
fs_reg counter = reg_undef;
if (c->dispatch_width == 16) {
fail("Can't support (non-uniform) control flow on 16-wide\n");
}
if (ir->counter) {
this->base_ir = ir->counter;
ir->counter->accept(this);
@ -1881,6 +1915,11 @@ fs_visitor::emit(fs_inst inst)
fs_inst *list_inst = new(mem_ctx) fs_inst;
*list_inst = inst;
if (force_uncompressed_stack > 0)
list_inst->force_uncompressed = true;
else if (force_sechalf_stack > 0)
list_inst->force_sechalf = true;
list_inst->annotation = this->current_annotation;
list_inst->ir = this->base_ir;
@ -2006,6 +2045,7 @@ fs_visitor::emit_fb_writes()
this->current_annotation = "FB write header";
GLboolean header_present = GL_TRUE;
int nr = 0;
int reg_width = c->dispatch_width / 8;
if (intel->gen >= 6 &&
!this->kill_emitted &&
@ -2019,31 +2059,44 @@ fs_visitor::emit_fb_writes()
}
if (c->aa_dest_stencil_reg) {
push_force_uncompressed();
emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
pop_force_uncompressed();
}
/* Reserve space for color. It'll be filled in per MRT below. */
int color_mrf = nr;
nr += 4;
nr += 4 * reg_width;
if (c->source_depth_to_render_target) {
if (intel->gen == 6 && c->dispatch_width == 16) {
/* For outputting oDepth on gen6, SIMD8 writes have to be
* used. This would require 8-wide moves of each half to
* message regs, kind of like pre-gen5 SIMD16 FB writes.
* Just bail on doing so for now.
*/
fail("Missing support for simd16 depth writes on gen6\n");
}
if (c->computes_depth) {
/* Hand over gl_FragDepth. */
assert(this->frag_depth);
fs_reg depth = *(variable_storage(this->frag_depth));
emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth);
emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
} else {
/* Pass through the payload depth. */
emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
}
nr += reg_width;
}
if (c->dest_depth_reg) {
emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
nr += reg_width;
}
fs_reg color = reg_undef;
@ -2060,7 +2113,7 @@ fs_visitor::emit_fb_writes()
target);
if (this->frag_color || this->frag_data) {
for (int i = 0; i < 4; i++) {
emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i), color);
emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i * reg_width), color);
color.reg_offset++;
}
}
@ -2144,7 +2197,7 @@ fs_visitor::generate_fb_write(fs_inst *inst)
brw_pop_insn_state(p);
brw_fb_WRITE(p,
8, /* dispatch_width */
c->dispatch_width,
inst->base_mrf,
implied_header,
inst->target,
@ -2608,8 +2661,12 @@ fs_visitor::setup_paramvalues_refs()
void
fs_visitor::assign_curb_setup()
{
c->prog_data.first_curbe_grf = c->nr_payload_regs;
c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
if (c->dispatch_width == 8) {
c->prog_data.first_curbe_grf = c->nr_payload_regs;
} else {
c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
}
/* Map the offsets in the UNIFORM file to fixed HW regs. */
foreach_iter(exec_list_iterator, iter, this->instructions) {
@ -2618,7 +2675,7 @@ fs_visitor::assign_curb_setup()
for (unsigned int i = 0; i < 3; i++) {
if (inst->src[i].file == UNIFORM) {
int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
constant_nr / 8,
constant_nr % 8);
@ -2670,7 +2727,7 @@ fs_visitor::calculate_urb_setup()
void
fs_visitor::assign_urb_setup()
{
int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
/* Offset all the urb_setup[] index by the actual position of the
* setup regs, now that the location of the constants has been chosen.
@ -3516,7 +3573,7 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
void
fs_visitor::generate_code()
{
int last_native_inst = 0;
int last_native_inst = p->nr_insn;
const char *last_annotation_string = NULL;
ir_instruction *last_annotation_ir = NULL;
@ -3532,8 +3589,8 @@ fs_visitor::generate_code()
if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
printf("Native code for fragment shader %d:\n",
ctx->Shader.CurrentFragmentProgram->Name);
printf("Native code for fragment shader %d (%d-wide dispatch):\n",
ctx->Shader.CurrentFragmentProgram->Name, c->dispatch_width);
}
foreach_iter(exec_list_iterator, iter, this->instructions) {
@ -3566,6 +3623,14 @@ fs_visitor::generate_code()
brw_set_predicate_inverse(p, inst->predicate_inverse);
brw_set_saturate(p, inst->saturate);
if (inst->force_uncompressed || c->dispatch_width == 8) {
brw_set_compression_control(p, BRW_COMPRESSION_NONE);
} else if (inst->force_sechalf) {
brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
} else {
brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
}
switch (inst->opcode) {
case BRW_OPCODE_MOV:
brw_MOV(p, dst, src[0]);
@ -3804,7 +3869,111 @@ fs_visitor::generate_code()
}
}
/* NOTE(review): the two return-type lines below are a diff-view
 * artifact — both the removed (GLboolean) and added (bool) signature
 * lines survive in this paste.  Only one belongs in the real file;
 * confirm against the upstream commit before using this text as code.
 */
GLboolean
bool
/* Top-level compile driver for one dispatch width (c->dispatch_width is
 * 8 or 16): builds FS IR from the GLSL IR, optimizes, allocates
 * registers, and generates native code.  Returns false on failure.
 */
fs_visitor::run()
{
/* Byte offset of the 16-wide program within the shared instruction
 * store; stays 0 for the 8-wide compile.
 */
uint32_t prog_offset_16 = 0;
brw_wm_payload_setup(brw, c);
if (c->dispatch_width == 16) {
if (c->prog_data.curb_read_length) {
/* Haven't hooked in support for uniforms through the 16-wide
* version yet.
*/
return GL_FALSE;
}
/* align to 64 byte boundary. */
while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
brw_NOP(p);
}
/* Save off the start of this 16-wide program in case we succeed. */
prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);
brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
}
/* "if (0)" debug hook: flip to emit a trivial shader instead of
 * compiling the real program.
 */
if (0) {
emit_dummy_fs();
} else {
calculate_urb_setup();
if (intel->gen < 6)
emit_interpolation_setup_gen4();
else
emit_interpolation_setup_gen6();
/* Generate FS IR for main(). (the visitor only descends into
* functions called "main").
*/
foreach_iter(exec_list_iterator, iter, *shader->ir) {
ir_instruction *ir = (ir_instruction *)iter.get();
base_ir = ir;
ir->accept(this);
}
emit_fb_writes();
split_virtual_grfs();
setup_paramvalues_refs();
setup_pull_constants();
/* Run the peephole optimizations to a fixed point: keep iterating
 * while any pass reports progress.
 */
bool progress;
do {
progress = false;
progress = remove_duplicate_mrf_writes() || progress;
progress = propagate_constants() || progress;
progress = register_coalesce() || progress;
progress = compute_to_mrf() || progress;
progress = dead_code_eliminate() || progress;
} while (progress);
schedule_instructions();
assign_curb_setup();
assign_urb_setup();
if (0) {
/* Debug of register spilling: Go spill everything. */
int virtual_grf_count = virtual_grf_next;
for (int i = 1; i < virtual_grf_count; i++) {
spill_reg(i);
}
}
if (0)
assign_regs_trivial();
else {
/* Retry allocation until it succeeds (assign_regs presumably
 * spills on failure — confirm) or the compile is marked failed.
 */
while (!assign_regs()) {
if (failed)
break;
}
}
}
/* Every push_force_* must have been matched by a pop by now. */
assert(force_uncompressed_stack == 0);
assert(force_sechalf_stack == 0);
if (!failed)
generate_code();
if (failed)
return GL_FALSE;
/* Record per-dispatch-width results: register count for 8-wide, plus
 * the program start offset for the 16-wide variant.
 */
if (c->dispatch_width == 8) {
c->prog_data.total_grf = grf_used;
} else {
c->prog_data.total_grf_16 = grf_used;
c->prog_data.prog_offset_16 = prog_offset_16;
}
return !failed;
}
bool
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
{
struct intel_context *intel = &brw->intel;
@ -3812,20 +3981,12 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
if (!prog)
return GL_FALSE;
return false;
struct brw_shader *shader =
(brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
if (!shader)
return GL_FALSE;
/* We always use 8-wide mode, at least for now. For one, flow
* control only works in 8-wide. Also, when we're fragment shader
* bound, we're almost always under register pressure as well, so
* 8-wide would save us from the performance cliff of spilling
* regs.
*/
c->dispatch_width = 8;
return false;
if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
printf("GLSL IR for native fragment shader %d:\n", prog->Name);
@ -3835,77 +3996,22 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
/* Now the main event: Visit the shader IR and generate our FS IR for it.
*/
c->dispatch_width = 8;
fs_visitor v(c, shader);
if (0) {
v.emit_dummy_fs();
} else {
v.calculate_urb_setup();
if (intel->gen < 6)
v.emit_interpolation_setup_gen4();
else
v.emit_interpolation_setup_gen6();
/* Generate FS IR for main(). (the visitor only descends into
* functions called "main").
*/
foreach_iter(exec_list_iterator, iter, *shader->ir) {
ir_instruction *ir = (ir_instruction *)iter.get();
v.base_ir = ir;
ir->accept(&v);
}
v.emit_fb_writes();
v.split_virtual_grfs();
v.setup_paramvalues_refs();
v.setup_pull_constants();
bool progress;
do {
progress = false;
progress = v.remove_duplicate_mrf_writes() || progress;
progress = v.propagate_constants() || progress;
progress = v.register_coalesce() || progress;
progress = v.compute_to_mrf() || progress;
progress = v.dead_code_eliminate() || progress;
} while (progress);
v.schedule_instructions();
v.assign_curb_setup();
v.assign_urb_setup();
if (0) {
/* Debug of register spilling: Go spill everything. */
int virtual_grf_count = v.virtual_grf_next;
for (int i = 1; i < virtual_grf_count; i++) {
v.spill_reg(i);
}
}
if (0)
v.assign_regs_trivial();
else {
while (!v.assign_regs()) {
if (v.failed)
break;
}
}
if (!v.run()) {
/* FINISHME: Cleanly fail, test at link time, etc. */
assert(!"not reached");
return false;
}
if (!v.failed)
v.generate_code();
if (intel->gen >= 6) {
c->dispatch_width = 16;
fs_visitor v2(c, shader);
v2.run();
}
assert(!v.failed); /* FINISHME: Cleanly fail, tested at link time, etc. */
c->prog_data.dispatch_width = 8;
if (v.failed)
return GL_FALSE;
c->prog_data.total_grf = v.grf_used;
return GL_TRUE;
return true;
}

View file

@ -343,6 +343,8 @@ public:
bool eot;
bool header_present;
bool shadow_compare;
bool force_uncompressed;
bool force_sechalf;
uint32_t offset; /* spill/unspill offset */
/** @{
@ -405,6 +407,8 @@ public:
this->live_intervals_valid = false;
this->kill_emitted = false;
this->force_uncompressed_stack = 0;
this->force_sechalf_stack = 0;
}
~fs_visitor()
@ -461,6 +465,7 @@ public:
return emit(fs_inst(opcode, dst, src0, src1, src2));
}
bool run();
void setup_paramvalues_refs();
void assign_curb_setup();
void calculate_urb_setup();
@ -481,6 +486,11 @@ public:
void schedule_instructions();
void fail(const char *msg, ...);
void push_force_uncompressed();
void pop_force_uncompressed();
void push_force_sechalf();
void pop_force_sechalf();
void generate_code();
void generate_fb_write(fs_inst *inst);
void generate_pixel_xy(struct brw_reg dst, bool is_x);
@ -568,6 +578,9 @@ public:
fs_reg reg_null_cmp;
int grf_used;
int force_uncompressed_stack;
int force_sechalf_stack;
};
GLboolean brw_do_channel_expressions(struct exec_list *instructions);

View file

@ -120,7 +120,7 @@ brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
brw_wm_emit(c);
}
static void
void
brw_wm_payload_setup(struct brw_context *brw,
struct brw_wm_compile *c)
{
@ -225,18 +225,13 @@ static void do_wm_prog( struct brw_context *brw,
brw_init_compile(brw, &c->func);
brw_wm_payload_setup(brw, c);
if (!brw_wm_fs_emit(brw, c)) {
/*
* Shader which use GLSL features such as flow control are handled
* differently from "simple" shaders.
*/
/* Fallback for fixed function and ARB_fp shaders. */
c->dispatch_width = 16;
brw_wm_payload_setup(brw, c);
brw_wm_non_glsl_emit(brw, c);
c->prog_data.dispatch_width = 16;
}
c->prog_data.dispatch_width = c->dispatch_width;
/* Scratch space is used for register spilling */
if (c->last_scratch) {
@ -467,7 +462,7 @@ static void brw_prepare_wm_prog(struct brw_context *brw)
struct brw_wm_prog_key key;
struct brw_fragment_program *fp = (struct brw_fragment_program *)
brw->fragment_program;
brw_wm_populate_key(brw, &key);
/* Make an early check for the key.

View file

@ -314,7 +314,7 @@ void brw_wm_print_program( struct brw_wm_compile *c,
void brw_wm_lookup_iz(struct intel_context *intel,
struct brw_wm_compile *c);
GLboolean brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c);
bool brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c);
/* brw_wm_emit.c */
void emit_alu1(struct brw_compile *p,
@ -474,5 +474,7 @@ struct gl_shader_program *brw_new_shader_program(struct gl_context *ctx, GLuint
bool brw_color_buffer_write_enabled(struct brw_context *brw);
bool brw_render_target_supported(gl_format format);
void brw_wm_payload_setup(struct brw_context *brw,
struct brw_wm_compile *c);
#endif

View file

@ -143,14 +143,19 @@ upload_wm_state(struct brw_context *brw)
dw2 |= (ALIGN(brw->wm.sampler_count, 4) / 4) << GEN6_WM_SAMPLER_COUNT_SHIFT;
dw4 |= (brw->wm.prog_data->first_curbe_grf <<
GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
dw4 |= (brw->wm.prog_data->first_curbe_grf_16 <<
GEN6_WM_DISPATCH_START_GRF_SHIFT_2);
dw5 |= (brw->wm_max_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT;
/* CACHE_NEW_WM_PROG */
if (brw->wm.prog_data->dispatch_width == 8)
if (brw->wm.prog_data->dispatch_width == 8) {
dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
else
if (brw->wm.prog_data->prog_offset_16)
dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
} else {
dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
}
/* _NEW_LINE */
if (ctx->Line.StippleFlag)
@ -194,7 +199,12 @@ upload_wm_state(struct brw_context *brw)
OUT_BATCH(dw5);
OUT_BATCH(dw6);
OUT_BATCH(0); /* kernel 1 pointer */
OUT_BATCH(0); /* kernel 2 pointer */
if (brw->wm.prog_data->prog_offset_16) {
OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0,
brw->wm.prog_data->prog_offset_16);
} else {
OUT_BATCH(0); /* kernel 2 pointer */
}
ADVANCE_BATCH();
}