i965/fs: Emit better b2f of an expression on GEN4 and GEN5

On platforms that do not natively generate 0u and ~0u for Boolean results, b2f expressions that look like f = b2f(expr cmp 0) will generate better code by pretending the expression is f = ir_triop_sel(0.0, 1.0, expr cmp 0) This is because the last instruction of "expr" can generate the condition code for the "cmp 0". This avoids having to do the "-(b & 1)" trick to generate 0u or ~0u for the Boolean result. This means code like mov(16) g16<1>F 1F mul.ge.f0(16) null g6<8,8,1>F g14<8,8,1>F (+f0) sel(16) m6<1>F g16<8,8,1>F 0F will be generated instead of mul(16) g2<1>F g12<8,8,1>F g4<8,8,1>F cmp.ge.f0(16) g2<1>D g4<8,8,1>F 0F and(16) g4<1>D g2<8,8,1>D 1D and(16) m6<1>D -g4<8,8,1>D 0x3f800000UD v2: When the comparison is either == 0.0 or != 0.0 use the knowledge that the true (or false) case already results in zero would allow better code generation by possibly avoiding a load-immediate instruction. v3: Apply the optimization even when neither comparitor is zero. Shader-db results: GM45 (0x2A42): total instructions in shared programs: 3551002 -> 3550829 (-0.00%) instructions in affected programs: 33269 -> 33096 (-0.52%) helped: 121 Iron Lake (0x0046): total instructions in shared programs: 4993327 -> 4993146 (-0.00%) instructions in affected programs: 34199 -> 34018 (-0.53%) helped: 129 No change on other platforms. Signed-off-by: Ian Romanick <ian.d.romanick@intel.com> Reviewed-by: Tapani Palli <tapani.palli@intel.com>
2026-03-19 07:50:37 +01:00 · 2015-01-23 17:31:12 -08:00 · 2015-01-23 17:31:12 -08:00 · b616164c95
commit b616164c95
parent 036e347f3c
2 changed files with 99 additions and 4 deletions
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@ -307,6 +307,7 @@ public:
                 const fs_reg &a);
   void emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
                    const fs_reg &src0, const fs_reg &src1);
+   bool try_emit_b2f_of_comparison(ir_expression *ir);
   bool try_emit_saturate(ir_expression *ir);
   bool try_emit_line(ir_expression *ir);
   bool try_emit_mad(ir_expression *ir);
@ -317,6 +318,7 @@ public:
   bool opt_saturate_propagation();
   bool opt_cmod_propagation();
   void emit_bool_to_cond_code(ir_rvalue *condition);
+   void emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3]);
   void emit_if_gen6(ir_if *ir);
   void emit_unspill(bblock_t *block, fs_inst *inst, fs_reg reg,
                     uint32_t spill_offset, int count);
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@ -475,6 +475,87 @@ fs_visitor::try_emit_mad(ir_expression *ir)
   return true;
 }

+bool
+fs_visitor::try_emit_b2f_of_comparison(ir_expression *ir)
+{
+   /* On platforms that do not natively generate 0u and ~0u for Boolean
+    * results, b2f expressions that look like
+    *
+    *     f = b2f(expr cmp 0)
+    *
+    * will generate better code by pretending the expression is
+    *
+    *     f = ir_triop_csel(0.0, 1.0, expr cmp 0)
+    *
+    * This is because the last instruction of "expr" can generate the
+    * condition code for the "cmp 0".  This avoids having to do the "-(b & 1)"
+    * trick to generate 0u or ~0u for the Boolean result.  This means code like
+    *
+    *     mov(16)         g16<1>F         1F
+    *     mul.ge.f0(16)   null            g6<8,8,1>F      g14<8,8,1>F
+    *     (+f0) sel(16)   m6<1>F          g16<8,8,1>F     0F
+    *
+    * will be generated instead of
+    *
+    *     mul(16)         g2<1>F          g12<8,8,1>F     g4<8,8,1>F
+    *     cmp.ge.f0(16)   g2<1>D          g4<8,8,1>F      0F
+    *     and(16)         g4<1>D          g2<8,8,1>D      1D
+    *     and(16)         m6<1>D          -g4<8,8,1>D     0x3f800000UD
+    *
+    * When the comparison is either == 0.0 or != 0.0 using the knowledge that
+    * the true (or false) case already results in zero would allow better code
+    * generation by possibly avoiding a load-immediate instruction.
+    */
+   ir_expression *cmp = ir->operands[0]->as_expression();
+   if (cmp == NULL)
+      return false;
+
+   if (cmp->operation == ir_binop_equal || cmp->operation == ir_binop_nequal) {
+      for (unsigned i = 0; i < 2; i++) {
+         ir_constant *c = cmp->operands[i]->as_constant();
+         if (c == NULL || !c->is_zero())
+            continue;
+
+         ir_expression *expr = cmp->operands[i ^ 1]->as_expression();
+         if (expr != NULL) {
+            fs_reg op[2];
+
+            for (unsigned j = 0; j < 2; j++) {
+               cmp->operands[j]->accept(this);
+               op[j] = this->result;
+
+               resolve_ud_negate(&op[j]);
+            }
+
+            emit_bool_to_cond_code_of_reg(cmp, op);
+
+            /* In this case we know when the condition is true, op[i ^ 1]
+             * contains zero.  Invert the predicate, use op[i ^ 1] as src0,
+             * and immediate 1.0f as src1.
+             */
+            this->result = vgrf(ir->type);
+            op[i ^ 1].type = BRW_REGISTER_TYPE_F;
+
+            fs_inst *inst = emit(SEL(this->result, op[i ^ 1], fs_reg(1.0f)));
+            inst->predicate = BRW_PREDICATE_NORMAL;
+            inst->predicate_inverse = cmp->operation == ir_binop_equal;
+            return true;
+         }
+      }
+   }
+
+   emit_bool_to_cond_code(cmp);
+
+   fs_reg temp = vgrf(ir->type);
+   emit(MOV(temp, fs_reg(1.0f)));
+
+   this->result = vgrf(ir->type);
+   fs_inst *inst = emit(SEL(this->result, temp, fs_reg(0.0f)));
+   inst->predicate = BRW_PREDICATE_NORMAL;
+
+   return true;
+}
+
 static int
 pack_pixel_offset(float x)
 {
@ -639,6 +720,11 @@ fs_visitor::visit(ir_expression *ir)
      inst->predicate = BRW_PREDICATE_NORMAL;
      return;

+   case ir_unop_b2f:
+      if (brw->gen <= 5 && try_emit_b2f_of_comparison(ir))
+         return;
+      break;
+
   case ir_unop_interpolate_at_centroid:
   case ir_binop_interpolate_at_offset:
   case ir_binop_interpolate_at_sample:
@ -2508,7 +2594,6 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
   }

   fs_reg op[3];
-   fs_inst *inst;

   assert(expr->get_num_operands() <= 3);
   for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
@ -2520,6 +2605,14 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
      resolve_ud_negate(&op[i]);
   }

+   emit_bool_to_cond_code_of_reg(expr, op);
+}
+
+void
+fs_visitor::emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3])
+{
+   fs_inst *inst;
+
   switch (expr->operation) {
   case ir_unop_logic_not:
      inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
@ -2528,7 +2621,7 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)

   case ir_binop_logic_xor:
      if (brw->gen <= 5) {
-         fs_reg temp = vgrf(ir->type);
+         fs_reg temp = vgrf(expr->type);
         emit(XOR(temp, op[0], op[1]));
         inst = emit(AND(reg_null_d, temp, fs_reg(1)));
      } else {
@ -2539,7 +2632,7 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)

   case ir_binop_logic_or:
      if (brw->gen <= 5) {
-         fs_reg temp = vgrf(ir->type);
+         fs_reg temp = vgrf(expr->type);
         emit(OR(temp, op[0], op[1]));
         inst = emit(AND(reg_null_d, temp, fs_reg(1)));
      } else {
@ -2550,7 +2643,7 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)

   case ir_binop_logic_and:
      if (brw->gen <= 5) {
-         fs_reg temp = vgrf(ir->type);
+         fs_reg temp = vgrf(expr->type);
         emit(AND(temp, op[0], op[1]));
         inst = emit(AND(reg_null_d, temp, fs_reg(1)));
      } else {