diff --git a/src/gallium/drivers/r300/compiler/r3xx_vertprog.c b/src/gallium/drivers/r300/compiler/r3xx_vertprog.c
index 5ad1f4eb857..4ee3b878b4c 100644
--- a/src/gallium/drivers/r300/compiler/r3xx_vertprog.c
+++ b/src/gallium/drivers/r300/compiler/r3xx_vertprog.c
@@ -237,6 +237,36 @@ static void ei_math1(struct r300_vertex_program_code *vp,
 	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
 }
 
+static void ei_cmp(struct r300_vertex_program_code *vp,
+				struct rc_sub_instruction *vpi,
+				unsigned int * inst)
+{	/* Encode an RC_OPCODE_CMP as VE_COND_MUX_GTE into inst[0..3]. */
+	inst[0] = PVS_OP_DST_OPERAND(VE_COND_MUX_GTE,
+				     0,
+				     0,
+				     t_dst_index(vp, &vpi->DstReg),
+				     t_dst_mask(vpi->DstReg.WriteMask),
+				     t_dst_class(vpi->DstReg.File),
+                                     vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
+
+	/* A source with a constant swizzle (file RC_FILE_NONE) still counts
+	 * as a unique temporary, so make the first such source share the
+	 * register index of the next source when that is NONE or TEMPORARY. */
+	for (unsigned i = 0; i < 3; i++) {
+		unsigned j = (i + 1) % 3;
+		if (vpi->SrcReg[i].File == RC_FILE_NONE &&
+			(vpi->SrcReg[j].File == RC_FILE_NONE ||
+			 vpi->SrcReg[j].File == RC_FILE_TEMPORARY)) {
+			vpi->SrcReg[i].Index = vpi->SrcReg[j].Index;
+			break;
+		}
+	}
+	/* Sources are emitted in the order src0, src2, src1. */
+	inst[1] = t_src(vp, &vpi->SrcReg[0]);
+	inst[2] = t_src(vp, &vpi->SrcReg[2]); /* NOTE(review): swap presumably maps CMP's */
+	inst[3] = t_src(vp, &vpi->SrcReg[1]); /* "src0 < 0.0" test onto a GTE mux -- confirm */
+}
+
 static void ei_lit(struct r300_vertex_program_code *vp,
 				      struct rc_sub_instruction *vpi,
 				      unsigned int * inst)
@@ -414,6 +444,7 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
 		case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
 		case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;
 		case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
+		case RC_OPCODE_CMP: ei_cmp(compiler->code, vpi, inst); break;
 		case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
 		case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
 		case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
diff --git a/src/gallium/drivers/r300/compiler/radeon_program_alu.c b/src/gallium/drivers/r300/compiler/radeon_program_alu.c
index 314fa7655ee..2e2d75143e1 100644
--- a/src/gallium/drivers/r300/compiler/radeon_program_alu.c
+++ b/src/gallium/drivers/r300/compiler/radeon_program_alu.c
@@ -395,7 +395,18 @@ int radeonTransformALU(
 static void transform_r300_vertex_CMP(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	/* There is no decent CMP available, so let's rig one up.
+	/* R5xx has a CMP, but we can use it only if it reads from fewer than
+	 * three different temporaries. */
+	if (c->is_r500 &&
+	    (inst->U.I.SrcReg[0].File != RC_FILE_TEMPORARY ||
+	     inst->U.I.SrcReg[1].File != RC_FILE_TEMPORARY ||
+	     inst->U.I.SrcReg[2].File != RC_FILE_TEMPORARY ||
+	     inst->U.I.SrcReg[0].Index == inst->U.I.SrcReg[1].Index ||
+	     inst->U.I.SrcReg[1].Index == inst->U.I.SrcReg[2].Index ||
+	     inst->U.I.SrcReg[0].Index == inst->U.I.SrcReg[2].Index))
+		return;
+
+	/* There is no decent CMP available on r300, so let's rig one up.
 	 * CMP is defined as dst = src0 < 0.0 ? src1 : src2
 	 * The following sequence consumes zero to two temps and two extra slots
 	 * (the second temp and the second slot is consumed by transform_LRP),
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 1e920b0cc74..c7086853a8a 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -512,6 +512,7 @@ static int r300_get_video_param(struct pipe_screen *screen,
 
 static const nir_shader_compiler_options r500_vs_compiler_options = {
    COMMON_NIR_OPTIONS,
+   .has_fused_comp_and_csel = true,
 
    /* Have HW loops support and 1024 max instr count, but don't unroll *too*
     * hard.
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index 269281cd235..bd1202be031 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -1952,7 +1952,6 @@ static void* r300_create_vs_state(struct pipe_context* pipe,
            .ubo_vec4_max = 0x00ff,
        };
        static const struct nir_to_tgsi_options hwtcl_r500_options = {
-           .lower_cmp = true,
            .ubo_vec4_max = 0x00ff,
        };
        const struct nir_to_tgsi_options *ntt_options;