diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index 6c9aa7e5117..449eddece2a 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -1511,6 +1511,10 @@ if (!isnormal(dst))
    dst = copysignf(0.0f, src0);
 """)
 
+opcode("ushr_and_pan", 0, tuint, [0, 0, 0], [tuint, tuint, tuint], False, "",
+       "(src0 >> (src1 & (sizeof(src0) * 8 - 1))) & src2",
+       description = "Unsigned right-shift followed by a bitwise AND." + shift_note)
+
 # vc4-specific opcodes
 
 # Saturated vector add for 4 8bit ints.
diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c
index 558ee51abf1..bf5ae914554 100644
--- a/src/panfrost/compiler/bifrost/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost/bifrost_compile.c
@@ -3926,6 +3926,10 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
               instr->op == nir_op_sdot_4x8_iadd_sat);
       break;
    }
+   case nir_op_ushr_and_pan: {
+      bi_rshift_and_to(b, sz, dst, s0, s2, bi_byte(s1, 0), false);
+      break;
+   }
 
    default:
       fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
@@ -5849,6 +5853,9 @@ bi_optimize_nir(nir_shader *nir, unsigned gpu_id, nir_variable_mode robust2_mode
       late_algebraic |= late_algebraic_progress;
    }
 
+   /* Vectorize instructions generated by bifrost_nir_lower_algebraic_late */
+   NIR_PASS(_, nir, nir_opt_vectorize, bi_vectorize_filter, &gpu_id);
+
    while (late_algebraic) {
       late_algebraic = false;
       NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late);
@@ -5858,6 +5865,9 @@ bi_optimize_nir(nir_shader *nir, unsigned gpu_id, nir_variable_mode robust2_mode
       NIR_PASS(_, nir, nir_opt_cse);
    }
 
+   /* Scalarize vectors generated by nir_opt_algebraic_late that are not supported */
+   NIR_PASS(_, nir, nir_lower_alu_width, bi_vectorize_filter, &gpu_id);
+
    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
 
    NIR_PASS(_, nir, nir_opt_dce);
diff --git a/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py b/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py
index 47e8747b4e5..53896ba52a5 100644
--- a/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py
+++ b/src/panfrost/compiler/bifrost/bifrost_nir_algebraic.py
@@ -104,6 +104,13 @@ algebraic_late = [
     # On v11+, because FROUND.v2f16 is gone we end up with precision issues.
     # We lower ffract here instead to ensure lower_bit_size has been performed.
     (('ffract', a), ('fadd', a, ('fneg', ('ffloor', a))), 'gpu_arch >= 11'),
+
+    # Fuse SHIFT and bitwise AND
+    (('iand', ('ushr', 'a@32', 'b@32'), 'c@32'), ('ushr_and_pan', a, b, c)),
+    (('iand', ('ushr', 'a@16', 'b@32'), 'c@16'), ('ushr_and_pan', a, ('u2u16', b), c)),
+    (('iand', ('extract_u8', 'a@16', 1), 15), ('ushr_and_pan', a, 8, 15)),
+    (('iand', 'a@16', 15), ('ushr_and_pan', a, 0, 15)),
+    (('ushr', 'a@16', 12), ('ushr_and_pan', a, 12, 15)),
 ]
 
 # nir_lower_bool_to_bitsize can generate needless conversions.