diff --git a/src/intel/compiler/brw/brw_compile_gs.cpp b/src/intel/compiler/brw/brw_compile_gs.cpp
index b9c79742fa6..74e5b5f2d6c 100644
--- a/src/intel/compiler/brw/brw_compile_gs.cpp
+++ b/src/intel/compiler/brw/brw_compile_gs.cpp
@@ -288,6 +288,14 @@ brw_compile_gs(const struct brw_compiler *compiler,
    prog_data->output_vertex_size_hwords =
       align(output_vertex_size_bytes, 32) / 32;
 
+   const unsigned starting_urb_offset =
+      2 * prog_data->control_data_header_size_hwords +
+      ((prog_data->static_vertex_count == -1) ? 2 : 0);
+
+   BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, compiler->devinfo,
+                &prog_data->base.vue_map, starting_urb_offset,
+                2 * prog_data->output_vertex_size_hwords);
+
    /* Compute URB entry size. The maximum allowed URB entry size is 32k.
     * That divides up as follows:
     *
diff --git a/src/intel/compiler/brw/brw_compile_tes.cpp b/src/intel/compiler/brw/brw_compile_tes.cpp
index 44c4550772f..6cd8301e22b 100644
--- a/src/intel/compiler/brw/brw_compile_tes.cpp
+++ b/src/intel/compiler/brw/brw_compile_tes.cpp
@@ -40,8 +40,6 @@ run_tes(brw_shader &s)
    if (s.failed)
       return false;
 
-   s.emit_urb_writes();
-
    brw_calculate_cfg(s);
 
    s.emit_tes_terminate();
@@ -132,6 +130,9 @@ brw_compile_tes(const struct brw_compiler *compiler,
 
    brw_postprocess_nir(pt, debug_enabled, key->base.robust_flags);
 
+   BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, devinfo,
+                &prog_data->base.vue_map, 0, 0);
+
    unsigned output_size_bytes = prog_data->base.vue_map.num_slots * 4 * 4;
 
    assert(output_size_bytes >= 1);
diff --git a/src/intel/compiler/brw/brw_compile_vs.cpp b/src/intel/compiler/brw/brw_compile_vs.cpp
index 86cd321912a..549d5199ebb 100644
--- a/src/intel/compiler/brw/brw_compile_vs.cpp
+++ b/src/intel/compiler/brw/brw_compile_vs.cpp
@@ -214,8 +214,6 @@ run_vs(brw_shader &s)
    if (s.failed)
       return false;
 
-   s.emit_urb_writes();
-
    brw_calculate_cfg(s);
 
    ASSERTED bool eot = s.mark_last_urb_write_with_eot();
@@ -300,6 +298,9 @@ brw_compile_vs(const struct brw_compiler *compiler,
 
    brw_postprocess_nir(pt, debug_enabled, key->base.robust_flags);
 
+   BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, compiler->devinfo,
+                &prog_data->base.vue_map, 0, 0);
+
    unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read);
    /* gl_VertexID and gl_InstanceID are system values, but arrive via an
    * incoming vertex attribute. So, add an extra slot.
diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp
index 5cf8d49dcbf..5bf59798b4d 100644
--- a/src/intel/compiler/brw/brw_from_nir.cpp
+++ b/src/intel/compiler/brw/brw_from_nir.cpp
@@ -78,56 +78,6 @@ setup_imm_b(const brw_builder &bld, int8_t v)
    return tmp;
 }
 
-static void
-brw_from_nir_setup_outputs(nir_to_brw_state &ntb)
-{
-   brw_shader &s = ntb.s;
-
-   if (s.stage == MESA_SHADER_TESS_CTRL ||
-       s.stage == MESA_SHADER_TASK ||
-       s.stage == MESA_SHADER_MESH ||
-       s.stage == MESA_SHADER_FRAGMENT ||
-       s.stage == MESA_SHADER_COMPUTE)
-      return;
-
-   unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
-
-   /* Calculate the size of output registers in a separate pass, before
-    * allocating them. With ARB_enhanced_layouts, multiple output variables
-    * may occupy the same slot, but have different type sizes.
-    */
-   nir_foreach_shader_out_variable(var, s.nir) {
-      const int loc = var->data.driver_location;
-      const unsigned var_vec4s = nir_variable_count_slots(var, var->type);
-      vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
-   }
-
-   for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
-      if (vec4s[loc] == 0) {
-         loc++;
-         continue;
-      }
-
-      unsigned reg_size = vec4s[loc];
-
-      /* Check if there are any ranges that start within this range and extend
-       * past it. If so, include them in this allocation.
-       */
-      for (unsigned i = 1; i < reg_size; i++) {
-         assert(i + loc < ARRAY_SIZE(vec4s));
-         reg_size = MAX2(vec4s[i + loc] + i, reg_size);
-      }
-
-      brw_reg reg = ntb.bld.vgrf(BRW_TYPE_F, 4 * reg_size);
-      for (unsigned i = 0; i < reg_size; i++) {
-         assert(loc + i < ARRAY_SIZE(s.outputs));
-         s.outputs[loc + i] = offset(reg, ntb.bld, 4 * i);
-      }
-
-      loc += reg_size;
-   }
-}
-
 static brw_reg
 emit_work_group_id_setup(nir_to_brw_state &ntb)
 {
@@ -2647,8 +2597,6 @@ emit_gs_vertex(nir_to_brw_state &ntb, const nir_src &vertex_count_nir_src,
       abld.emit(BRW_OPCODE_ENDIF);
    }
 
-   s.emit_urb_writes(vertex_count);
-
    /* In stream mode we have to set control data bits for all vertices
    * unless we have disabled control data bits completely (which we do
    * do for MESA_PRIM_POINTS outputs that don't use streams).
@@ -3143,13 +3091,6 @@ brw_from_nir_emit_gs_intrinsic(nir_to_brw_state &ntb,
 
    case nir_intrinsic_emit_vertex_with_counter:
       emit_gs_vertex(ntb, instr->src[0], nir_intrinsic_stream_id(instr));
-
-      /* After an EmitVertex() call, the values of all outputs are undefined.
-       * If this is not in control flow, recreate a fresh set of output
-       * registers to keep their live ranges separate.
-       */
-      if (instr->instr.block->cf_node.parent->type == nir_cf_node_function)
-         brw_from_nir_setup_outputs(ntb);
       break;
 
    case nir_intrinsic_end_primitive_with_counter:
@@ -5520,22 +5461,6 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
       break;
    }
 
-   case nir_intrinsic_store_output: {
-      assert(nir_src_bit_size(instr->src[0]) == 32);
-      brw_reg src = get_nir_src(ntb, instr->src[0], -1);
-
-      unsigned store_offset = nir_src_as_uint(instr->src[1]);
-      unsigned num_components = instr->num_components;
-      unsigned first_component = nir_intrinsic_component(instr);
-
-      brw_reg new_dest = retype(offset(s.outputs[instr->const_index[0]], bld,
-                                       4 * store_offset), src.type);
-
-      brw_combine_with_vec(bld, offset(new_dest, bld, first_component),
-                           src, num_components);
-      break;
-   }
-
    case nir_intrinsic_load_subgroup_size:
      /* This should only happen for fragment shaders because every other case
      * is lowered in NIR so we can optimize on it.
@@ -6949,7 +6874,6 @@ brw_from_nir(brw_shader *s)
    /* emit the arrays used for inputs and outputs - load/store intrinsics will
    * be converted to reads/writes of these arrays
    */
-   brw_from_nir_setup_outputs(ntb);
    brw_from_nir_emit_system_values(ntb);
 
    ntb.s.last_scratch = align(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width;
diff --git a/src/intel/compiler/brw/brw_nir.c b/src/intel/compiler/brw/brw_nir.c
index 7a0bb1975a4..d6fec632c03 100644
--- a/src/intel/compiler/brw/brw_nir.c
+++ b/src/intel/compiler/brw/brw_nir.c
@@ -463,6 +463,159 @@ brw_nir_lower_outputs_to_urb_intrinsics(nir_shader *nir,
                                        nir_metadata_control_flow, (void *) cd);
 }
 
+/* See if comps[0..3] has any non-undef values.
+ */
+static bool
+slot_defined(nir_scalar *comps)
+{
+   for (unsigned i = 0; i < 4; i++) {
+      if (comps[i].def && !nir_def_is_undef(comps[i].def))
+         return true;
+   }
+   return false;
+}
+
+/* Replace any NULL defs in comps[0..(n-1)] with undef */
+static void
+fill_undefs(nir_scalar *comps, nir_def *undef, unsigned n)
+{
+   for (unsigned i = 0; i < n; i++) {
+      if (!comps[i].def)
+         comps[i] = nir_get_scalar(undef, 0);
+   }
+}
+
+static void
+emit_urb_writes(nir_builder *b,
+                const struct intel_device_info *devinfo,
+                nir_scalar *outputs,
+                unsigned num_slots,
+                nir_def *offset)
+{
+   nir_def *undef = nir_undef(b, 1, 32);
+
+   /* Primitive Shading Rate defaults to (1, 1) in half-float */
+   if (devinfo->has_coarse_pixel_primitive_and_cb && !outputs[0].def)
+      outputs[0] = nir_get_scalar(nir_imm_int(b, 0x3C003C00), 0);
+
+   /* Viewport, Layer, and Point Size default to 0 */
+   for (unsigned i = 0; i < 4; i++) {
+      if (!outputs[i].def)
+         outputs[i] = nir_get_scalar(nir_imm_int(b, 0), 0);
+   }
+
+   /* Emit URB writes */
+   for (unsigned slot = 0; slot < num_slots; slot++) {
+      if (!slot_defined(&outputs[4 * slot]))
+         continue;
+
+      const bool vec8 = slot + 1 < num_slots &&
+                        slot_defined(&outputs[4 * (slot + 1)]);
+
+      fill_undefs(&outputs[4 * slot], undef, vec8 ? 8 : 4);
+
+      nir_def *val = nir_vec_scalars(b, &outputs[4 * slot], vec8 ? 8 : 4);
+
+      if (devinfo->ver >= 20) {
+         nir_def *addr = nir_iadd(b, output_handle(b),
+                                  nir_imul_imm(b, offset, 16));
+         nir_store_urb_lsc_intel(b, val, addr, .base = 16 * slot);
+      } else {
+         nir_store_urb_vec4_intel(b, val, output_handle(b), offset,
+                                  nir_imm_int(b, vec8 ? 0xff : 0xf),
+                                  .base = slot);
+      }
+
+      if (vec8)
+         slot++;
+   }
+}
+
+bool
+brw_nir_lower_deferred_urb_writes(nir_shader *nir,
+                                  const struct intel_device_info *devinfo,
+                                  const struct intel_vue_map *vue_map,
+                                  unsigned extra_urb_slot_offset,
+                                  unsigned gs_vertex_stride)
+{
+   nir_scalar *outputs = calloc(vue_map->num_slots, 4 * sizeof(nir_scalar));
+
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+         switch (intrin->intrinsic) {
+         case nir_intrinsic_store_output:
+         case nir_intrinsic_store_per_view_output: {
+            nir_src *view_index = nir_get_io_arrayed_index_src(intrin);
+            nir_src *offset = nir_get_io_offset_src(intrin);
+
+            const nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
+            const unsigned slot =
+               vue_map->varying_to_slot[sem.location] +
+               (view_index ? nir_src_as_uint(*view_index) : 0) +
+               nir_src_as_uint(*offset);
+            const unsigned c = io_component(intrin, NULL);
+            const unsigned mask = nir_intrinsic_write_mask(intrin);
+            assert(slot != -1);
+            assert(c < 4);
+
+            u_foreach_bit(i, mask) {
+               outputs[4 * slot + c + i] =
+                  nir_scalar_resolved(intrin->src[0].ssa, i);
+            }
+
+            nir_instr_remove(instr);
+            break;
+         }
+
+         case nir_intrinsic_emit_vertex_with_counter: {
+            /* The only purpose of primitives sent to non-zero streams
+             * is to be recorded by transform feedback, so if it is disabled,
+             * we can discard all geometry bound for those streams.
+             */
+            if (nir_intrinsic_stream_id(intrin) > 0 &&
+                !nir->info.has_transform_feedback_varyings) {
+               nir_instr_remove(instr);
+               break;
+            }
+
+            nir_builder b = nir_builder_at(nir_before_instr(instr));
+            b.constant_fold_alu = true;
+            nir_def *offset =
+               nir_iadd_imm(&b, nir_imul_imm(&b, intrin->src[0].ssa,
+                                             gs_vertex_stride),
+                            extra_urb_slot_offset);
+
+            emit_urb_writes(&b, devinfo, outputs, vue_map->num_slots, offset);
+            /* After EmitVertex() all outputs are undefined */
+            memset(outputs, 0, 4 * vue_map->num_slots * sizeof(nir_scalar));
+
+            /* Leave emit_vertex_with_counter for control data writes */
+            break;
+         }
+
+         default:
+            break;
+         }
+      }
+   }
+
+   if (nir->info.stage != MESA_SHADER_GEOMETRY) {
+      nir_builder b = nir_builder_at(nir_after_impl(impl));
+      emit_urb_writes(&b, devinfo, outputs, vue_map->num_slots,
+                      nir_imm_int(&b, 0));
+   }
+
+   free(outputs);
+
+   return nir_progress(true, impl, nir_metadata_control_flow);
+}
+
+
 static bool
 lower_task_payload_to_urb(nir_builder *b, nir_intrinsic_instr *io, void *data)
 {
@@ -735,60 +888,6 @@ remap_tess_levels(nir_shader *nir,
                                      nir_metadata_control_flow, &cb);
 }
 
-/* Replace store_per_view_output to plain store_output, mapping the view index
- * to IO offset. Because we only use per-view outputs for position, the offset
- * pitch is always 1. */
-static bool
-lower_per_view_outputs(nir_builder *b,
-                       nir_intrinsic_instr *intrin,
-                       UNUSED void *cb_data)
-{
-   if (intrin->intrinsic != nir_intrinsic_store_per_view_output &&
-       intrin->intrinsic != nir_intrinsic_load_per_view_output)
-      return false;
-
-   b->cursor = nir_before_instr(&intrin->instr);
-
-   nir_src *view_index = nir_get_io_arrayed_index_src(intrin);
-   nir_src *offset = nir_get_io_offset_src(intrin);
-
-   nir_def *new_offset = nir_iadd(b, view_index->ssa, offset->ssa);
-
-   nir_intrinsic_instr *new;
-   if (intrin->intrinsic == nir_intrinsic_store_per_view_output)
-      new = nir_store_output(b, intrin->src[0].ssa, new_offset);
-   else {
-      nir_def *new_def = nir_load_output(b, intrin->def.num_components,
-                                         intrin->def.bit_size, new_offset);
-      new = nir_def_as_intrinsic(new_def);
-   }
-
-   nir_intrinsic_set_base(new, nir_intrinsic_base(intrin));
-   nir_intrinsic_set_range(new, nir_intrinsic_range(intrin));
-   nir_intrinsic_set_write_mask(new, nir_intrinsic_write_mask(intrin));
-   nir_intrinsic_set_component(new, nir_intrinsic_component(intrin));
-   nir_intrinsic_set_src_type(new, nir_intrinsic_src_type(intrin));
-
-   nir_io_semantics sem = nir_intrinsic_io_semantics(intrin);
-   /* the meaning of the offset src is different for brw */
-   sem.no_validate = 1;
-   nir_intrinsic_set_io_semantics(new, sem);
-
-   if (intrin->intrinsic == nir_intrinsic_load_per_view_output)
-      nir_def_rewrite_uses(&intrin->def, &new->def);
-   nir_instr_remove(&intrin->instr);
-
-   return true;
-}
-
-static bool
-brw_nir_lower_per_view_outputs(nir_shader *nir)
-{
-   return nir_shader_intrinsics_pass(nir, lower_per_view_outputs,
-                                     nir_metadata_control_flow,
-                                     NULL);
-}
-
 static bool
 brw_nir_should_vectorize_urb(unsigned align_mul, unsigned align_offset,
                              unsigned bit_size,
@@ -1306,13 +1405,8 @@ brw_nir_lower_fs_inputs(nir_shader *nir,
 void
 brw_nir_lower_vue_outputs(nir_shader *nir)
 {
-   nir_foreach_shader_out_variable(var, nir) {
-      var->data.driver_location = var->data.location;
-   }
-
    NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out, type_size_vec4,
            nir_lower_io_lower_64bit_to_32);
-   NIR_PASS(_, nir, brw_nir_lower_per_view_outputs);
 }
 
 void
diff --git a/src/intel/compiler/brw/brw_nir.h b/src/intel/compiler/brw/brw_nir.h
index 978ad981e87..81faeac06a0 100644
--- a/src/intel/compiler/brw/brw_nir.h
+++ b/src/intel/compiler/brw/brw_nir.h
@@ -219,6 +219,11 @@ struct brw_lower_urb_cb_data {
 
 bool brw_nir_lower_inputs_to_urb_intrinsics(nir_shader *, const struct brw_lower_urb_cb_data *);
 bool brw_nir_lower_outputs_to_urb_intrinsics(nir_shader *, const struct brw_lower_urb_cb_data *);
+bool brw_nir_lower_deferred_urb_writes(nir_shader *nir,
+                                       const struct intel_device_info *devinfo,
+                                       const struct intel_vue_map *vue_map,
+                                       unsigned extra_urb_slot_offset,
+                                       unsigned gs_vertex_stride);
 
 void brw_nir_opt_vectorize_urb(struct brw_pass_tracker *pt);
 
diff --git a/src/intel/compiler/brw/brw_shader.cpp b/src/intel/compiler/brw/brw_shader.cpp
index 482df008755..bd2679b5f7d 100644
--- a/src/intel/compiler/brw/brw_shader.cpp
+++ b/src/intel/compiler/brw/brw_shader.cpp
@@ -19,257 +19,6 @@
 #include "compiler/nir/nir_builder.h"
 #include "util/u_math.h"
 
-void
-brw_shader::emit_urb_writes(const brw_reg &gs_vertex_count)
-{
-   int slot, urb_offset, length;
-   int starting_urb_offset = 0;
-   const struct brw_vue_prog_data *vue_prog_data =
-      brw_vue_prog_data(this->prog_data);
-   const struct intel_vue_map *vue_map = &vue_prog_data->vue_map;
-   bool flush;
-   brw_reg sources[8];
-   brw_reg urb_handle;
-
-   switch (stage) {
-   case MESA_SHADER_VERTEX:
-      urb_handle = vs_payload().urb_handles;
-      break;
-   case MESA_SHADER_TESS_EVAL:
-      urb_handle = tes_payload().urb_output;
-      break;
-   case MESA_SHADER_GEOMETRY:
-      urb_handle = gs_payload().urb_handles;
-      break;
-   default:
-      UNREACHABLE("invalid stage");
-   }
-
-   const brw_builder bld = brw_builder(this);
-
-   brw_reg per_slot_offsets;
-
-   if (stage == MESA_SHADER_GEOMETRY) {
-      const struct brw_gs_prog_data *gs_prog_data =
-         brw_gs_prog_data(this->prog_data);
-
-      /* We need to increment the Global Offset to skip over the control data
-       * header and the extra "Vertex Count" field (1 HWord) at the beginning
-       * of the VUE. We're counting in OWords, so the units are doubled.
-       */
-      starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
-      if (gs_prog_data->static_vertex_count == -1)
-         starting_urb_offset += 2;
-
-      /* The URB offset is in 128-bit units, so we need to multiply by 2 */
-      const int output_vertex_size_owords =
-         gs_prog_data->output_vertex_size_hwords * 2;
-
-      /* On Xe2+ platform, LSC can operate on the Dword data element with byte
-       * offset granularity, so convert per slot offset in bytes since it's in
-       * Owords (16-bytes) unit else keep per slot offset in oword unit for
-       * previous platforms.
-       */
-      const int output_vertex_size = devinfo->ver >= 20 ?
-         output_vertex_size_owords * 16 :
-         output_vertex_size_owords;
-      if (gs_vertex_count.file == IMM) {
-         per_slot_offsets = brw_imm_ud(output_vertex_size *
-                                       gs_vertex_count.ud);
-      } else {
-         per_slot_offsets = bld.vgrf(BRW_TYPE_UD);
-         bld.MUL(per_slot_offsets, gs_vertex_count,
-                 brw_imm_ud(output_vertex_size));
-      }
-   }
-
-   length = 0;
-   urb_offset = starting_urb_offset;
-   flush = false;
-
-   /* SSO shaders can have VUE slots allocated which are never actually
-    * written to, so ignore them when looking for the last (written) slot.
-    */
-   int last_slot = vue_map->num_slots - 1;
-   while (last_slot > 0 &&
-          (vue_map->slot_to_varying[last_slot] == BRW_VARYING_SLOT_PAD ||
-           outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) {
-      last_slot--;
-   }
-
-   bool urb_written = false;
-   for (slot = 0; slot < vue_map->num_slots; slot++) {
-      int varying = vue_map->slot_to_varying[slot];
-      switch (varying) {
-      case VARYING_SLOT_PSIZ: {
-         /* The point size varying slot is the vue header and is always in the
-          * vue map. If anything in the header is going to be read back by HW,
-          * we need to initialize it, in particular the viewport & layer
-          * values.
-          *
-          * SKL PRMs, Volume 7: 3D-Media-GPGPU, Vertex URB Entry (VUE)
-          * Formats:
-          *
-          *    "VUEs are written in two ways:
-          *
-          *     - At the top of the 3D Geometry pipeline, the VF's
-          *       InputAssembly function creates VUEs and initializes them
-          *       from data extracted from Vertex Buffers as well as
-          *       internally generated data.
-          *
-          *     - VS, GS, HS and DS threads can compute, format, and write
-          *       new VUEs as thread output."
-          *
-          *    "Software must ensure that any VUEs subject to readback by the
-          *     3D pipeline start with a valid Vertex Header. This extends to
-          *     all VUEs with the following exceptions:
-          *
-          *     - If the VS function is enabled, the VF-written VUEs are not
-          *       required to have Vertex Headers, as the VS-incoming
-          *       vertices are guaranteed to be consumed by the VS (i.e.,
-          *       the VS thread is responsible for overwriting the input
-          *       vertex data).
-          *
-          *     - If the GS FF is enabled, neither VF-written VUEs nor VS
-          *       thread-generated VUEs are required to have Vertex Headers,
-          *       as the GS will consume all incoming vertices.
-          *
-          *     - If Rendering is disabled, VertexHeaders are not required
-          *       anywhere."
-          */
-         brw_reg zero =
-            retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
-         bld.MOV(zero, brw_imm_ud(0u));
-
-         if (vue_map->slots_valid & VARYING_BIT_PRIMITIVE_SHADING_RATE &&
-             this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE].file != BAD_FILE) {
-            sources[length++] = this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE];
-         } else if (devinfo->has_coarse_pixel_primitive_and_cb) {
-            uint32_t one_fp16 = 0x3C00;
-            brw_reg one_by_one_fp16 =
-               retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
-            bld.MOV(one_by_one_fp16, brw_imm_ud((one_fp16 << 16) | one_fp16));
-            sources[length++] = one_by_one_fp16;
-         } else {
-            sources[length++] = zero;
-         }
-
-         if (vue_map->slots_valid & VARYING_BIT_LAYER)
-            sources[length++] = this->outputs[VARYING_SLOT_LAYER];
-         else
-            sources[length++] = zero;
-
-         if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
-            sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
-         else
-            sources[length++] = zero;
-
-         if (vue_map->slots_valid & VARYING_BIT_PSIZ)
-            sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
-         else
-            sources[length++] = zero;
-         break;
-      }
-      case VARYING_SLOT_EDGE:
-         UNREACHABLE("unexpected scalar vs output");
-         break;
-
-      default:
-         /* gl_Position is always in the vue map, but isn't always written by
-          * the shader. Other varyings (clip distances) get added to the vue
-          * map but don't always get written. In those cases, the
-          * corresponding this->output[] slot will be invalid we and can skip
-          * the urb write for the varying. If we've already queued up a vue
-          * slot for writing we flush a mlen 5 urb write, otherwise we just
-          * advance the urb_offset.
-          */
-         if (varying == BRW_VARYING_SLOT_PAD ||
-             this->outputs[varying].file == BAD_FILE) {
-            if (length > 0)
-               flush = true;
-            else
-               urb_offset++;
-            break;
-         }
-
-         int slot_offset = 0;
-
-         /* When using Primitive Replication, there may be multiple slots
-          * assigned to POS.
-          */
-         if (varying == VARYING_SLOT_POS)
-            slot_offset = slot - vue_map->varying_to_slot[VARYING_SLOT_POS];
-
-         for (unsigned i = 0; i < 4; i++) {
-            sources[length++] = offset(this->outputs[varying], bld,
-                                       i + (slot_offset * 4));
-         }
-         break;
-      }
-
-      const brw_builder abld = bld.annotate("URB write");
-
-      /* If we've queued up 8 registers of payload (2 VUE slots), if this is
-       * the last slot or if we need to flush (see BAD_FILE varying case
-       * above), emit a URB write send now to flush out the data.
-       */
-      if (length == 8 || (length > 0 && slot == last_slot))
-         flush = true;
-      if (flush) {
-         brw_reg srcs[URB_LOGICAL_NUM_SRCS];
-
-         srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
-         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offsets;
-         srcs[URB_LOGICAL_SRC_DATA] =
-            retype(brw_allocate_vgrf_units(*this, (dispatch_width / 8) * length), BRW_TYPE_F);
-         abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
-
-         brw_urb_inst *urb = abld.URB_WRITE(srcs, ARRAY_SIZE(srcs));
-         urb->components = length;
-         urb->offset = urb_offset * (devinfo->ver >= 20 ? 16 : 1);
-         urb_offset = starting_urb_offset + slot + 1;
-         length = 0;
-         flush = false;
-         urb_written = true;
-      }
-   }
-
-   /* If we don't have any valid slots to write, just do a minimal urb write
-    * send to terminate the shader. This includes 1 slot of undefined data,
-    * because it's invalid to write 0 data:
-    *
-    * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
-    * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
-    * Write Data Payload:
-    *
-    *    "The write data payload can be between 1 and 8 message phases long."
-    */
-   if (!urb_written) {
-      /* For GS, just turn EmitVertex() into a no-op. We don't want it to
-       * end the thread, and emit_gs_thread_end() already emits a SEND with
-       * EOT at the end of the program for us.
-       */
-      if (stage == MESA_SHADER_GEOMETRY)
-         return;
-
-      brw_reg uniform_urb_handle =
-         retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
-      brw_reg payload =
-         retype(brw_allocate_vgrf_units(*this, dispatch_width / 8), BRW_TYPE_UD);
-
-      bld.exec_all().MOV(uniform_urb_handle, urb_handle);
-
-      brw_reg srcs[URB_LOGICAL_NUM_SRCS];
-      srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
-      srcs[URB_LOGICAL_SRC_DATA] = payload;
-
-      brw_urb_inst *urb = bld.URB_WRITE(srcs, ARRAY_SIZE(srcs));
-      urb->offset = devinfo->ver >= 20 ? 16 : 1;
-      urb->components = 1;
-      return;
-   }
-}
-
 void
 brw_shader::emit_tes_terminate()
 {
diff --git a/src/intel/compiler/brw/brw_shader.h b/src/intel/compiler/brw/brw_shader.h
index 200bdf86aab..fa9f62764f2 100644
--- a/src/intel/compiler/brw/brw_shader.h
+++ b/src/intel/compiler/brw/brw_shader.h
@@ -88,7 +88,6 @@ public:
 
    void fail(const char *msg, ...);
    void limit_dispatch_width(unsigned n, const char *msg);
-   void emit_urb_writes(const brw_reg &gs_vertex_count = brw_reg());
   void emit_gs_control_data_bits(const brw_reg &vertex_count);
   brw_reg gs_urb_channel_mask(const brw_reg &dword_index);
   brw_reg gs_urb_per_slot_dword_index(const brw_reg &vertex_count);