From 873dfb673b643db7263fc77b3734aa17fa26aecd Mon Sep 17 00:00:00 2001
From: Kenneth Graunke
Date: Mon, 23 Jan 2023 13:52:30 -0800
Subject: [PATCH] anv: Perform load_constant address math in 32 bits rather
 than 64 bits

We lower NIR's load_constant to load_global_constant, which uses A64
bindless messages.  As such, we do the following math to produce the
address for each load:

   base_lo@32 <- BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW
   base_hi@32 <- BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH
   base@64 <- pack_64_2x32_split(base_lo, base_hi)
   addr@64 <- iadd(base@64, u2u64(offset@32))

On platforms that emulate 64-bit math, we have to emit additional code
for the 64-bit iadd to handle a possible carry into the top bits.

However, NIR constant data is always uploaded adjacent to the shader
assembly, in the same buffer.  These buffers are required to live in a
4GB region of memory starting at Instruction State Base Address, and we
always place that base address on a 4GB boundary.  So the constant data
always lives in a buffer entirely contained within a 4GB region, which
means any offsets from the start of the buffer cannot possibly affect
the high bits.

So instead, we can simply do a 32-bit addition between the low bits of
the base and the offset, then pack the result with the unchanged high
bits.

On anv, INSTRUCTION_STATE_POOL_MIN_ADDRESS is 8GB, so the high bits are
always 0x2.  We don't even need to patch that portion of the address
and can just use an immediate value.  We do still need to pack, however.
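
Expressed in the same notation as above, the new address math is
roughly the following (addr_lo@32 is just an illustrative name):

   base_lo@32 <- BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW
   addr_lo@32 <- iadd(base_lo@32, offset@32)
   addr@64 <- pack_64_2x32_split(addr_lo, INSTRUCTION_STATE_POOL_MIN_ADDRESS >> 32)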

fossil-db on Icelake indicates the following for affected shaders:

   Instrs: 10830023 -> 10750080 (-0.74%)
   Cycles: 1048521282 -> 1046770379 (-0.17%); split: -0.33%, +0.16%
   Subgroup size: 103104 -> 103112 (+0.01%)
   Send messages: 570886 -> 570760 (-0.02%)
   Loop count: 14428 -> 14429 (+0.01%)
   Spill count: 14246 -> 14244 (-0.01%); split: -0.06%, +0.04%
   Fill count: 22802 -> 22794 (-0.04%); split: -0.04%, +0.01%
   Scratch Memory Size: 654336 -> 662528 (+1.25%)

Reviewed-by: Lionel Landwerlin
Part-of:
---
 src/intel/vulkan/anv_nir_apply_pipeline_layout.c | 11 ++++++-----
 src/intel/vulkan/anv_pipeline_cache.c            |  5 +----
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
index 8560ef39a75..d3a07fe9c13 100644
--- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
+++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
@@ -1040,13 +1040,14 @@ lower_load_constant(nir_builder *b, nir_intrinsic_instr *intrin,
    unsigned max_offset = b->shader->constant_data_size - load_size;
    offset = nir_umin(b, offset, nir_imm_int(b, max_offset));
 
-   nir_ssa_def *const_data_base_addr = nir_pack_64_2x32_split(b,
-      nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW),
-      nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH));
+   nir_ssa_def *const_data_addr = nir_pack_64_2x32_split(b,
+      nir_iadd(b,
+         nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW),
+         offset),
+      nir_imm_int(b, INSTRUCTION_STATE_POOL_MIN_ADDRESS >> 32));
 
    nir_ssa_def *data =
-      nir_load_global_constant(b, nir_iadd(b, const_data_base_addr,
-                                           nir_u2u64(b, offset)),
+      nir_load_global_constant(b, const_data_addr,
                               load_align,
                               intrin->dest.ssa.num_components,
                               intrin->dest.ssa.bit_size);
diff --git a/src/intel/vulkan/anv_pipeline_cache.c b/src/intel/vulkan/anv_pipeline_cache.c
index d50a84e2a58..6f00095ce59 100644
--- a/src/intel/vulkan/anv_pipeline_cache.c
+++ b/src/intel/vulkan/anv_pipeline_cache.c
@@ -123,10 +123,7 @@ anv_shader_bin_create(struct anv_device *device,
       .id = BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW,
       .value = shader_data_addr,
    };
-   reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
-      .id = BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH,
-      .value = shader_data_addr >> 32,
-   };
+   assert(shader_data_addr >> 32 == INSTRUCTION_STATE_POOL_MIN_ADDRESS >> 32);
    reloc_values[rv_count++] = (struct brw_shader_reloc_value) {
       .id = BRW_SHADER_RELOC_SHADER_START_OFFSET,
       .value = shader->kernel.offset,