Mirror of https://gitlab.freedesktop.org/mesa/mesa.git (synced 2026-01-09 23:30:13 +01:00)
treewide: Replace calling to function ALIGN with align
This is done by replacing ALIGN( with align( via grep; docs, *.xml, and blake3 are excluded.

Signed-off-by: Yonggang Luo <luoyonggang@gmail.com>
Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38365>
parent 03a32b3fe2
commit ecb0ccf603

119 changed files with 246 additions and 246 deletions
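Both spellings compute the same power-of-two round-up; the commit only switches callers from the ALIGN() macro to the align() inline helper, which is type-checked and evaluates each argument once. Below is a minimal standalone sketch of the pattern being swapped; it is illustrative only, and the actual Mesa helpers live in the util headers and may differ in parameter types and asserts.

#include <assert.h>
#include <stdint.h>

/* Macro form (old spelling): expands textually and may evaluate
 * its arguments more than once. Alignment must be a power of two. */
#define ALIGN(value, alignment) \
   (((value) + (alignment) - 1) & ~((alignment) - 1))

/* Inline-function form (new spelling): type-checked, evaluates each
 * argument exactly once. Hypothetical signature for illustration. */
static inline uint64_t
align(uint64_t value, uint64_t alignment)
{
   assert(alignment != 0 && (alignment & (alignment - 1)) == 0);
   return (value + alignment - 1) & ~(alignment - 1);
}

int main(void)
{
   /* Both forms agree for power-of-two alignments, e.g. the 256-byte
    * task-ring alignment in the first hunk below: 300 -> 512. */
   assert(ALIGN(300u, 256u) == align(300, 256));
   return 0;
}

In the hunks that follow, only the call spelling changes; every argument list is untouched.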
@@ -2317,8 +2317,8 @@ void ac_get_task_info(const struct radeon_info *info,
 /* Ensure that the addresses of each ring are 256 byte aligned. */
 task_info->payload_entry_size = payload_entry_size;
 task_info->num_entries = num_entries;
-task_info->draw_ring_offset = ALIGN(AC_TASK_CTRLBUF_BYTES, 256);
-task_info->payload_ring_offset = ALIGN(task_info->draw_ring_offset + draw_ring_bytes, 256);
+task_info->draw_ring_offset = align(AC_TASK_CTRLBUF_BYTES, 256);
+task_info->payload_ring_offset = align(task_info->draw_ring_offset + draw_ring_bytes, 256);
 task_info->bo_size_bytes = task_info->payload_ring_offset + payload_ring_bytes;
 }
@@ -1046,7 +1046,7 @@ ac_sqtt_dump_data(const struct radeon_info *rad_info, struct ac_sqtt_trace *sqtt
 sizeof(struct sqtt_code_object_database_record),
 record, &elf_size_calc, flags);
 /* Align to 4 bytes per the RGP file spec. */
-code_object_record.size = ALIGN(elf_size_calc, 4);
+code_object_record.size = align(elf_size_calc, 4);
 fseek(output, file_offset, SEEK_SET);
 fwrite(&code_object_record, sizeof(struct sqtt_code_object_database_record),
 1, output);
@@ -245,7 +245,7 @@ ac_rgp_write_msgpack(FILE *output,
 }
 }
 ac_msgpack_resize_if_required(&msgpack, 4 - (msgpack.offset % 4));
-msgpack.offset = ALIGN(msgpack.offset, 4);
+msgpack.offset = align(msgpack.offset, 4);
 fwrite(msgpack.mem, 1, msgpack.offset, output);
 *written_size = msgpack.offset;
 ac_msgpack_destroy(&msgpack);
@@ -321,7 +321,7 @@ ac_rgp_file_write_elf_text(FILE *output, uint32_t *elf_size_calc,
 }
 symbol_offset += rgp_shader_data->code_size;
-uint32_t aligned = ALIGN(symbol_offset, 256) - symbol_offset;
+uint32_t aligned = align(symbol_offset, 256) - symbol_offset;
 fseek(output, aligned, SEEK_CUR);
 *elf_size_calc += aligned;
 *text_size = symbol_offset + aligned;
@@ -1393,7 +1393,7 @@ radv_fill_code_object_record(struct radv_device *device, struct rgp_shader_data
 shader_data->vgpr_count = shader->config.num_vgprs;
 shader_data->sgpr_count = shader->config.num_sgprs;
 shader_data->scratch_memory_size = shader->config.scratch_bytes_per_wave;
-shader_data->lds_size = ALIGN(shader->config.lds_size, lds_increment);
+shader_data->lds_size = align(shader->config.lds_size, lds_increment);
 shader_data->wavefront_size = shader->info.wave_size;
 shader_data->base_address = va & 0xffffffffffff;
 shader_data->elf_symbol_offset = 0;
@@ -162,7 +162,7 @@ radv_get_acceleration_structure_layout(struct radv_device *device,
 offset += bvh_size / 64 * 4;
 /* The BVH and hence bvh_offset needs 64 byte alignment for RT nodes. */
-offset = ALIGN(offset, 64);
+offset = align(offset, 64);
 accel_struct->bvh_offset = offset;
 /* root node */
@@ -543,7 +543,7 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
 if (radv_shader_should_clear_lds(device, stage->nir)) {
 const unsigned chunk_size = 16; /* max single store size */
-const unsigned shared_size = ALIGN(stage->nir->info.shared_size, chunk_size);
+const unsigned shared_size = align(stage->nir->info.shared_size, chunk_size);
 NIR_PASS(_, stage->nir, nir_clear_shared_memory, shared_size, chunk_size);
 }
@@ -146,9 +146,9 @@ radv_sparse_image_bind_memory(struct radv_device *device, const VkSparseImageMem
 (uint64_t)bind_offset.x * surface->prt_tile_height * surface->prt_tile_depth) *
 bs;
-uint32_t aligned_extent_width = ALIGN(bind_extent.width, surface->prt_tile_width);
-uint32_t aligned_extent_height = ALIGN(bind_extent.height, surface->prt_tile_height);
-uint32_t aligned_extent_depth = ALIGN(bind_extent.depth, surface->prt_tile_depth);
+uint32_t aligned_extent_width = align(bind_extent.width, surface->prt_tile_width);
+uint32_t aligned_extent_height = align(bind_extent.height, surface->prt_tile_height);
+uint32_t aligned_extent_depth = align(bind_extent.depth, surface->prt_tile_depth);
 bool whole_subres = (bind_extent.height <= surface->prt_tile_height || aligned_extent_width == pitch) &&
 (bind_extent.depth <= surface->prt_tile_depth ||
@@ -134,7 +134,7 @@ radv_sdma_get_chunked_copy_info(const struct radv_device *const device, const st
 {
 const unsigned extent_horizontal_blocks = DIV_ROUND_UP(extent.width * img->texel_scale, img->blk_w);
 const unsigned extent_vertical_blocks = DIV_ROUND_UP(extent.height, img->blk_h);
-const unsigned aligned_row_pitch = ALIGN(extent_horizontal_blocks, 4);
+const unsigned aligned_row_pitch = align(extent_horizontal_blocks, 4);
 const unsigned aligned_row_bytes = aligned_row_pitch * img->bpp;
 /* Assume that we can always copy at least one full row at a time. */
@@ -760,7 +760,7 @@ radv_shader_spirv_to_nir(struct radv_device *device, const struct radv_shader_st
 if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
 const unsigned chunk_size = 16; /* max single store size */
-const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
+const unsigned shared_size = align(nir->info.shared_size, chunk_size);
 NIR_PASS(_, nir, nir_zero_initialize_shared_memory, shared_size, chunk_size);
 }
 }
@@ -945,7 +945,7 @@ radv_lower_ngg(struct radv_device *device, struct radv_shader_stage *ngg_stage,
 &ngg_stage->info.ngg_lds_scratch_size);
 } else if (nir->info.stage == MESA_SHADER_MESH) {
 /* ACO aligns the workgroup size to the wave size. */
-unsigned hw_workgroup_size = ALIGN(info->workgroup_size, info->wave_size);
+unsigned hw_workgroup_size = align(info->workgroup_size, info->wave_size);
 bool scratch_ring = false;
 NIR_PASS(_, nir, ac_nir_lower_ngg_mesh, &pdev->info, options.export_clipdist_mask, options.vs_output_param_offset,
@@ -620,7 +620,7 @@ radv_get_shader_binary_size(const struct radv_shader_binary *binary)
 size_t size = sizeof(uint32_t); /* has_binary */
 if (binary)
-size += SHA1_DIGEST_LENGTH + 4 + ALIGN(binary->total_size, 4);
+size += SHA1_DIGEST_LENGTH + 4 + align(binary->total_size, 4);
 return size;
 }
@@ -3450,7 +3450,7 @@ agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl,
 */
 if (ctx->any_scratch) {
 assert(!ctx->is_preamble && "preambles don't use scratch");
-ctx->scratch_size_B = ALIGN(nir->scratch_size, 16);
+ctx->scratch_size_B = align(nir->scratch_size, 16);
 }
 /* Stop the main shader or preamble shader after the exit block. For real
@@ -3509,7 +3509,7 @@ agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl,
 if (ctx->scratch_size_B > 0) {
 /* Apple always allocate 40 more bytes in the entrypoint and align to 4. */
-uint64_t stack_size = ALIGN(DIV_ROUND_UP(ctx->scratch_size_B, 4) + 10, 4);
+uint64_t stack_size = align(DIV_ROUND_UP(ctx->scratch_size_B, 4) + 10, 4);
 assert(stack_size < INT16_MAX);
@@ -339,10 +339,10 @@ static inline bool
 ail_is_level_allocated_compressed(const struct ail_layout *layout,
 unsigned level)
 {
-unsigned width_sa = ALIGN(
+unsigned width_sa = align(
 ail_effective_width_sa(layout->width_px, layout->sample_count_sa), 16);
-unsigned height_sa = ALIGN(
+unsigned height_sa = align(
 ail_effective_height_sa(layout->height_px, layout->sample_count_sa), 16);
 return layout->compressed &&
@@ -80,7 +80,7 @@ hk_descriptor_stride_align_for_type(
 *stride = MAX2(*stride, desc_stride);
 *alignment = MAX2(*alignment, desc_align);
 }
-*stride = ALIGN(*stride, *alignment);
+*stride = align(*stride, *alignment);
 break;
 default:
@@ -2093,7 +2093,7 @@ xfb_decl_store(struct xfb_decl *xfb_decl, const struct gl_constants *consts,
 if (max_member_alignment && has_xfb_qualifiers) {
 max_member_alignment[buffer] = MAX2(max_member_alignment[buffer],
 _mesa_gl_datatype_is_64bit(xfb_decl->type) ? 2 : 1);
-info->Buffers[buffer].Stride = ALIGN(xfb_offset,
+info->Buffers[buffer].Stride = align(xfb_offset,
 max_member_alignment[buffer]);
 } else {
 info->Buffers[buffer].Stride = xfb_offset;
@@ -2926,7 +2926,7 @@ varying_matches_assign_locations(struct varying_matches *vm,
 (previous_packing_class != vm->matches[i].packing_class) ||
 (vm->matches[i].packing_order == PACKING_ORDER_VEC3 &&
 dont_pack_vec3)) {
-*location = ALIGN(*location, 4);
+*location = align(*location, 4);
 }
 previous_var_xfb = var->data.is_xfb;
@@ -2967,7 +2967,7 @@ varying_matches_assign_locations(struct varying_matches *vm,
 break;
 }
-*location = ALIGN(*location + 1, 4);
+*location = align(*location + 1, 4);
 slot_end = *location + num_components - 1;
 }
@@ -3229,9 +3229,9 @@ tfeedback_candidate_generator(struct tfeedback_candidate_generator_state *state,
 * (c) each double-precision variable captured must be aligned to a
 * multiple of eight bytes relative to the beginning of a vertex.
 */
-state->xfb_offset_floats = ALIGN(state->xfb_offset_floats, 2);
+state->xfb_offset_floats = align(state->xfb_offset_floats, 2);
 /* 64-bit members of structs are also aligned. */
-state->varying_floats = ALIGN(state->varying_floats, 2);
+state->varying_floats = align(state->varying_floats, 2);
 }
 candidate->xfb_offset_floats = state->xfb_offset_floats;
@@ -586,7 +586,7 @@ spill_ssa_defs_and_lower_shader_calls(nir_shader *shader, uint32_t num_calls,
 def = nir_b2b32(&before, def);
 const unsigned comp_size = def->bit_size / 8;
-offset = ALIGN(offset, comp_size);
+offset = align(offset, comp_size);
 new_def = spill_fill(&before, &after, def,
 index, call_idx,
@@ -617,7 +617,7 @@ spill_ssa_defs_and_lower_shader_calls(nir_shader *shader, uint32_t num_calls,
 nir_builder *b = &before;
-offset = ALIGN(offset, options->stack_alignment);
+offset = align(offset, options->stack_alignment);
 max_scratch_size = MAX2(max_scratch_size, offset);
 /* First thing on the called shader's stack is the resume address
@@ -1642,11 +1642,11 @@ nir_opt_sort_and_pack_stack(nir_shader *shader,
 unsigned scratch_size = start_call_scratch;
 util_dynarray_foreach(&ops, struct scratch_item, item) {
-item->new_offset = ALIGN(scratch_size, item->bit_size / 8);
+item->new_offset = align(scratch_size, item->bit_size / 8);
 scratch_size = item->new_offset + (item->bit_size * item->num_components) / 8;
 _mesa_hash_table_u64_insert(value_id_to_item, item->value, item);
 }
-shader->scratch_size = ALIGN(scratch_size, stack_alignment);
+shader->scratch_size = align(scratch_size, stack_alignment);
 /* Update offsets in the instructions */
 nir_foreach_block_safe(block, impl) {
@@ -132,7 +132,7 @@ static bool
 nir_lower_nv_task_count(nir_shader *shader)
 {
 lower_task_nv_state state = {
-.task_count_shared_addr = ALIGN(shader->info.shared_size, 4),
+.task_count_shared_addr = align(shader->info.shared_size, 4),
 };
 shader->info.shared_size += 4;
@@ -277,7 +277,7 @@ emit_shared_to_payload_copy(nir_builder *b,
 off += remaining_dwords * 4;
 }
-assert(s->payload_offset_in_bytes + ALIGN(payload_size, 4) == off);
+assert(s->payload_offset_in_bytes + align(payload_size, 4) == off);
 }
 static bool
@@ -453,7 +453,7 @@ nir_lower_task_shader(nir_shader *shader,
 requires_payload_in_shared(shader, atomics, small_types);
 lower_task_state state = {
-.payload_shared_addr = ALIGN(shader->info.shared_size, 16),
+.payload_shared_addr = align(shader->info.shared_size, 16),
 .payload_in_shared = payload_in_shared,
 .payload_offset_in_bytes = options.payload_offset_in_bytes,
 };
@@ -151,7 +151,7 @@ struct etna_bo *etna_bo_cache_alloc(struct etna_bo_cache *cache, uint32_t *size,
 struct etna_bo *bo;
 struct etna_bo_bucket *bucket;
-*size = ALIGN(*size, 4096);
+*size = align(*size, 4096);
 bucket = get_bucket(cache, *size);
 /* see if we can be green and recycle: */
@@ -105,7 +105,7 @@ struct etna_cmd_stream *etna_cmd_stream_new(struct etna_pipe *pipe,
 }
 /* allocate even number of 32-bit words */
-size = ALIGN(size, 2);
+size = align(size, 2);
 stream->base.buffer = malloc(size * sizeof(uint32_t));
 if (!stream->base.buffer) {
@@ -257,7 +257,7 @@ cs_program_emit_regs(fd_cs &cs, struct kernel *kernel)
 if (v->pvtmem_size > 0) {
 uint32_t per_fiber_size = v->pvtmem_size;
 uint32_t per_sp_size =
-ALIGN(per_fiber_size * a6xx_backend->info->fibers_per_sp, 1 << 12);
+align(per_fiber_size * a6xx_backend->info->fibers_per_sp, 1 << 12);
 uint32_t total_size = per_sp_size * a6xx_backend->info->num_sp_cores;
 struct fd_bo *pvtmem = fd_bo_new(a6xx_backend->dev, total_size, 0, "pvtmem");
@@ -268,7 +268,7 @@ next_sect(struct state *state, int *sect_size)
 *sect_size = end - state->buf;
 /* copy the section to keep things nicely 32b aligned: */
-sect = malloc(ALIGN(*sect_size, 4));
+sect = malloc(align(*sect_size, 4));
 memcpy(sect, state->buf, *sect_size);
 state->sz -= *sect_size + 4;
@@ -584,8 +584,8 @@ dump_shaders_a3xx(struct state *state)
 if (!compact) {
 if (state->hdr->revision >= 7) {
-instrs += ALIGN(instrs_size, 8) - instrs_size;
-instrs_size = ALIGN(instrs_size, 8);
+instrs += align(instrs_size, 8) - instrs_size;
+instrs_size = align(instrs_size, 8);
 }
 instrs += 32;
 instrs_size -= 32;
@@ -347,9 +347,9 @@ decode_shader_descriptor_block(struct state *state,
 unsigned dwords = 2 * stats.instlen;
 if (gpu_id >= 400) {
-dwords = ALIGN(dwords, 16 * 2);
+dwords = align(dwords, 16 * 2);
 } else {
-dwords = ALIGN(dwords, 4 * 2);
+dwords = align(dwords, 4 * 2);
 }
 unsigned half_regs = state->half_regs;
@@ -205,7 +205,7 @@ fd_bo_heap_alloc(struct fd_bo_heap *heap, uint32_t size, uint32_t flags)
 */
 size = MAX2(size, SUBALLOC_ALIGNMENT);
-size = ALIGN(size, SUBALLOC_ALIGNMENT);
+size = align(size, SUBALLOC_ALIGNMENT);
 simple_mtx_lock(&heap->lock);
 /* Allocate larger buffers from the bottom, and smaller buffers from top
@@ -6026,7 +6026,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
 ir3_instr_move_before(unlock, end);
 }
-so->pvtmem_size = ALIGN(so->pvtmem_size, compiler->pvtmem_per_fiber_align);
+so->pvtmem_size = align(so->pvtmem_size, compiler->pvtmem_per_fiber_align);
 /* Note that max_bary counts inputs that are not bary.f'd for FS: */
 if (so->type == MESA_SHADER_FRAGMENT)
@@ -36,7 +36,7 @@ get_ubo_load_range(nir_shader *nir, nir_intrinsic_instr *instr,
 return false;
 r->start = ROUND_DOWN_TO(offset, alignment * 16);
-r->end = ALIGN(offset + size, alignment * 16);
+r->end = align(offset + size, alignment * 16);
 return true;
 }
@@ -1237,7 +1237,7 @@ compress_regs_left(struct ra_ctx *ctx, struct ra_file *file,
 }
 if (!(cur_reg->flags & IR3_REG_HALF))
-physreg = ALIGN(physreg, 2);
+physreg = align(physreg, 2);
 d("pushing reg %u physreg %u\n", cur_reg->name, physreg);
@@ -1330,7 +1330,7 @@ find_best_gap(struct ra_ctx *ctx, struct ra_file *file,
 BITSET_WORD *available =
 is_early_clobber(dst) ? file->available_to_evict : file->available;
-unsigned start = ALIGN(file->start, alignment);
+unsigned start = align(file->start, alignment);
 if (start + size > file_size)
 start = 0;
 unsigned candidate = start;
@@ -37,7 +37,7 @@ ir3_const_ensure_imm_size(struct ir3_shader_variant *v, unsigned size)
 /* Immediates are uploaded in units of vec4 so make sure our buffer is large
 * enough.
 */
-size = ALIGN(size, 4);
+size = align(size, 4);
 /* Pre-a7xx, the immediates that get lowered to const registers are
 * emitted as part of the const state so the total size of immediates
@@ -362,7 +362,7 @@ find_best_gap(struct ra_ctx *ctx, struct ir3_register *dst, unsigned size,
 if (size > file_size)
 return (physreg_t) ~0;
-unsigned start = ALIGN(ctx->start, alignment);
+unsigned start = align(ctx->start, alignment);
 if (start + size > file_size)
 start = 0;
 unsigned candidate = start;
@@ -395,7 +395,7 @@ find_best_spill_reg(struct ra_ctx *ctx, struct ir3_register *reg,
 unsigned file_size = reg_file_size(reg);
 unsigned min_cost = UINT_MAX;
-unsigned start = ALIGN(ctx->start, alignment);
+unsigned start = align(ctx->start, alignment);
 if (start + size > file_size)
 start = 0;
 physreg_t candidate = start;
@@ -602,7 +602,7 @@ tu_descriptor_set_create(struct tu_device *device,
 &layout->binding[layout->binding_count - 1];
 if (binding->type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
 layout_size = binding->offset +
-ALIGN(variable_count, 4 * A6XX_TEX_CONST_DWORDS);
+align(variable_count, 4 * A6XX_TEX_CONST_DWORDS);
 } else {
 uint32_t stride = binding->size;
 layout_size = binding->offset + variable_count * stride;
@@ -683,7 +683,7 @@ tu6_emit_link_map(struct tu_cs *cs,
 enum a6xx_state_block sb)
 {
 const struct ir3_const_state *const_state = ir3_const_state(consumer);
-uint32_t size = ALIGN(consumer->input_size, 4);
+uint32_t size = align(consumer->input_size, 4);
 if (size == 0)
 return;
@@ -2289,9 +2289,9 @@ tu_setup_pvtmem(struct tu_device *dev,
 tu_bo_finish(dev, pvtmem_bo->bo);
 pvtmem_bo->per_fiber_size =
-util_next_power_of_two(ALIGN(pvtmem_bytes, 512));
+util_next_power_of_two(align(pvtmem_bytes, 512));
 pvtmem_bo->per_sp_size =
-ALIGN(pvtmem_bo->per_fiber_size *
+align(pvtmem_bo->per_fiber_size *
 dev->physical_device->info->fibers_per_sp,
 1 << 12);
 uint32_t total_size =
@@ -2729,7 +2729,7 @@ tu_shader_create(struct tu_device *dev,
 * is allocated at the API level, and it's up to the user to ensure
 * that accesses are limited to those bounds.
 */
-const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
+const unsigned shared_size = align(nir->info.shared_size, chunk_size);
 NIR_PASS(_, nir, nir_zero_initialize_shared_memory, shared_size,
 chunk_size);
 }
@@ -55,7 +55,7 @@ tu_suballoc_bo_alloc(struct tu_suballoc_bo *suballoc_bo,
 {
 struct tu_bo *bo = suballoc->bo;
 if (bo) {
-uint32_t offset = ALIGN(suballoc->next_offset, alignment);
+uint32_t offset = align(suballoc->next_offset, alignment);
 if (offset + size <= bo->size) {
 suballoc_bo->bo = tu_bo_get_ref(bo);
 suballoc_bo->iova = bo->iova + offset;
@@ -6073,7 +6073,7 @@ void lp_build_nir_soa_func(struct gallivm_state *gallivm,
 bld.shader = shader;
-bld.scratch_size = ALIGN(shader->scratch_size, 8);
+bld.scratch_size = align(shader->scratch_size, 8);
 if (params->scratch_ptr)
 bld.scratch_ptr = params->scratch_ptr;
 else if (shader->scratch_size) {
@@ -872,7 +872,7 @@ submit_batch(struct crocus_batch *batch)
 .buffer_count = batch->exec_count,
 .batch_start_offset = 0,
 /* This must be QWord aligned. */
-.batch_len = ALIGN(batch->primary_batch_size, 8),
+.batch_len = align(batch->primary_batch_size, 8),
 .flags = I915_EXEC_RENDER |
 I915_EXEC_NO_RELOC |
 I915_EXEC_BATCH_FIRST |
@@ -56,11 +56,11 @@ stream_state(struct crocus_batch *batch,
 uint32_t *out_offset,
 struct crocus_bo **out_bo)
 {
-uint32_t offset = ALIGN(batch->state.used, alignment);
+uint32_t offset = align(batch->state.used, alignment);
 if (offset + size >= STATE_SZ && !batch->no_wrap) {
 crocus_batch_flush(batch);
-offset = ALIGN(batch->state.used, alignment);
+offset = align(batch->state.used, alignment);
 } else if (offset + size >= batch->state.bo->size) {
 const unsigned new_size =
 MIN2(batch->state.bo->size + batch->state.bo->size / 2,
@@ -62,7 +62,7 @@ crocus_init_identifier_bo(struct crocus_context *ice)
 return false;
 ice->workaround_bo->kflags |= EXEC_OBJECT_CAPTURE;
-ice->workaround_offset = ALIGN(
+ice->workaround_offset = align(
 intel_debug_write_identifiers(bo_map, 4096, "Crocus"), 32);
 crocus_bo_unmap(ice->workaround_bo);
@@ -179,7 +179,7 @@ crocus_alloc_item_data(struct crocus_context *ice, uint32_t size)
 uint32_t offset = ice->shaders.cache_next_offset;
 /* Programs are always 64-byte aligned, so set up the next one now */
-ice->shaders.cache_next_offset = ALIGN(offset + size, 64);
+ice->shaders.cache_next_offset = align(offset + size, 64);
 return offset;
 }
@@ -229,7 +229,7 @@ crocus_resource_configure_main(const struct crocus_screen *screen,
 devinfo->ver < 6) {
 /* align row pitch to 4 so we can keep using BLT engine */
 row_pitch_B = util_format_get_stride(templ->format, templ->width0);
-row_pitch_B = ALIGN(row_pitch_B, 4);
+row_pitch_B = align(row_pitch_B, 4);
 }
 const struct isl_surf_init_info init_info = {
@@ -1437,7 +1437,7 @@ crocus_map_tiled_memcpy(struct crocus_transfer *map)
 struct crocus_resource *res = (struct crocus_resource *) xfer->resource;
 struct isl_surf *surf = &res->surf;
-xfer->stride = ALIGN(surf->row_pitch_B, 16);
+xfer->stride = align(surf->row_pitch_B, 16);
 xfer->layer_stride = xfer->stride * box->height;
 unsigned x1, x2, y1, y2;
@@ -329,11 +329,11 @@ stream_state(struct crocus_batch *batch,
 unsigned alignment,
 uint32_t *out_offset)
 {
-uint32_t offset = ALIGN(batch->state.used, alignment);
+uint32_t offset = align(batch->state.used, alignment);
 if (offset + size >= STATE_SZ && !batch->no_wrap) {
 crocus_batch_flush(batch);
-offset = ALIGN(batch->state.used, alignment);
+offset = align(batch->state.used, alignment);
 } else if (offset + size >= batch->state.bo->size) {
 const unsigned new_size =
 MIN2(batch->state.bo->size + batch->state.bo->size / 2,
@@ -6591,7 +6591,7 @@ crocus_upload_dirty_render_state(struct crocus_context *ice,
 struct crocus_resource *res = (void *) tgt->base.buffer;
 uint32_t start = tgt->base.buffer_offset;
 #if GFX_VER < 8
-uint32_t end = ALIGN(start + tgt->base.buffer_size, 4);
+uint32_t end = align(start + tgt->base.buffer_size, 4);
 #endif
 crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
 sob.SOBufferIndex = i;
@@ -8143,7 +8143,7 @@ crocus_upload_compute_state(struct crocus_context *ice,
 vfe.URBEntryAllocationSize = GFX_VER == 8 ? 2 : 0;
 vfe.CURBEAllocationSize =
-ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
+align(cs_prog_data->push.per_thread.regs * dispatch.threads +
 cs_prog_data->push.cross_thread.regs, 2);
 }
 }
@@ -8159,15 +8159,15 @@ crocus_upload_compute_state(struct crocus_context *ice,
 elk_cs_push_const_total_size(cs_prog_data, dispatch.threads);
 uint32_t *curbe_data_map =
 stream_state(batch,
-ALIGN(push_const_size, 64), 64,
+align(push_const_size, 64), 64,
 &curbe_data_offset);
 assert(curbe_data_map);
-memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
+memset(curbe_data_map, 0x5a, align(push_const_size, 64));
 crocus_fill_cs_push_const_buffer(cs_prog_data, dispatch.threads,
 curbe_data_map);
 crocus_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
-curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
+curbe.CURBETotalDataLength = align(push_const_size, 64);
 curbe.CURBEDataStartAddress = curbe_data_offset;
 }
 }
@@ -1199,11 +1199,11 @@ fill_buffer_location(struct d3d12_context *ctx,
 buf_loc.PlacedFootprint.Footprint.Height = res->base.b.height0;
 buf_loc.PlacedFootprint.Footprint.Depth = res->base.b.depth0;
 } else {
-buf_loc.PlacedFootprint.Footprint.Width = ALIGN(trans->base.b.box.width,
+buf_loc.PlacedFootprint.Footprint.Width = align(trans->base.b.box.width,
 util_format_get_blockwidth(res->base.b.format));
-buf_loc.PlacedFootprint.Footprint.Height = ALIGN(trans->base.b.box.height,
+buf_loc.PlacedFootprint.Footprint.Height = align(trans->base.b.box.height,
 util_format_get_blockheight(res->base.b.format));
-buf_loc.PlacedFootprint.Footprint.Depth = ALIGN(depth,
+buf_loc.PlacedFootprint.Footprint.Depth = align(depth,
 util_format_get_blockdepth(res->base.b.format));
 }
@@ -479,8 +479,8 @@ d3d12_video_bitstream_builder_hevc::build_sps(const HevcVideoParameterSet& paren
 viewport.Width = crop_window_upper_layer.front /* passes width */ - ((crop_window_upper_layer.left + crop_window_upper_layer.right) * SubWidthC);
 viewport.Height = crop_window_upper_layer.back /* passes height */- ((crop_window_upper_layer.top + crop_window_upper_layer.bottom) * SubHeightC);
-m_latest_sps.pic_width_in_luma_samples = ALIGN(encodeResolution.Width, picDimensionMultipleRequirement);
-m_latest_sps.pic_height_in_luma_samples = ALIGN(encodeResolution.Height, picDimensionMultipleRequirement);
+m_latest_sps.pic_width_in_luma_samples = align(encodeResolution.Width, picDimensionMultipleRequirement);
+m_latest_sps.pic_height_in_luma_samples = align(encodeResolution.Height, picDimensionMultipleRequirement);
 m_latest_sps.conf_win_right_offset = (m_latest_sps.pic_width_in_luma_samples - viewport.Width) / SubWidthC;
 m_latest_sps.conf_win_bottom_offset = (m_latest_sps.pic_height_in_luma_samples - viewport.Height) / SubHeightC;
@@ -90,7 +90,7 @@ emit_strides(
 if (tensor->layout == ETHOSU_LAYOUT_NHCWB16) {
 tensor_x = 16 * elem_size;
 tensor_c = tensor_x * tensor->shape.width;
-tensor_y = elem_size * tensor->shape.width * ALIGN(tensor->shape.depth, 16);
+tensor_y = elem_size * tensor->shape.width * align(tensor->shape.depth, 16);
 } else {
 tensor_c = elem_size;
 tensor_x = tensor->shape.depth * tensor_c;
@@ -16,7 +16,7 @@ fill_scale_and_biases(struct ethosu_subgraph *subgraph, struct ethosu_operation
 PIPE_MAP_READ, &transfer_in);
 unsigned idx = 0;
-*scales_size = ALIGN(operation->ofm.shape.depth * 10, 16);
+*scales_size = align(operation->ofm.shape.depth * 10, 16);
 *scales = malloc(*scales_size);
 memset(*scales, 0, *scales_size);
@@ -228,7 +228,7 @@ ethosu_lower_concatenation(struct ethosu_subgraph *subgraph,
 if (tensor->layout == ETHOSU_LAYOUT_NHWC)
 operation->ofm.tiles.addresses[0] += poperation->input_tensors[i]->dims[3];
 else if (tensor->layout == ETHOSU_LAYOUT_NHCWB16)
-operation->ofm.tiles.addresses[0] += poperation->input_tensors[i]->dims[2] * ALIGN(poperation->input_tensors[i]->dims[3], 16);
+operation->ofm.tiles.addresses[0] += poperation->input_tensors[i]->dims[2] * align(poperation->input_tensors[i]->dims[3], 16);
 else
 assert(0 && "Unsupported layout");
 }
@@ -69,7 +69,7 @@ ethosu_allocate_feature_map(struct ethosu_subgraph *subgraph, struct ethosu_feat
 if (tensor->layout == ETHOSU_LAYOUT_NHWC) {
 size = tensor->shape.width * tensor->shape.height * tensor->shape.depth;
 } else if (tensor->layout == ETHOSU_LAYOUT_NHCWB16) {
-size = tensor->shape.width * tensor->shape.height * ALIGN(tensor->shape.depth, 16);
+size = tensor->shape.width * tensor->shape.height * align(tensor->shape.depth, 16);
 } else {
 assert(0 && "Unsupported layout");
 size = 0; // This should never happen
@@ -18,11 +18,11 @@ _get_ifm_blocksize(struct ethosu_operation *operation, struct ethosu_block ofm_b
 // IFM block height
 int h = required_input_size(ofm_block.height, operation->kernel.stride_y, MIN2(operation->kernel.height, SUB_KERNEL_MAX.height));
-h = ALIGN(h, OFM_UBLOCK.height);
+h = align(h, OFM_UBLOCK.height);
 // IFM block width
 int w = required_input_size(ofm_block.width, operation->kernel.stride_x, MIN2(operation->kernel.width, SUB_KERNEL_MAX.width));
-w = ALIGN(w, OFM_UBLOCK.width);
+w = align(w, OFM_UBLOCK.width);
 ifm_block.height = h;
 ifm_block.width = w;
@@ -34,8 +34,8 @@ _get_ifm_blocksize(struct ethosu_operation *operation, struct ethosu_block ofm_b
 static bool
 try_block_config(struct ethosu_operation *operation, struct ethosu_block ofm_block, struct ethosu_block ifm_block, struct ethosu_shram_layout *layout)
 {
-int ifm_bytes = ifm_block.width * ifm_block.height * ALIGN(ifm_block.depth, 8);
-int ifm_banks = ALIGN(DIV_ROUND_UP(ifm_bytes, BANK_SIZE_BYTES) * 2, IFM_GRANULE);
+int ifm_bytes = ifm_block.width * ifm_block.height * align(ifm_block.depth, 8);
+int ifm_banks = align(DIV_ROUND_UP(ifm_bytes, BANK_SIZE_BYTES) * 2, IFM_GRANULE);
 int lut_bytes = operation->type == ETHOSU_OPERATION_TYPE_ELTWISE ? operation->eltwise.lut_bytes : 0;
 int lut_banks = MAX2(DIV_ROUND_UP(lut_bytes, 1024), SHRAM_RESERVED_END_BANKS);
 int lut_start = SHRAM_TOTAL_BANKS - lut_banks;
@@ -44,8 +44,8 @@ try_block_config(struct ethosu_operation *operation, struct ethosu_block ofm_blo
 int acc_start = lut_start;
 if (operation->type != ETHOSU_OPERATION_TYPE_ELTWISE) {
-int acc_bytes = (ofm_block.width * ofm_block.height * ALIGN(ofm_block.depth, 8) * 32) / 8;
-int acc_banks = ALIGN(DIV_ROUND_UP(acc_bytes, BANK_SIZE_BYTES) * 2, ACC_GRANULE);
+int acc_bytes = (ofm_block.width * ofm_block.height * align(ofm_block.depth, 8) * 32) / 8;
+int acc_banks = align(DIV_ROUND_UP(acc_bytes, BANK_SIZE_BYTES) * 2, ACC_GRANULE);
 acc_start -= acc_banks;
 } else {
 int ifm2_banks = ifm_banks; /* TODO: Fix for scalar eltwise */
@@ -89,12 +89,12 @@ find_block_config(struct ethosu_operation *operation)
 unsigned depth = MAX2(OFM_UBLOCK.depth, MIN2(search_space.depth, ARCH_SPLIT_DEPTH));
 if (depth < operation->ofm.shape.depth) {
-depth = ALIGN(depth, ARCH_SPLIT_DEPTH);
+depth = align(depth, ARCH_SPLIT_DEPTH);
 }
-search_space.width = ALIGN(search_space.width, OFM_UBLOCK.width);
-search_space.height = ALIGN(search_space.height, OFM_UBLOCK.height);
-search_space.depth = ALIGN(search_space.depth, OFM_UBLOCK.depth);
+search_space.width = align(search_space.width, OFM_UBLOCK.width);
+search_space.height = align(search_space.height, OFM_UBLOCK.height);
+search_space.depth = align(search_space.depth, OFM_UBLOCK.depth);
 while (depth <= search_space.depth) {
 bool wont_fit[search_space.height + 1][search_space.width + 1];
@@ -110,7 +110,7 @@ find_block_config(struct ethosu_operation *operation)
 struct ethosu_block ifm_block = _get_ifm_blocksize(operation, ofm_block);
 if (!is_equal_depth)
-ifm_block.depth = ALIGN(MIN2(operation->ifm.shape.depth, operation->conv.part_kernel_first ? 16 : 32), IFM_UBLOCK.depth);
+ifm_block.depth = align(MIN2(operation->ifm.shape.depth, operation->conv.part_kernel_first ? 16 : 32), IFM_UBLOCK.depth);
 // Try to fit the blocks in SHRAM
 struct ethosu_shram_layout layout = {0};
@@ -179,7 +179,7 @@ find_block_config(struct ethosu_operation *operation)
 depth += OFM_UBLOCK.depth;
 if (depth < operation->ofm.shape.depth) {
-depth = ALIGN(depth, ARCH_SPLIT_DEPTH);
+depth = align(depth, ARCH_SPLIT_DEPTH);
 }
 }
@@ -1167,9 +1167,9 @@ create_nn_config(struct etna_ml_subgraph *subgraph, const struct etna_operation
 unsigned in_image_tile_x_size = map->out_image_tile_x_size + weight_width - 1;
 unsigned in_image_tile_y_size = map->out_image_tile_y_size + weight_width - 1;
 image_cache_size = in_image_tile_x_size * in_image_tile_y_size;
-image_cache_size = ALIGN(image_cache_size, 16);
+image_cache_size = align(image_cache_size, 16);
 image_cache_size *= input_channels;
-image_cache_size = ALIGN(image_cache_size, 128);
+image_cache_size = align(image_cache_size, 128);
 }
 ML_DBG("coefficients_size 0x%x (%d) image_size 0x%x (%d)\n", coef_cache_size, coef_cache_size, image_cache_size, image_cache_size);
@@ -1182,7 +1182,7 @@ create_nn_config(struct etna_ml_subgraph *subgraph, const struct etna_operation
 map->kernel_pattern_msb = 0x0;
 map->kernel_pattern_low = 0x0;
 map->kernel_pattern_high = 0x0;
-map->kernel_cache_end_address = MAX2(MIN2(ALIGN(map->kernel_cache_start_address + coef_cache_size, 128), oc_sram_size), 0xa00);
+map->kernel_cache_end_address = MAX2(MIN2(align(map->kernel_cache_start_address + coef_cache_size, 128), oc_sram_size), 0xa00);
 } else {
 /* Doesn't fit in the 512KB we have of on-chip SRAM */
 map->kernel_caching_mode = SRAM_CACHE_MODE_PARTIAL_CACHE;
@@ -441,7 +441,7 @@ calculate_weight_bo_size(struct etna_ml_subgraph *subgraph, const struct etna_op
 struct pipe_context *context = subgraph->base.context;
 struct etna_context *ctx = etna_context(context);
 unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
-unsigned header_size = ALIGN(nn_core_count * 4, 64);
+unsigned header_size = align(nn_core_count * 4, 64);
 unsigned input_channels = operation->addition ? 1 : operation->input_channels;
 unsigned output_channels = operation->addition ? 1 : operation->output_channels;
 unsigned cores_used = MIN2(output_channels, nn_core_count);
@@ -453,7 +453,7 @@ calculate_weight_bo_size(struct etna_ml_subgraph *subgraph, const struct etna_op
 weights_size = operation->weight_width * operation->weight_height * input_channels;
 core_size = 1 + 2 + (weights_size + 4 + 4) * kernels_per_core;
-core_size_aligned = ALIGN(core_size, 64);
+core_size_aligned = align(core_size, 64);
 compressed_size_aligned = header_size + core_size_aligned * cores_used;
 return compressed_size_aligned;
@@ -466,7 +466,7 @@ calculate_zrl_bits(struct etna_ml_subgraph *subgraph, const struct etna_operatio
 struct etna_context *ctx = etna_context(context);
 unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
 unsigned max_zrl_bits = etna_ml_get_core_info(ctx)->nn_zrl_bits;
-unsigned header_size = ALIGN(nn_core_count * 4, 64);
+unsigned header_size = align(nn_core_count * 4, 64);
 unsigned input_channels = operation->addition ? 1 : operation->input_channels;
 unsigned output_channels = operation->addition ? 1 : operation->output_channels;
 unsigned cores_used = MIN2(output_channels, nn_core_count);
@@ -518,7 +518,7 @@ etna_ml_create_coeffs_v7(struct etna_ml_subgraph *subgraph, const struct etna_op
 struct pipe_context *context = subgraph->base.context;
 struct etna_context *ctx = etna_context(context);
 unsigned nn_core_count = etna_ml_get_core_info(ctx)->nn_core_count;
-unsigned header_size = ALIGN(nn_core_count * 4, 64);
+unsigned header_size = align(nn_core_count * 4, 64);
 unsigned input_channels = operation->addition ? 1 : operation->input_channels;
 unsigned output_channels = operation->addition ? 1 : operation->output_channels;
 unsigned cores_used = MIN2(output_channels, nn_core_count);
@@ -548,7 +548,7 @@ etna_ml_create_coeffs_v7(struct etna_ml_subgraph *subgraph, const struct etna_op
 else
 actual_size = write_core_sequential(subgraph, map, core, operation, zrl_bits);
-actual_size = ALIGN(actual_size, 64);
+actual_size = align(actual_size, 64);
 max_core_size = MAX2(actual_size, max_core_size);
 header[core] = actual_size;
@@ -685,7 +685,7 @@ create_bo(struct etna_ml_subgraph *subgraph, const struct etna_operation *operat
 input_channels = 2 * output_channels;
 unsigned header_size = 64;
-unsigned body_size = ALIGN(DIV_ROUND_UP(output_channels, cores_used) * (input_channels * operation->weight_height * operation->weight_width + 4 + 4), 64) * 2;
+unsigned body_size = align(DIV_ROUND_UP(output_channels, cores_used) * (input_channels * operation->weight_height * operation->weight_width + 4 + 4), 64) * 2;
 unsigned tail_size = 64;
 max_size = header_size + cores_used * body_size + tail_size;
@@ -803,7 +803,7 @@ etna_ml_create_coeffs_v8(struct etna_ml_subgraph *subgraph, const struct etna_op
 header->symbol_map = pack_symbol_map(symbol_map);
 header->version = 1;
-map += ALIGN(sizeof(*header), 64) / 4;
+map += align(sizeof(*header), 64) / 4;
 encoder_init(&encoder, symbol_map, map);
@@ -553,7 +553,7 @@ fd6_emit_link_map(struct fd_context *ctx, fd_cs &cs,
 if (CHIP == A7XX && producer->compiler->load_shader_consts_via_preamble) {
 const struct ir3_const_state *const_state = ir3_const_state(consumer);
 int base = const_state->primitive_map_ubo.idx;
-uint32_t size = ALIGN(consumer->input_size, 4);
+uint32_t size = align(consumer->input_size, 4);
 fd6_upload_emit_driver_ubo(ctx, cs, consumer, base, size, producer->output_loc);
 } else {
@@ -1366,8 +1366,8 @@ set_blit_scissor(struct fd_batch *batch, fd_cs &cs)
 blit_scissor.minx = 0;
 blit_scissor.miny = 0;
-blit_scissor.maxx = ALIGN(pfb->width, 16);
-blit_scissor.maxy = ALIGN(pfb->height, 4);
+blit_scissor.maxx = align(pfb->width, 16);
+blit_scissor.maxy = align(pfb->height, 4);
 fd_pkt4(cs, 2)
 .add(A6XX_RB_RESOLVE_CNTL_1(.x = blit_scissor.minx, .y = blit_scissor.miny))
@@ -647,7 +647,7 @@ ir3_get_private_mem(struct fd_context *ctx, const struct ir3_shader_variant *so)
 if (ctx->pvtmem[so->pvtmem_per_wave].bo)
 fd_bo_del(ctx->pvtmem[so->pvtmem_per_wave].bo);
-uint32_t per_sp_size = ALIGN(per_fiber_size * fibers_per_sp, 1 << 12);
+uint32_t per_sp_size = align(per_fiber_size * fibers_per_sp, 1 << 12);
 uint32_t total_size = per_sp_size * num_sp_cores;
 ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size = per_fiber_size;
@@ -372,7 +372,7 @@ i915_batch_submit(struct iris_batch *batch)
 .buffer_count = validation_count,
 .batch_start_offset = 0,
 /* This must be QWord aligned. */
-.batch_len = ALIGN(batch->primary_batch_size, 8),
+.batch_len = align(batch->primary_batch_size, 8),
 .flags = batch->i915.exec_flags |
 I915_EXEC_NO_RELOC |
 I915_EXEC_BATCH_FIRST |
@@ -2305,7 +2305,7 @@ intel_aux_map_buffer_alloc(void *driver_ctx, uint32_t size)
 struct iris_bufmgr *bufmgr = (struct iris_bufmgr *)driver_ctx;
 unsigned int page_size = getpagesize();
-size = MAX2(ALIGN(size, page_size), page_size);
+size = MAX2(align(size, page_size), page_size);
 struct iris_bo *bo = alloc_fresh_bo(bufmgr, size, BO_ALLOC_CAPTURE);
 if (!bo) {
@@ -398,7 +398,7 @@ emit_indirect_generate_draw(struct iris_batch *batch,
 float *vertices =
 upload_state(batch, ice->state.dynamic_uploader,
 &ice->draw.generation.vertices,
-ALIGN(9 * sizeof(float), 8), 8);
+align(9 * sizeof(float), 8), 8);
 vertices[0] = x1; vertices[1] = y1; vertices[2] = z; /* v0 */
 vertices[3] = x0; vertices[4] = y1; vertices[5] = z; /* v1 */
@@ -2254,7 +2254,7 @@ iris_map_tiled_memcpy(struct iris_transfer *map)
 struct iris_resource *res = (struct iris_resource *) xfer->resource;
 struct isl_surf *surf = &res->surf;
-xfer->stride = ALIGN(surf->row_pitch_B, 16);
+xfer->stride = align(surf->row_pitch_B, 16);
 xfer->layer_stride = xfer->stride * box->height;
 unsigned x1, x2, y1, y2;
@@ -617,7 +617,7 @@ iris_init_identifier_bo(struct iris_screen *screen)
 screen->workaround_address = (struct iris_address) {
 .bo = screen->workaround_bo,
-.offset = ALIGN(
+.offset = align(
 intel_debug_write_identifiers(bo_map, 4096, "Iris"), 32),
 };
@@ -9385,7 +9385,7 @@ iris_upload_gpgpu_walker(struct iris_context *ice,
 vfe.URBEntryAllocationSize = 2;
 vfe.CURBEAllocationSize =
-ALIGN(cs_data->push.per_thread.regs * dispatch.threads +
+align(cs_data->push.per_thread.regs * dispatch.threads +
 cs_data->push.cross_thread.regs, 2);
 }
 }
@@ -9403,15 +9403,15 @@ iris_upload_gpgpu_walker(struct iris_context *ice,
 uint32_t *curbe_data_map =
 stream_state(batch, ice->state.dynamic_uploader,
 &ice->state.last_res.cs_thread_ids,
-ALIGN(push_const_size, 64), 64,
+align(push_const_size, 64), 64,
 &curbe_data_offset);
 assert(curbe_data_map);
-memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
+memset(curbe_data_map, 0x5a, align(push_const_size, 64));
 iris_fill_cs_push_const_buffer(screen, shader, dispatch.threads,
 curbe_data_map);
 iris_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
-curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
+curbe.CURBETotalDataLength = align(push_const_size, 64);
 curbe.CURBEDataStartAddress = curbe_data_offset;
 }
 }
@@ -63,7 +63,7 @@ lima_ioctl_gem_create(int fd, unsigned long request, void *arg)
 struct shim_fd *shim_fd = drm_shim_fd_lookup(fd);
 struct shim_bo *bo = calloc(1, sizeof(*bo));
-size_t size = ALIGN(create->size, 4096);
+size_t size = align(create->size, 4096);
 drm_shim_bo_init(bo, size);
@@ -1343,7 +1343,7 @@ out:
 info_out->bin.maxGPR = prog->maxGPR;
 info_out->bin.code = prog->code;
 info_out->bin.codeSize = prog->binSize;
-info_out->bin.tlsSpace = ALIGN(prog->tlsSize, 0x10);
+info_out->bin.tlsSpace = align(prog->tlsSize, 0x10);
 delete prog;
 nv50_ir::Target::destroy(targ);
@@ -389,7 +389,7 @@ nv50_compute_validate_surfaces(struct nv50_context *nv50)
 PUSH_DATAh(push, res->address + buffer->buffer_offset);
 PUSH_DATA (push, res->address + buffer->buffer_offset);
 PUSH_DATA (push, 0); /* pitch? */
-PUSH_DATA (push, ALIGN(buffer->buffer_size, 256) - 1);
+PUSH_DATA (push, align(buffer->buffer_size, 256) - 1);
 PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
 BCTX_REFN(nv50->bufctx_cp, CP_BUF, res, RDWR);
 util_range_add(&res->base, &res->valid_buffer_range,
@@ -420,7 +420,7 @@ nv50_compute_validate_surfaces(struct nv50_context *nv50)
 PUSH_DATAh(push, address);
 PUSH_DATA (push, address);
 PUSH_DATA (push, 0); /* pitch? */
-PUSH_DATA (push, ALIGN(view->u.buf.size, 0x100) - 1);
+PUSH_DATA (push, align(view->u.buf.size, 0x100) - 1);
 PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
 } else {
 struct nv50_miptree *mt = nv50_miptree(view->resource);
@@ -391,7 +391,7 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir,
 shader->info.nr_param_exports || shader->info.nr_prim_param_exports,
 &out_needs_scratch_ring,
 shader->wave_size,
-ALIGN(max_workgroup_size, shader->wave_size),
+align(max_workgroup_size, shader->wave_size),
 false,
 false);
 shader->info.uses_mesh_scratch_ring = out_needs_scratch_ring;
@@ -923,7 +923,7 @@ static void run_late_optimization_and_lowering_passes(struct si_nir_shader_ctx *
 if (si_should_clear_lds(sel->screen, nir)) {
 const unsigned chunk_size = 16; /* max single store size */
-const unsigned shared_size = ALIGN(nir->info.shared_size, chunk_size);
+const unsigned shared_size = align(nir->info.shared_size, chunk_size);
 NIR_PASS(_, nir, nir_clear_shared_memory, shared_size, chunk_size);
 }
@@ -472,7 +472,7 @@ void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shad
 "HSPatchOuts: %u ESOutputs: %u GSOutputs: %u VSOutputs: %u PSOutputs: %u "
 "InlineUniforms: %u DivergentLoop: %u (%s, W%u)",
 conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
-ALIGN(conf->lds_size, ac_shader_get_lds_alloc_granularity(screen->info.gfx_level)),
+align(conf->lds_size, ac_shader_get_lds_alloc_granularity(screen->info.gfx_level)),
 conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
 conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
 num_ls_outputs, num_hs_outputs,
@@ -769,7 +769,7 @@ si_sqtt_add_code_object(struct si_context *sctx,
 record->shader_data[i].hw_stage = hw_stage;
 record->shader_data[i].is_combined = false;
 record->shader_data[i].scratch_memory_size = shader->config.scratch_bytes_per_wave;
-record->shader_data[i].lds_size = ALIGN(shader->config.lds_size, ac_shader_get_lds_alloc_granularity(sctx->gfx_level));
+record->shader_data[i].lds_size = align(shader->config.lds_size, ac_shader_get_lds_alloc_granularity(sctx->gfx_level));
 record->shader_data[i].wavefront_size = shader->wave_size;
 record->shader_stages_mask |= 1 << i;
@@ -31,12 +31,12 @@ rkt_fill_weights(struct rkt_ml_subgraph *subgraph,
 input_channels = MAX2(input_channels, FEATURE_ATOMIC_SIZE);
-output_channels = ALIGN(output_channels, 2);
+output_channels = align(output_channels, 2);
 if (rkt_is_depthwise(poperation))
 output_channels = 1;
 weights_size = weights_width * weights_height * output_channels *
-ALIGN(input_channels, WEIGHT_ATOMIC_SIZE) * 2;
+align(input_channels, WEIGHT_ATOMIC_SIZE) * 2;
 rsc =
 pipe_buffer_create(pcontext->screen, 0, PIPE_USAGE_DEFAULT, weights_size);
@@ -62,7 +62,7 @@ rkt_fill_weights(struct rkt_ml_subgraph *subgraph,
 unsigned oc = oc1 * WEIGHT_ATOMIC_SIZE + oc2;
 unsigned ic = ic1 * input_channel_groups + ic2;
 if (output_channels_real > 2 &&
-oc >= ALIGN(output_channels_real, 2))
+oc >= align(output_channels_real, 2))
 continue;
 if (oc >= output_channels_real)
@@ -105,7 +105,7 @@ compile_operation(struct rkt_ml_subgraph *subgraph,
 unsigned size =
 util_dynarray_num_elements(&regcfgs[i], uint64_t) * sizeof(uint64_t);
-regcfg_total_size += ALIGN(size, 64);
+regcfg_total_size += align(size, 64);
 }
 operation->regcmd = pipe_buffer_create(pcontext->screen, 0,
@@ -129,13 +129,13 @@ compile_operation(struct rkt_ml_subgraph *subgraph,
 util_dynarray_element(&regcfgs[i], uint64_t, reg_count - 3);
 uint64_t addr = rkt_resource(operation->regcmd)->phys_addr +
-regcmd_offset + ALIGN(size * sizeof(uint64_t), 64);
+regcmd_offset + align(size * sizeof(uint64_t), 64);
 *next_address_reg |= addr << 16;
 unsigned regs_to_fetch =
 util_dynarray_num_elements(&regcfgs[i + 1], uint64_t);
 regs_to_fetch -= 4;
-regs_to_fetch = ALIGN(regs_to_fetch / 2, 2);
+regs_to_fetch = align(regs_to_fetch / 2, 2);
 *reg_count_reg |= regs_to_fetch << 16;
 }
@@ -151,7 +151,7 @@ compile_operation(struct rkt_ml_subgraph *subgraph,
 rkt_dump_buffer(regcmd, "regcmd", 0, i, regcmd_offset,
 (size + 4) * sizeof(uint64_t));
-regcmd_offset += ALIGN(size * sizeof(uint64_t), 64);
+regcmd_offset += align(size * sizeof(uint64_t), 64);
 }
 pipe_buffer_unmap(pcontext, transfer);
@@ -101,7 +101,7 @@ fill_task(struct rkt_ml_subgraph *subgraph,
 task->input_height = operation->input_height;
 task->input_channels =
-ALIGN(MAX2(operation->input_channels, FEATURE_ATOMIC_SIZE),
+align(MAX2(operation->input_channels, FEATURE_ATOMIC_SIZE),
 FEATURE_ATOMIC_SIZE);
 task->input_channels_real = operation->input_channels;
 task->input_zero_point = operation->input_zero_point;
@@ -111,11 +111,11 @@ fill_task(struct rkt_ml_subgraph *subgraph,
 task->output_height = operation->output_height;
 task->output_channels_real = operation->output_channels;
-task->output_channels = ALIGN(MAX2(operation->output_channels, 32), 32);
+task->output_channels = align(MAX2(operation->output_channels, 32), 32);
 if (operation->depthwise) {
 if (task->output_channels_real <= 32)
 task->output_channels *= 2;
-task->output_channels = ALIGN(task->output_channels, 64);
+task->output_channels = align(task->output_channels, 64);
 }
 task->output_zero_point = operation->output_zero_point;
@@ -170,7 +170,7 @@ fill_task(struct rkt_ml_subgraph *subgraph,
 if (operation->depthwise)
 task->weights_kernels = 1;
 else
-task->weights_kernels = ALIGN(operation->output_channels, 2);
+task->weights_kernels = align(operation->output_channels, 2);
 task->surfaces_per_row = task->output_width * task->output_height * 2;
 if (operation->depthwise)
@@ -201,8 +201,8 @@ virgl_drm_winsys_resource_create_blob(struct virgl_winsys *qws,
 return NULL;
 /* Make sure blob is page aligned. */
-width = ALIGN(width, getpagesize());
-size = ALIGN(size, getpagesize());
+width = align(width, getpagesize());
+size = align(size, getpagesize());
 blob_id = p_atomic_inc_return(&qdws->blob_id);
 cmd[0] = VIRGL_CMD0(VIRGL_CCMD_PIPE_RESOURCE_CREATE, 0, VIRGL_PIPE_RES_CREATE_SIZE);
@@ -311,8 +311,8 @@ virgl_vtest_winsys_resource_create(struct virgl_winsys *vws,
 if ((flags & (VIRGL_RESOURCE_FLAG_MAP_PERSISTENT |
 VIRGL_RESOURCE_FLAG_MAP_COHERENT))) {
-width = ALIGN(width, getpagesize());
-size = ALIGN(size, getpagesize());
+width = align(width, getpagesize());
+size = align(size, getpagesize());
 new_handle = virgl_vtest_winsys_resource_create_blob(vws, target, format, bind,
 width, height, depth,
 array_size, last_level, nr_samples,
@@ -89,7 +89,7 @@ static void pvr_image_init_physical_extent(struct pvr_image *image)
 if (image->vk.usage & (VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT |
 VK_IMAGE_USAGE_TRANSFER_DST_BIT)) {
 image->physical_extent.width =
-ALIGN(image->physical_extent.width,
+align(image->physical_extent.width,
 ROGUE_PBESTATE_REG_WORD0_LINESTRIDE_UNIT_SIZE);
 }
 }
@@ -110,11 +110,11 @@ static void pvr_image_setup_mip_levels(struct pvr_image *image)
 for (uint32_t i = 0; i < image->vk.mip_levels; i++) {
 struct pvr_mip_level *mip_level = &image->mip_levels[i];
-mip_level->pitch = cpp * ALIGN(extent.width, extent_alignment);
-mip_level->height_pitch = ALIGN(extent.height, extent_alignment);
+mip_level->pitch = cpp * align(extent.width, extent_alignment);
+mip_level->height_pitch = align(extent.height, extent_alignment);
 mip_level->size = image->vk.samples * mip_level->pitch *
 mip_level->height_pitch *
-ALIGN(extent.depth, extent_alignment);
+align(extent.depth, extent_alignment);
 mip_level->offset = image->layer_size;
 image->layer_size += mip_level->size;
@@ -129,11 +129,11 @@ static void pvr_image_setup_mip_levels(struct pvr_image *image)
 * were present so we need to account for that in the `layer_size`.
 */
 while (extent.height != 1 || extent.width != 1 || extent.depth != 1) {
-const uint32_t height_pitch = ALIGN(extent.height, extent_alignment);
-const uint32_t pitch = cpp * ALIGN(extent.width, extent_alignment);
+const uint32_t height_pitch = align(extent.height, extent_alignment);
+const uint32_t pitch = cpp * align(extent.width, extent_alignment);
 image->layer_size += image->vk.samples * pitch * height_pitch *
-ALIGN(extent.depth, extent_alignment);
+align(extent.depth, extent_alignment);
 extent.height = u_minify(extent.height, 1);
 extent.width = u_minify(extent.width, 1);
@@ -61,7 +61,7 @@ VkResult pvr_init_robustness_buffer(struct pvr_device *device)
 VkResult result;
 #define ROBUSTNESS_BUFFER_OFFSET_ALIGN16(cur_offset, add) \
-((uint16_t)ALIGN((cur_offset + (uint16_t)(add)), 16))
+((uint16_t)align((cur_offset + (uint16_t)(add)), 16))
 robustness_buffer_offsets[PVR_ROBUSTNESS_BUFFER_FORMAT_UINT64] = offset;
 offset = ROBUSTNESS_BUFFER_OFFSET_ALIGN16(offset, sizeof(uint64_t) * 4);
@@ -314,8 +314,8 @@ blorp_hiz_op(struct blorp_batch *batch, struct blorp_surf *surf,
 params.depth.view.base_level);
 params.y1 = u_minify(params.depth.surf.logical_level0_px.height,
 params.depth.view.base_level);
-params.x1 = ALIGN(params.x1, 8);
-params.y1 = ALIGN(params.y1, 4);
+params.x1 = align(params.x1, 8);
+params.y1 = align(params.y1, 4);
 if (params.depth.view.base_level == 0) {
 /* TODO: What about MSAA? */
@@ -1731,9 +1731,9 @@ blorp_surf_retile_w_to_y(const struct isl_device *isl_dev,
 const unsigned x_align = 8, y_align = info->surf.samples != 0 ? 8 : 4;
 info->surf.tiling = ISL_TILING_Y0;
 info->surf.logical_level0_px.width =
-ALIGN(info->surf.logical_level0_px.width, x_align) * 2;
+align(info->surf.logical_level0_px.width, x_align) * 2;
 info->surf.logical_level0_px.height =
-ALIGN(info->surf.logical_level0_px.height, y_align) / 2;
+align(info->surf.logical_level0_px.height, y_align) / 2;
 info->tile_x_sa *= 2;
 info->tile_y_sa /= 2;
 }
@ -1984,8 +1984,8 @@ try_blorp_blit(struct blorp_batch *batch,
|
|||
isl_get_interleaved_msaa_px_size_sa(params->dst.surf.samples);
|
||||
params->x0 = ROUND_DOWN_TO(params->x0, 2) * px_size_sa.width;
|
||||
params->y0 = ROUND_DOWN_TO(params->y0, 2) * px_size_sa.height;
|
||||
params->x1 = ALIGN(params->x1, 2) * px_size_sa.width;
|
||||
params->y1 = ALIGN(params->y1, 2) * px_size_sa.height;
|
||||
params->x1 = align(params->x1, 2) * px_size_sa.width;
|
||||
params->y1 = align(params->y1, 2) * px_size_sa.height;
|
||||
|
||||
blorp_surf_fake_interleaved_msaa(batch->blorp->isl_dev, ¶ms->dst);
|
||||
|
||||
|
|
@ -2045,8 +2045,8 @@ try_blorp_blit(struct blorp_batch *batch,
|
|||
const unsigned y_align = params->dst.surf.samples != 0 ? 8 : 4;
|
||||
params->x0 = ROUND_DOWN_TO(params->x0, x_align) * 2;
|
||||
params->y0 = ROUND_DOWN_TO(params->y0, y_align) / 2;
|
||||
params->x1 = ALIGN(params->x1, x_align) * 2;
|
||||
params->y1 = ALIGN(params->y1, y_align) / 2;
|
||||
params->x1 = align(params->x1, x_align) * 2;
|
||||
params->y1 = align(params->y1, y_align) / 2;
|
||||
|
||||
/* Retile the surface to Y-tiled */
|
||||
blorp_surf_retile_w_to_y(batch->blorp->isl_dev, ¶ms->dst);
|
||||
|
|
|
|||
|
|
@ -387,8 +387,8 @@ get_fast_clear_rect(const struct isl_device *dev,
|
|||
|
||||
*x0 = ROUND_DOWN_TO(*x0, x_align) / x_scaledown;
|
||||
*y0 = ROUND_DOWN_TO(*y0, y_align) / y_scaledown;
|
||||
*x1 = ALIGN(*x1, x_align) / x_scaledown;
|
||||
*y1 = ALIGN(*y1, y_align) / y_scaledown;
|
||||
*x1 = align(*x1, x_align) / x_scaledown;
|
||||
*y1 = align(*y1, y_align) / y_scaledown;
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
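These blit and fast-clear paths grow the rectangle outward, rounding x0/y0 down and x1/y1 up to the hardware alignment, before scaling it into the auxiliary surface's coordinate space. A standalone sketch of that pattern; the alignments and scaledown factors are example values, not taken from any particular platform:

#include <stdio.h>

static unsigned round_down_to(unsigned v, unsigned a) { return v - (v % a); }
static unsigned align_up(unsigned v, unsigned a) { return (v + a - 1) / a * a; }

int
main(void)
{
   /* Clear rectangle in pixels. */
   unsigned x0 = 13, y0 = 7, x1 = 121, y1 = 66;

   /* Example hardware alignment and scaledown for the aux surface. */
   const unsigned x_align = 16, y_align = 8;
   const unsigned x_scaledown = x_align / 2, y_scaledown = y_align / 2;

   x0 = round_down_to(x0, x_align) / x_scaledown; /* start rounded outward (down) */
   y0 = round_down_to(y0, y_align) / y_scaledown;
   x1 = align_up(x1, x_align) / x_scaledown;      /* end rounded outward (up) */
   y1 = align_up(y1, y_align) / y_scaledown;

   printf("aux rect: (%u,%u) -> (%u,%u)\n", x0, y0, x1, y1); /* (0,0) -> (16,18) */
   return 0;
}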
@ -541,7 +541,7 @@ blorp_fast_clear(struct blorp_batch *batch,
assert(surf->surf->levels > 1 ||
surf->surf->logical_level0_px.d > 1 ||
surf->surf->logical_level0_px.a > 1);
const int phys_height0 = ALIGN(surf->surf->logical_level0_px.h,
const int phys_height0 = align(surf->surf->logical_level0_px.h,
surf->surf->image_alignment_el.h);
unaligned_height = phys_height0 % 32;
size_B = (int64_t)surf->surf->row_pitch_B * (phys_height0 - unaligned_height);

@ -583,7 +583,7 @@ blorp_fast_clear(struct blorp_batch *batch,
mem_surf.addr.offset, size_B, ISL_TILING_4);
assert(isl_surf.logical_level0_px.h == 32);
assert(isl_surf.logical_level0_px.a == 1);
isl_surf.row_pitch_B = ALIGN(isl_surf.row_pitch_B, 16 * 128);
isl_surf.row_pitch_B = align(isl_surf.row_pitch_B, 16 * 128);
} else {
isl_surf_from_mem(batch->blorp->isl_dev, &isl_surf,
mem_surf.addr.offset, size_B, ISL_TILING_64);

@ -1341,8 +1341,8 @@ blorp_ccs_resolve(struct blorp_batch *batch,
x_scaledown = aux_fmtl->bw / 2;
y_scaledown = aux_fmtl->bh / 2;
}
params.x1 = ALIGN(params.x1, x_scaledown) / x_scaledown;
params.y1 = ALIGN(params.y1, y_scaledown) / y_scaledown;
params.x1 = align(params.x1, x_scaledown) / x_scaledown;
params.y1 = align(params.y1, y_scaledown) / y_scaledown;
}

if (batch->blorp->isl_dev->info->ver >= 10) {

@ -1656,7 +1656,7 @@ blorp_get_compute_push_const(struct blorp_batch *batch,
{
const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
const unsigned push_const_size =
ALIGN(brw_cs_push_const_total_size(cs_prog_data, threads), 64);
align(brw_cs_push_const_total_size(cs_prog_data, threads), 64);
assert(cs_prog_data->push.cross_thread.size +
cs_prog_data->push.per_thread.size == sizeof(params->wm_inputs));

@ -1860,7 +1860,7 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
vfe.URBEntryAllocationSize = 2;

const uint32_t vfe_curbe_allocation =
ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
align(cs_prog_data->push.per_thread.regs * dispatch.threads +
cs_prog_data->push.cross_thread.regs, 2);
vfe.CURBEAllocationSize = vfe_curbe_allocation;
}

@ -1882,7 +1882,7 @@ blorp_get_compute_push_const(struct blorp_batch *batch,
{
const struct elk_cs_prog_data *cs_prog_data = params->cs_prog_data;
const unsigned push_const_size =
ALIGN(elk_cs_push_const_total_size(cs_prog_data, threads), 64);
align(elk_cs_push_const_total_size(cs_prog_data, threads), 64);
assert(cs_prog_data->push.cross_thread.size +
cs_prog_data->push.per_thread.size == sizeof(params->wm_inputs));

@ -1981,7 +1981,7 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
vfe.URBEntryAllocationSize = GFX_VER >= 8 ? 2 : 0;

const uint32_t vfe_curbe_allocation =
ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
align(cs_prog_data->push.per_thread.regs * dispatch.threads +
cs_prog_data->push.cross_thread.regs, 2);
vfe.CURBEAllocationSize = vfe_curbe_allocation;
}

@ -135,7 +135,7 @@ intel_get_urb_config(const struct intel_device_info *devinfo,
* Round them all up.
*/
for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
min_entries[i] = ALIGN(min_entries[i], granularity[i]);
min_entries[i] = align(min_entries[i], granularity[i]);
}

unsigned entry_size_bytes[4];

@ -323,7 +323,7 @@ intel_get_mesh_urb_config(const struct intel_device_info *devinfo,
* in slices beyond Slice0) of the MESH URB allocation, specified in
* multiples of 8 KB.
*/
push_constant_kb = ALIGN(push_constant_kb, 8);
push_constant_kb = align(push_constant_kb, 8);
total_urb_kb -= push_constant_kb;
const unsigned total_urb_avail_mesh_task_kb = total_urb_kb;

@ -360,9 +360,9 @@ intel_get_mesh_urb_config(const struct intel_device_info *devinfo,
*/
const unsigned min_mesh_entries = urb_cfg->size[MESA_SHADER_MESH] < 9 ? 8 : 1;
const unsigned min_task_entries = urb_cfg->size[MESA_SHADER_TASK] < 9 ? 8 : 1;
const unsigned min_mesh_urb_kb = ALIGN(urb_cfg->size[MESA_SHADER_MESH] *
const unsigned min_mesh_urb_kb = align(urb_cfg->size[MESA_SHADER_MESH] *
min_mesh_entries * 64, 1024) / 1024;
const unsigned min_task_urb_kb = ALIGN(urb_cfg->size[MESA_SHADER_TASK] *
const unsigned min_task_urb_kb = align(urb_cfg->size[MESA_SHADER_TASK] *
min_task_entries * 64, 1024) / 1024;

total_urb_kb -= (min_mesh_urb_kb + min_task_urb_kb);

@ -380,8 +380,8 @@ intel_get_mesh_urb_config(const struct intel_device_info *devinfo,
* in slices beyond Slice0) of the TASK URB allocation, specified in
* multiples of 8 KB.
*/
if ((total_urb_avail_mesh_task_kb - ALIGN(mesh_urb_kb, 8)) >= min_task_entries) {
mesh_urb_kb = ALIGN(mesh_urb_kb, 8);
if ((total_urb_avail_mesh_task_kb - align(mesh_urb_kb, 8)) >= min_task_entries) {
mesh_urb_kb = align(mesh_urb_kb, 8);
} else {
mesh_urb_kb = ROUND_DOWN_TO(mesh_urb_kb, 8);
}
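The mesh/task URB split above prefers to round the mesh allocation up to the next 8 KB boundary, but falls back to rounding it down when that would leave too little of the shared budget for the task stage. A small sketch of that decision; the budget numbers are made up and the check is simplified:

#include <stdio.h>

static unsigned align_up(unsigned v, unsigned a)   { return (v + a - 1) / a * a; }
static unsigned round_down(unsigned v, unsigned a) { return v / a * a; }

/* Round mesh_kb to a multiple of 8 KB, preferring up, but only if at least
 * min_task_kb of the shared budget would remain for the task stage. */
static unsigned
pick_mesh_urb_kb(unsigned mesh_kb, unsigned total_kb, unsigned min_task_kb)
{
   if (total_kb - align_up(mesh_kb, 8) >= min_task_kb)
      return align_up(mesh_kb, 8);
   else
      return round_down(mesh_kb, 8);
}

int
main(void)
{
   printf("%u\n", pick_mesh_urb_kb(21, 64, 16)); /* 24: rounding up still fits */
   printf("%u\n", pick_mesh_urb_kb(21, 32, 16)); /* 16: must round down */
   return 0;
}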
@ -1219,7 +1219,7 @@ remap_attr_reg(brw_shader &s,
const bool per_prim = src.nr < prog_data->num_per_primitive_inputs;
const unsigned base = urb_start +
(per_prim ? 0 :
ALIGN(prog_data->num_per_primitive_inputs / 2,
align(prog_data->num_per_primitive_inputs / 2,
reg_unit(s.devinfo)) * s.max_polygons);
const unsigned idx = per_prim ? src.nr :
src.nr - prog_data->num_per_primitive_inputs;

@ -223,7 +223,7 @@ brw_compile_gs(const struct brw_compiler *compiler,

/* 1 HWORD = 32 bytes = 256 bits */
prog_data->control_data_header_size_hwords =
ALIGN(control_data_header_size_bits, 256) / 256;
align(control_data_header_size_bits, 256) / 256;

/* Compute the output vertex size.
*

@ -276,7 +276,7 @@ brw_compile_gs(const struct brw_compiler *compiler,
unsigned output_vertex_size_bytes = prog_data->base.vue_map.num_slots * 16;
assert(output_vertex_size_bytes <= GFX7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
prog_data->output_vertex_size_hwords =
ALIGN(output_vertex_size_bytes, 32) / 32;
align(output_vertex_size_bytes, 32) / 32;

/* Compute URB entry size. The maximum allowed URB entry size is 32k.
* That divides up as follows:

@ -330,7 +330,7 @@ brw_compile_gs(const struct brw_compiler *compiler,


/* URB entry sizes are stored as a multiple of 64 bytes in gfx7+. */
prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
prog_data->base.urb_entry_size = align(output_size_bytes, 64) / 64;

assert(nir->info.gs.output_primitive < ARRAY_SIZE(gl_prim_to_hw_prim));
prog_data->output_topology =
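Several of these sites use the same idiom: round a byte (or bit) count up to a unit size and then divide by it, which is simply a ceiling division into hardware units (HWORDs, 64-byte URB rows, and so on). A quick standalone check of that equivalence:

#include <assert.h>
#include <stdio.h>

static unsigned align_up(unsigned v, unsigned a)     { return (v + a - 1) / a * a; }
static unsigned div_round_up(unsigned v, unsigned a) { return (v + a - 1) / a; }

int
main(void)
{
   /* e.g. a 1000-byte GS output converted to 64-byte URB units. */
   const unsigned output_size_bytes = 1000;
   const unsigned urb_entry_size = align_up(output_size_bytes, 64) / 64;

   assert(urb_entry_size == div_round_up(output_size_bytes, 64));
   printf("urb_entry_size = %u (64-byte units)\n", urb_entry_size); /* 16 */
   return 0;
}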
@ -124,7 +124,7 @@ brw_nir_lower_tue_outputs(nir_shader *nir, brw_tue_map *map)
NIR_PASS(_, nir, nir_lower_explicit_io,
nir_var_mem_task_payload, nir_address_format_32bit_offset);

map->size_dw = ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8);
map->size_dw = align(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8);
}

static void

@ -203,7 +203,7 @@ brw_nir_align_launch_mesh_workgroups_instr(nir_builder *b,
/* This will avoid special case in nir_lower_task_shader dealing with
* not vec4-aligned payload when payload_in_shared workaround is enabled.
*/
nir_intrinsic_set_range(intrin, ALIGN(range, 16));
nir_intrinsic_set_range(intrin, align(range, 16));

return true;
}

@ -505,7 +505,7 @@ brw_nir_lower_tue_inputs(nir_shader *nir, const brw_tue_map *map)
/* The types for Task Output and Mesh Input should match, so their sizes
* should also match.
*/
assert(!map || map->size_dw == ALIGN(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8));
assert(!map || map->size_dw == align(DIV_ROUND_UP(nir->info.task_payload_size, 4), 8));
} else {
/* Mesh doesn't read any input, to make it clearer set the
* task_payload_size to zero instead of keeping an incomplete size that

@ -263,7 +263,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
return NULL;

/* URB entry sizes are stored as a multiple of 64 bytes. */
vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
vue_prog_data->urb_entry_size = align(output_size_bytes, 64) / 64;

/* HS does not use the usual payload pushing from URB to GRFs,
* because we don't have enough registers for a full-size payload, and

@ -133,7 +133,7 @@ brw_compile_tes(const struct brw_compiler *compiler,
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);

/* URB entry sizes are stored as a multiple of 64 bytes. */
prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
prog_data->base.urb_entry_size = align(output_size_bytes, 64) / 64;

prog_data->base.urb_read_length = 0;

@ -407,7 +407,7 @@ brw_append_insns(struct brw_codegen *p, unsigned nr_insn, unsigned alignment)
assert(util_is_power_of_two_or_zero(sizeof(brw_eu_inst)));
assert(util_is_power_of_two_or_zero(alignment));
const unsigned align_insn = MAX2(alignment / sizeof(brw_eu_inst), 1);
const unsigned start_insn = ALIGN(p->nr_insn, align_insn);
const unsigned start_insn = align(p->nr_insn, align_insn);
const unsigned new_nr_insn = start_insn + nr_insn;

if (p->store_size < new_nr_insn) {
@ -7295,7 +7295,7 @@ brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
components = last_component - first_read_component + 1;
}

total = ALIGN(components, REG_SIZE * reg_unit(devinfo) / 4);
total = align(components, REG_SIZE * reg_unit(devinfo) / 4);
dest = ubld.vgrf(BRW_TYPE_UD, total);
} else {
total = components * bld.dispatch_width();

@ -7902,7 +7902,7 @@ brw_from_nir(brw_shader *s)
*/
brw_from_nir_setup_outputs(ntb);
brw_from_nir_emit_system_values(ntb);
ntb.s.last_scratch = ALIGN(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width;
ntb.s.last_scratch = align(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width;

brw_from_nir_emit_impl(ntb, nir_shader_get_entrypoint((nir_shader *)ntb.nir));

@ -527,7 +527,7 @@ brw_flag_mask(const brw_inst *inst, unsigned width)
assert(util_is_power_of_two_nonzero(width));
const unsigned start = (inst->flag_subreg * 16 + inst->group) &
~(width - 1);
const unsigned end = start + ALIGN(inst->exec_size, width);
const unsigned end = start + align(inst->exec_size, width);
return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
}
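brw_flag_mask() above widens an instruction's flag-register usage to the given granularity, rounding the start down and the end up, and then builds a byte mask of the flag bytes covered. A worked standalone version with simplified parameters (this is not the compiler's actual signature):

#include <stdio.h>

static unsigned div_round_up(unsigned v, unsigned a) { return (v + a - 1) / a; }
static unsigned align_up(unsigned v, unsigned a)     { return (v + a - 1) & ~(a - 1); }

/* Byte mask of the flag-register bytes touched by a group of channels:
 * flag_bit_start is the first flag bit used, exec_size the channel count,
 * width the power-of-two granularity the access is widened to. */
static unsigned
flag_byte_mask(unsigned flag_bit_start, unsigned exec_size, unsigned width)
{
   const unsigned start = flag_bit_start & ~(width - 1);
   const unsigned end = start + align_up(exec_size, width);
   return ((1u << div_round_up(end, 8)) - 1) & ~((1u << (start / 8)) - 1);
}

int
main(void)
{
   /* A SIMD8 group starting at flag bit 16 touches byte 2 only: mask 0x4. */
   printf("0x%x\n", flag_byte_mask(16, 8, 8));
   /* A SIMD16 group starting at bit 0 touches bytes 0 and 1: mask 0x3. */
   printf("0x%x\n", flag_byte_mask(0, 16, 16));
   return 0;
}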
@ -467,7 +467,7 @@ brw_lower_find_live_channel(brw_shader &s)
* specified quarter control as result.
*/
if (inst->group > 0)
ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8)));
ubld.SHR(mask, mask, brw_imm_ud(align(inst->group, 8)));

ubld.AND(mask, exec_mask, mask);
exec_mask = mask;

@ -150,7 +150,7 @@ analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block)
nir_def_last_component_read(&intrin->def) + 1;
const int bytes = num_components * (intrin->def.bit_size / 8);
const int start = ROUND_DOWN_TO(byte_offset, sizeof_GRF);
const int end = ALIGN(byte_offset + bytes, sizeof_GRF);
const int end = align(byte_offset + bytes, sizeof_GRF);
const int chunks = (end - start) / sizeof_GRF;

/* TODO: should we count uses in loops as higher benefit? */

@ -1574,7 +1574,7 @@ brw_opt_combine_constants(brw_shader &s)
* seem to have additional alignment requirements, so account for that
* too.
*/
assert(reg.offset == ALIGN(reg.offset, get_alignment_for_imm(imm)));
assert(reg.offset == align(reg.offset, get_alignment_for_imm(imm)));

struct brw_reg imm_reg = build_imm_reg_for_copy(imm);

@ -52,7 +52,7 @@ brw_assign_regs_trivial(brw_shader &s)
int reg_width = s.dispatch_width / 8;

/* Note that compressed instructions require alignment to 2 registers. */
hw_reg_mapping[0] = ALIGN(s.first_non_payload_grf, reg_width);
hw_reg_mapping[0] = align(s.first_non_payload_grf, reg_width);
for (i = 1; i <= s.alloc.count; i++) {
hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
DIV_ROUND_UP(s.alloc.sizes[i - 1],

@ -264,7 +264,7 @@ public:
* for reg_width == 2.
*/
int reg_width = fs->dispatch_width / 8;
payload_node_count = ALIGN(fs->first_non_payload_grf, reg_width);
payload_node_count = align(fs->first_non_payload_grf, reg_width);

/* Get payload IP information */
payload_last_use_ip = ralloc_array(mem_ctx, int, payload_node_count);

@ -1180,7 +1180,7 @@ brw_reg_alloc::choose_spill_reg()
brw_reg
brw_reg_alloc::alloc_spill_reg(unsigned size, int ip)
{
int vgrf = brw_allocate_vgrf_units(*fs, ALIGN(size, reg_unit(devinfo))).nr;
int vgrf = brw_allocate_vgrf_units(*fs, align(size, reg_unit(devinfo))).nr;
int class_idx = DIV_ROUND_UP(size, reg_unit(devinfo)) - 1;
int n = ra_add_node(g, compiler->reg_set.classes[class_idx]);
assert(n == first_vgrf_node + vgrf);

@ -1216,7 +1216,7 @@ brw_reg_alloc::spill_reg(unsigned spill_reg)
{
int size = fs->alloc.sizes[spill_reg];
unsigned int spill_offset = fs->last_scratch;
assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */
assert(align(spill_offset, 16) == spill_offset); /* oword read/write req. */

fs->spilled_any_registers = true;

@ -233,7 +233,7 @@ brw_rt_compute_scratch_layout(struct brw_rt_scratch_layout *layout,
*/
assert(size % 64 == 0);
layout->sw_stack_start = size;
layout->sw_stack_size = ALIGN(sw_stack_size, 64);
layout->sw_stack_size = align(sw_stack_size, 64);

/* Currently it's always the case that sw_stack_size is a power of
* two, but power-of-two SW stack sizes are prone to causing
@ -570,7 +570,7 @@ elk_append_insns(struct elk_codegen *p, unsigned nr_insn, unsigned alignment)
assert(util_is_power_of_two_or_zero(sizeof(elk_inst)));
assert(util_is_power_of_two_or_zero(alignment));
const unsigned align_insn = MAX2(alignment / sizeof(elk_inst), 1);
const unsigned start_insn = ALIGN(p->nr_insn, align_insn);
const unsigned start_insn = align(p->nr_insn, align_insn);
const unsigned new_nr_insn = start_insn + nr_insn;

if (p->store_size < new_nr_insn) {

@ -945,7 +945,7 @@ namespace {
assert(util_is_power_of_two_nonzero(width));
const unsigned start = (inst->flag_subreg * 16 + inst->group) &
~(width - 1);
const unsigned end = start + ALIGN(inst->exec_size, width);
const unsigned end = start + align(inst->exec_size, width);
return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
}

@ -1525,7 +1525,7 @@ elk_fs_visitor::assign_urb_setup()
const bool per_prim = inst->src[i].nr < prog_data->num_per_primitive_inputs;
const unsigned base = urb_start +
(per_prim ? 0 :
ALIGN(prog_data->num_per_primitive_inputs / 2,
align(prog_data->num_per_primitive_inputs / 2,
reg_unit(devinfo)));
const unsigned idx = per_prim ? inst->src[i].nr :
inst->src[i].nr - prog_data->num_per_primitive_inputs;

@ -5296,7 +5296,7 @@ elk_fs_visitor::lower_find_live_channel()
* specified quarter control as result.
*/
if (inst->group > 0)
ubld.SHR(mask, mask, elk_imm_ud(ALIGN(inst->group, 8)));
ubld.SHR(mask, mask, elk_imm_ud(align(inst->group, 8)));

ubld.AND(mask, exec_mask, mask);
exec_mask = mask;

@ -6003,7 +6003,7 @@ elk_fs_visitor::allocate_registers(bool allow_spilling)
* field documentation, platforms prior to Haswell measure scratch
* size linearly with a range of [1kB, 12kB] and 1kB granularity.
*/
prog_data->total_scratch = ALIGN(last_scratch, 1024);
prog_data->total_scratch = align(last_scratch, 1024);
max_scratch_size = 12 * 1024;
}
}

@ -6666,7 +6666,7 @@ elk_nir_populate_wm_prog_data(nir_shader *shader,
static inline int
elk_register_blocks(int reg_count)
{
return ALIGN(reg_count, 16) / 16 - 1;
return align(reg_count, 16) / 16 - 1;
}

const unsigned *

@ -1461,7 +1461,7 @@ elk_fs_visitor::opt_combine_constants()
* instructions seem to have additional alignment requirements, so
* account for that too.
*/
reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm));
reg.offset = align(reg.offset, get_alignment_for_imm(imm));

/* Ensure we have enough space in the register to copy the immediate */
if (reg.offset + imm->size > REG_SIZE) {

@ -1555,7 +1555,7 @@ elk_fs_visitor::opt_combine_constants()
* seem to have additional alignment requirements, so account for that
* too.
*/
assert(reg.offset == ALIGN(reg.offset, get_alignment_for_imm(imm)));
assert(reg.offset == align(reg.offset, get_alignment_for_imm(imm)));

struct elk_reg imm_reg = build_imm_reg_for_copy(imm);

@ -5036,7 +5036,7 @@ fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
}

const unsigned total_dwords =
ALIGN(instr->num_components, REG_SIZE * reg_unit(devinfo) / 4);
align(instr->num_components, REG_SIZE * reg_unit(devinfo) / 4);
unsigned loaded_dwords = 0;

const elk_fs_reg packed_consts =

@ -5225,7 +5225,7 @@ fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
break;

case nir_intrinsic_load_global_constant_uniform_block_intel: {
const unsigned total_dwords = ALIGN(instr->num_components,
const unsigned total_dwords = align(instr->num_components,
REG_SIZE * reg_unit(devinfo) / 4);
unsigned loaded_dwords = 0;

@ -5360,7 +5360,7 @@ fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_fs_reg(elk_imm_ud(GFX7_BTI_SLM));
}

const unsigned total_dwords = ALIGN(instr->num_components,
const unsigned total_dwords = align(instr->num_components,
REG_SIZE * reg_unit(devinfo) / 4);
unsigned loaded_dwords = 0;

@ -7002,7 +7002,7 @@ nir_to_elk(elk_fs_visitor *s)
fs_nir_setup_outputs(ntb);
fs_nir_setup_uniforms(ntb.s);
fs_nir_emit_system_values(ntb);
ntb.s.last_scratch = ALIGN(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width;
ntb.s.last_scratch = align(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width;

fs_nir_emit_impl(ntb, nir_shader_get_entrypoint((nir_shader *)ntb.nir));
@ -52,7 +52,7 @@ elk_fs_visitor::assign_regs_trivial()
int reg_width = dispatch_width / 8;

/* Note that compressed instructions require alignment to 2 registers. */
hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
hw_reg_mapping[0] = align(this->first_non_payload_grf, reg_width);
for (i = 1; i <= this->alloc.count; i++) {
hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
DIV_ROUND_UP(this->alloc.sizes[i - 1],

@ -325,7 +325,7 @@ public:
*/
int reg_width = fs->dispatch_width / 8;
rsi = util_logbase2(reg_width);
payload_node_count = ALIGN(fs->first_non_payload_grf, reg_width);
payload_node_count = align(fs->first_non_payload_grf, reg_width);

/* Get payload IP information */
payload_last_use_ip = ralloc_array(mem_ctx, int, payload_node_count);

@ -940,7 +940,7 @@ elk_fs_reg_alloc::choose_spill_reg()
elk_fs_reg
elk_fs_reg_alloc::alloc_spill_reg(unsigned size, int ip)
{
int vgrf = fs->alloc.allocate(ALIGN(size, reg_unit(devinfo)));
int vgrf = fs->alloc.allocate(align(size, reg_unit(devinfo)));
int class_idx = DIV_ROUND_UP(size, reg_unit(devinfo)) - 1;
int n = ra_add_node(g, compiler->fs_reg_sets[rsi].classes[class_idx]);
assert(n == first_vgrf_node + vgrf);

@ -975,7 +975,7 @@ elk_fs_reg_alloc::spill_reg(unsigned spill_reg)
{
int size = fs->alloc.sizes[spill_reg];
unsigned int spill_offset = fs->last_scratch;
assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */
assert(align(spill_offset, 16) == spill_offset); /* oword read/write req. */

/* Spills may use MRFs 13-15 in the SIMD16 case. Our texturing is done
* using up to 11 MRFs starting from either m1 or m2, and fb writes can use

@ -160,7 +160,7 @@ analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block)
const int bytes = nir_intrinsic_dest_components(intrin) *
(intrin->def.bit_size / 8);
const int start = ROUND_DOWN_TO(byte_offset, 32);
const int end = ALIGN(byte_offset + bytes, 32);
const int end = align(byte_offset + bytes, 32);
const int chunks = (end - start) / 32;

/* TODO: should we count uses in loops as higher benefit? */

@ -1301,7 +1301,7 @@ elk_compile_tes(const struct elk_compiler *compiler,
BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);

/* URB entry sizes are stored as a multiple of 64 bytes. */
prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
prog_data->base.urb_entry_size = align(output_size_bytes, 64) / 64;

prog_data->base.urb_read_length = 0;

@ -120,7 +120,7 @@ vec4_gs_visitor::setup_varying_inputs(int payload_reg,
}
}

int regs_used = ALIGN(input_array_stride * num_input_vertices,
int regs_used = align(input_array_stride * num_input_vertices,
attributes_per_reg) / attributes_per_reg;
return payload_reg + regs_used;
}

@ -672,7 +672,7 @@ elk_compile_gs(const struct elk_compiler *compiler,

/* 1 HWORD = 32 bytes = 256 bits */
prog_data->control_data_header_size_hwords =
ALIGN(c.control_data_header_size_bits, 256) / 256;
align(c.control_data_header_size_bits, 256) / 256;

/* Compute the output vertex size.
*

@ -726,7 +726,7 @@ elk_compile_gs(const struct elk_compiler *compiler,
assert(compiler->devinfo->ver == 6 ||
output_vertex_size_bytes <= GFX7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
prog_data->output_vertex_size_hwords =
ALIGN(output_vertex_size_bytes, 32) / 32;
align(output_vertex_size_bytes, 32) / 32;

/* Compute URB entry size. The maximum allowed URB entry size is 32k.
* That divides up as follows:

@ -793,9 +793,9 @@ elk_compile_gs(const struct elk_compiler *compiler,
* a multiple of 128 bytes in gfx6.
*/
if (compiler->devinfo->ver >= 7) {
prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
prog_data->base.urb_entry_size = align(output_size_bytes, 64) / 64;
} else {
prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;
prog_data->base.urb_entry_size = align(output_size_bytes, 128) / 128;
}

assert(nir->info.gs.output_primitive < ARRAY_SIZE(elk::gl_prim_to_hw_prim));

@ -423,7 +423,7 @@ elk_compile_tcs(const struct elk_compiler *compiler,
return NULL;

/* URB entry sizes are stored as a multiple of 64 bytes. */
vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
vue_prog_data->urb_entry_size = align(output_size_bytes, 64) / 64;

/* HS does not use the usual payload pushing from URB to GRFs,
* because we don't have enough registers for a full-size payload, and

@ -45,7 +45,7 @@
#define FILE_DEBUG_FLAG DEBUG_TEXTURE

#define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
#define ALIGN_UP(a, b) ALIGN(a, b)
#define ALIGN_UP(a, b) align(a, b)

/* Tile dimensions. Width and span are in bytes, height is in pixels (i.e.
* unitless). A "span" is the most number of bytes we can copy from linear
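The tiled-copy code wraps the two directions as ALIGN_DOWN/ALIGN_UP so a linear byte range can be split into ragged edges plus a whole-tile middle. A sketch of that split with stand-in macros; the 64-byte tile width is just an example value:

#include <stdio.h>

#define ALIGN_DOWN(a, b) ((a) / (b) * (b))
#define ALIGN_UP(a, b)   (((a) + (b) - 1) / (b) * (b))

int
main(void)
{
   const unsigned tile_w = 64;        /* example tile width in bytes */
   const unsigned x0 = 100, x1 = 500; /* byte range to copy */

   /* Ragged left edge, whole tiles in the middle, ragged right edge. */
   const unsigned x0_aligned_up = ALIGN_UP(x0, tile_w);
   const unsigned x1_aligned_down = ALIGN_DOWN(x1, tile_w);

   printf("left edge:   [%u, %u)\n", x0, x0_aligned_up);              /* [100, 128) */
   printf("whole tiles: [%u, %u)\n", x0_aligned_up, x1_aligned_down); /* [128, 448) */
   printf("right edge:  [%u, %u)\n", x1_aligned_down, x1);            /* [448, 500) */
   return 0;
}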
@ -94,7 +94,7 @@ write_execlists_header(struct aub_file *aub, const char *name)
app_name_len =
snprintf(app_name, sizeof(app_name), "PCI-ID=0x%X %s",
aub->pci_id, name);
app_name_len = ALIGN(app_name_len, sizeof(uint32_t));
app_name_len = align(app_name_len, sizeof(uint32_t));

dwords = 5 + app_name_len / sizeof(uint32_t);
dword_out(aub, CMD_MEM_TRACE_VERSION | (dwords - 1));
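The AUB writer pads byte lengths out to whole 32-bit dwords before counting them into the trace header. A small standalone version of that padding; the string content is arbitrary:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Round a byte length up to a multiple of sizeof(uint32_t). */
static uint32_t
align_to_dword(uint32_t len)
{
   return (len + sizeof(uint32_t) - 1) & ~(uint32_t)(sizeof(uint32_t) - 1);
}

int
main(void)
{
   const char app_name[] = "PCI-ID=0x1234 demo"; /* example payload */
   uint32_t len = (uint32_t)strlen(app_name);    /* 18 bytes */
   uint32_t padded = align_to_dword(len);        /* 20 bytes */
   uint32_t dwords = padded / sizeof(uint32_t);  /* 5 dwords */

   printf("len=%u padded=%u dwords=%u\n", len, padded, dwords);
   return 0;
}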
@ -189,7 +189,7 @@ mem_trace_memory_write_header_out(struct aub_file *aub, uint64_t addr,
uint32_t len, uint32_t addr_space,
const char *desc)
{
uint32_t dwords = ALIGN(len, sizeof(uint32_t)) / sizeof(uint32_t);
uint32_t dwords = align(len, sizeof(uint32_t)) / sizeof(uint32_t);

if (aub->verbose_log_file) {
fprintf(aub->verbose_log_file,