mesa/src/panfrost/encoder/pan_attributes.c

/*
 * Copyright (C) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include "util/u_math.h"
#include "panfrost-job.h"
#include "pan_encoder.h"

/* This file handles attribute descriptors (mali_attr_meta). The
 * bulk of the complexity is from instancing. See mali_job for
 * notes on how this works. But basically, for small vertex
 * counts, we have a lookup table, and for large vertex counts,
 * we look at the high bits as a heuristic. This has to match
 * exactly how the hardware calculates this (which is why the
 * algorithm is so weird) or else instancing will break. */

/* Given an odd number (of the form 2k + 1), compute k. The argument is
 * parenthesized so that expressions built from low-precedence operators
 * (e.g. ODD(a | b)) expand as intended. */
#define ODD(odd) (((odd) - 1) >> 1)

/* Padding rule used by panfrost_padded_vertex_count for counts below 20:
 * the counts 11, 13, 15 and 19 are bumped by one, every other value is
 * returned unchanged. */

static unsigned
panfrost_small_padded_vertex_count(unsigned idx)
{
        switch (idx) {
        case 11:
        case 13:
        case 15:
        case 19:
                /* These counts round up to the next integer */
                return idx + 1;
        default:
                /* Everything else is already a valid padded count */
                return idx;
        }
}

/* Pads a large vertex count by inspecting its four most significant bits.
 *
 * vertex_count must be at least 8 so that a full top nibble exists (the
 * caller in this file only passes counts of 20 or more); in particular it
 * must be nonzero, since __builtin_clz(0) is undefined.
 *
 * All products are computed in unsigned arithmetic so that very large counts
 * do not trigger signed overflow. Counts of 0xE0000000 and above would pad to
 * 2^32, which is not representable in the return type; such counts are not
 * supported. */

static unsigned
panfrost_large_padded_vertex_count(uint32_t vertex_count)
{
        /* First, we have to find the highest set one (1-based position) */
        unsigned highest = 32 - __builtin_clz(vertex_count);

        /* Using that, we isolate the highest 4 bits; n is the number of bits
         * sitting below that nibble */
        unsigned n = highest - 4;
        unsigned nibble = (vertex_count >> n) & 0xF;

        /* Great, we have the nibble. Now we can just try possibilities. Note
         * that we don't care about the bottom most bit in most cases, and we
         * know the top bit must be 1 */

        unsigned middle_two = (nibble >> 1) & 0x3;

        switch (middle_two) {
        case 0b00:
                /* 1000 -> 9 << n, 1001 -> 10 << n */
                if (!(nibble & 1))
                        return (1u << n) * 9;
                else
                        return (1u << (n + 1)) * 5;
        case 0b01:
                /* 101x -> 12 << n */
                return (1u << (n + 2)) * 3;
        case 0b10:
                /* 110x -> 14 << n */
                return (1u << (n + 1)) * 7;
        case 0b11:
                /* 111x -> 16 << n, i.e. the next power of two */
                return (1u << (n + 4));
        default:
                return 0; /* unreachable */
        }
}

/* Returns the padded vertex count for a given vertex count. Counts below 20
 * go through the small-count rule; everything else is derived from the high
 * bits of the count. */

unsigned
panfrost_padded_vertex_count(unsigned vertex_count)
{
        const unsigned small_limit = 20;

        return (vertex_count < small_limit) ?
               panfrost_small_padded_vertex_count(vertex_count) :
               panfrost_large_padded_vertex_count(vertex_count);
}

/* The much, much more irritating case -- instancing is enabled. See
 * panfrost-job.h for notes on how this works */

/* Computes the "magic" multiplier used to divide by a non-power-of-two
 * divisor via multiply-and-shift.
 *
 * hw_divisor:  the divisor d. It must be nonzero (it is used as the right
 *              operand of / and %). Callers in this file only get here after
 *              util_is_power_of_two_or_zero() has rejected the value.
 * o_shift:     receives floor(log2(d)).
 * extra_flags: set to 1 when the round-down variant is selected; it is left
 *              untouched otherwise, so callers must initialize it.
 *
 * Returns the 32-bit multiplier with its top bit (which is always set, and
 * implicit in the encoding) cleared. */

static unsigned
panfrost_compute_magic_divisor(unsigned hw_divisor, unsigned *o_shift, unsigned *extra_flags)
{
        /* We have a NPOT divisor. Here's the fun one (multiplying by
         * the inverse and shifting) */

        /* floor(log2(d)) */
        unsigned shift = util_logbase2(hw_divisor);

        /* t = 2^(32 + shift). The shift amount can reach 63, so the operand
         * must be an unsigned 64-bit one: shifting a signed 1 into the sign
         * bit is undefined */
        uint64_t shift_hi = 32 + shift;
        uint64_t t = (uint64_t) 1 << shift_hi;

        /* e = 2^(shift + 32) % d */
        uint64_t e = t % hw_divisor;

        /* m = ceil(2^(32 + shift) / d), computed exactly with integers. A
         * double only keeps about 21 fractional bits for a quotient this
         * large, so a floating-point ceil() can round down to the floor when
         * d is large */
        uint64_t m = (t / hw_divisor) + (e != 0);

        /* Default case */
        uint32_t magic_divisor = (uint32_t) m;

        /* Apply round-down algorithm? e <= 2^shift?. XXX: The blob
         * seems to use a different condition */
        if (e <= ((uint64_t) 1 << shift)) {
                magic_divisor = (uint32_t) (m - 1);
                *extra_flags = 1;
        }

        /* Top flag implicitly set */
        assert(magic_divisor & (1u << 31));
        magic_divisor &= ~(1u << 31);
        *o_shift = shift;

        return magic_divisor;
}

/* Fills in the attribute record(s) for an attribute fetched while instancing
 * is enabled and returns how many records were written (1 or 2).
 *
 * divisor == 0 selects the per-vertex MODULO mode, using instance_shift and
 * instance_odd. Otherwise the hardware-level divisor is the product of the
 * instance divisor and the padded count, encoded either as a plain shift
 * (power of two) or as a magic divisor spread over two records. */

unsigned
panfrost_vertex_instanced(
        unsigned padded_count,
        unsigned instance_shift, unsigned instance_odd,
        unsigned divisor,
        union mali_attr *attrs)
{
        /* Per-vertex attributes use the MODULO mode; the modulus is passed
         * in pre-factored as shift/odd */
        if (divisor == 0) {
                attrs->elements |= MALI_ATTR_MODULO;
                attrs->shift = instance_shift;
                attrs->extra_flags = instance_odd;

                return 1;
        }

        /* With a divisor, packing depends on the hardware-level divisor */
        const unsigned hw_divisor = padded_count * divisor;

        /* Easy path: a power-of-two hardware divisor is just a shift */
        if (util_is_power_of_two_or_zero(hw_divisor)) {
                attrs->elements |= MALI_ATTR_POT_DIVIDE;
                attrs->shift = __builtin_ctz(hw_divisor);

                return 1;
        }

        /* General path: multiply by a magic inverse. The first record holds
         * the mode, shift and flags; the following record carries the magic
         * divisor together with the original divisor */
        unsigned npot_shift = 0, npot_flags = 0;
        unsigned magic =
                panfrost_compute_magic_divisor(hw_divisor, &npot_shift, &npot_flags);

        attrs[1].magic_divisor = magic;

        attrs[0].elements |= MALI_ATTR_NPOT_DIVIDE;
        attrs[0].shift = npot_shift;
        attrs[0].extra_flags = npot_flags;

        attrs[1].unk = 0x20;
        attrs[1].zero = 0;
        attrs[1].divisor = divisor;

        return 2;
}

/* Records for gl_VertexID and gl_InstanceID use a slightly special encoding,
 * but the idea is the same */

void
panfrost_vertex_id(
        unsigned padded_count,
        union mali_attr *attr)
{
        /* The padded count is stored factored: the shift field gets the
         * number of trailing zero bits, and extra_flags gets what remains
         * after shifting out those zeros plus the lowest set bit (k for a
         * count of the form (2k + 1) << shift) */

        const unsigned trailing_zeros = __builtin_ctz(padded_count);

        attr->elements = MALI_ATTR_VERTEXID;
        attr->shift = trailing_zeros;
        attr->extra_flags = padded_count >> (attr->shift + 1);

        /* Size and stride are zeroed for this special record */
        attr->size = 0;
        attr->stride = 0;
}

void
panfrost_instance_id(
        unsigned padded_count,
        union mali_attr *attr)
{
        /* Start from a zeroed special record */
        attr->elements = MALI_ATTR_INSTANCEID;
        attr->stride = 0;
        attr->extra_flags = 0;
        attr->size = 0;

        /* POT records have just a shift directly with an off-by-one for
         * unclear reasons */
        if (util_is_power_of_two_or_zero(padded_count)) {
                attr->shift = __builtin_ctz(padded_count) - 1;
                return;
        }

        /* NPOT records have a magic divisor smushed into the stride field
         * (which is unused for these special records), alongside the shift
         * and flags that go with it */
        unsigned magic_shift = 0, magic_flags = 0;

        attr->stride = panfrost_compute_magic_divisor(padded_count, &magic_shift, &magic_flags);
        attr->shift = magic_shift;
        attr->extra_flags = magic_flags;
}