mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 05:18:08 +02:00
mesa: Add SSE 4.1 optimisation for glDrawElements.
Makes use of SSE 4.1 to speed up computation of the min and max elements. Callgrind CPU usage results from pts benchmarks: Openarena 0.8.8: 3.67% -> 1.03% UrbanTerror: 2.36% -> 0.81% V5: - actually make use of the optimisation in android (Emil Velikov) - set a better array size limit for using SSE and added TODO V4: - fixed bugs with incrementing pointer and updating counters V3: - Removed sse_minmax.c from Makefile.sources - handle the first few values without SSE until the pointer is aligned and use _mm_load_si128 rather than _mm_loadu_si128 - guard the call to the SSE code better at build time V2: - removed GL* types - use _mm_store_si128() rather than _mm_store_ps() - add runtime check for SSE - use aligned attribute for local min/max - bunch of tidyups Reviewed-by: Juha-Pekka Heikkila <juhapekka.heikkila@gmail.com> Reviewed-by: Matt Turner <mattst88@gmail.com> Signed-off-by: Timothy Arceri <t_arceri@yahoo.com.au>
This commit is contained in:
parent
9557cf7d0d
commit
1378617218
6 changed files with 152 additions and 5 deletions
|
|
@ -51,10 +51,16 @@ endif # MESA_ENABLE_ASM
|
|||
|
||||
ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
|
||||
LOCAL_SRC_FILES += \
|
||||
$(SRCDIR)main/streaming-load-memcpy.c
|
||||
$(SRCDIR)main/streaming-load-memcpy.c \
|
||||
$(SRCDIR)main/sse_minmax.c
|
||||
LOCAL_CFLAGS := -msse4.1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
|
||||
LOCAL_CFLAGS += \
|
||||
-DUSE_SSE41
|
||||
endif
|
||||
|
||||
LOCAL_C_INCLUDES := \
|
||||
$(call intermediates-dir-for STATIC_LIBRARIES,libmesa_program,,) \
|
||||
$(MESA_TOP)/src \
|
||||
|
|
|
|||
|
|
@ -48,6 +48,11 @@ ifeq ($(TARGET_ARCH),x86)
|
|||
endif # x86
|
||||
endif # MESA_ENABLE_ASM
|
||||
|
||||
ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
|
||||
LOCAL_CFLAGS := \
|
||||
-DUSE_SSE41
|
||||
endif
|
||||
|
||||
LOCAL_C_INCLUDES := \
|
||||
$(call intermediates-dir-for STATIC_LIBRARIES,libmesa_program,,) \
|
||||
$(MESA_TOP)/src/gallium/auxiliary \
|
||||
|
|
|
|||
|
|
@ -151,7 +151,8 @@ libmesagallium_la_LIBADD = \
|
|||
$(ARCH_LIBS)
|
||||
|
||||
libmesa_sse41_la_SOURCES = \
|
||||
main/streaming-load-memcpy.c
|
||||
main/streaming-load-memcpy.c \
|
||||
main/sse_minmax.c
|
||||
libmesa_sse41_la_CFLAGS = $(AM_CFLAGS) -msse4.1
|
||||
|
||||
pkgconfigdir = $(libdir)/pkgconfig
|
||||
|
|
|
|||
97
src/mesa/main/sse_minmax.c
Normal file
97
src/mesa/main/sse_minmax.c
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
/*
|
||||
* Copyright © 2014 Timothy Arceri
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* Author:
|
||||
* Timothy Arceri <t_arceri@yahoo.com.au>
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef __SSE4_1__
|
||||
#include "main/sse_minmax.h"
|
||||
#include <smmintrin.h>
|
||||
#include <stdint.h>
|
||||
|
||||
/**
 * Find the smallest and largest value in an array of unsigned integers,
 * processing four elements at a time with SSE 4.1 where possible.
 *
 * \param ui_indices  array of \p count unsigned values to scan
 * \param min_index   receives the smallest value found, or ~0U when
 *                    \p count is 0
 * \param max_index   receives the largest value found, or 0 when
 *                    \p count is 0
 * \param count       number of elements in \p ui_indices
 */
void
_mesa_uint_array_min_max(const unsigned *ui_indices, unsigned *min_index,
                         unsigned *max_index, const unsigned count)
{
   unsigned max_ui = 0;
   unsigned min_ui = ~0U;
   unsigned i = 0;
   unsigned aligned_count = count;

   /* Handle the first few values without SSE until the pointer is 16-byte
    * aligned, which _mm_load_si128() below requires.  After this loop
    * ui_indices points at the first unprocessed element and aligned_count
    * is the number of elements still to be examined.
    */
   while (((uintptr_t)ui_indices & 15) && aligned_count) {
      if (*ui_indices > max_ui)
         max_ui = *ui_indices;
      if (*ui_indices < min_ui)
         min_ui = *ui_indices;

      aligned_count--;
      ui_indices++;
   }

   /* TODO: The actual threshold for SSE being useful may be higher than 8.
    *       Some careful microbenchmarks and measurement are required to
    *       find the actual tipping point.
    */
   if (aligned_count >= 8) {
      unsigned max_arr[4] __attribute__ ((aligned (16)));
      unsigned min_arr[4] __attribute__ ((aligned (16)));
      /* Number of elements covered by whole 4-wide vectors. */
      const unsigned vec_count = aligned_count & ~0x3u;
      /* Keep the const qualifier: the input array is only ever read. */
      const __m128i *ui_indices_ptr = (const __m128i *)ui_indices;
      __m128i max_ui4 = _mm_setzero_si128();
      /* All bits set in every lane, i.e. ~0U per element. */
      __m128i min_ui4 = _mm_set1_epi32(-1);

      for (i = 0; i < vec_count / 4; i++) {
         const __m128i ui_indices4 = _mm_load_si128(&ui_indices_ptr[i]);

         max_ui4 = _mm_max_epu32(ui_indices4, max_ui4);
         min_ui4 = _mm_min_epu32(ui_indices4, min_ui4);
      }

      /* Spill the per-lane results and fold them into the scalar results
       * gathered while aligning the pointer.
       */
      _mm_store_si128((__m128i *)max_arr, max_ui4);
      _mm_store_si128((__m128i *)min_arr, min_ui4);

      for (i = 0; i < 4; i++) {
         if (max_arr[i] > max_ui)
            max_ui = max_arr[i];
         if (min_arr[i] < min_ui)
            min_ui = min_arr[i];
      }

      /* Resume the scalar loop right after the vectorised elements. */
      i = vec_count;
   }

   /* Scalar tail: everything when the SSE path was skipped (i == 0),
    * otherwise the up-to-three elements left after the last full vector.
    */
   for (; i < aligned_count; i++) {
      if (ui_indices[i] > max_ui)
         max_ui = ui_indices[i];
      if (ui_indices[i] < min_ui)
         min_ui = ui_indices[i];
   }

   *min_index = min_ui;
   *max_index = max_ui;
}
|
||||
|
||||
#endif
|
||||
30
src/mesa/main/sse_minmax.h
Normal file
30
src/mesa/main/sse_minmax.h
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
/*
|
||||
* Copyright © 2014 Timothy Arceri
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* Author:
|
||||
* Timothy Arceri <t_arceri@yahoo.com.au>
|
||||
*
|
||||
*/
|
||||
|
||||
/**
 * Compute the minimum and maximum value of an array of unsigned integers.
 *
 * \param ui_indices  array of \p count unsigned values to scan
 * \param min_index   receives the smallest value found (~0U if \p count is 0)
 * \param max_index   receives the largest value found (0 if \p count is 0)
 * \param count       number of elements in \p ui_indices
 *
 * The definition in main/sse_minmax.c is only compiled when __SSE4_1__ is
 * defined.
 */
void
|
||||
_mesa_uint_array_min_max(const unsigned *ui_indices, unsigned *min_index,
|
||||
unsigned *max_index, const unsigned count);
|
||||
|
|
@ -36,6 +36,8 @@
|
|||
#include "main/enums.h"
|
||||
#include "main/macros.h"
|
||||
#include "main/transformfeedback.h"
|
||||
#include "main/sse_minmax.h"
|
||||
#include "x86/common_x86_asm.h"
|
||||
|
||||
#include "vbo_context.h"
|
||||
|
||||
|
|
@ -119,10 +121,16 @@ vbo_get_minmax_index(struct gl_context *ctx,
|
|||
}
|
||||
}
|
||||
else {
|
||||
for (i = 0; i < count; i++) {
|
||||
if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
|
||||
if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
|
||||
#if defined(USE_SSE41)
|
||||
if (cpu_has_sse4_1) {
|
||||
_mesa_uint_array_min_max(ui_indices, &min_ui, &max_ui, count);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
for (i = 0; i < count; i++) {
|
||||
if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
|
||||
if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
|
||||
}
|
||||
}
|
||||
*min_index = min_ui;
|
||||
*max_index = max_ui;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue