From 2c9b7fc7a805bdf892a0c246db260939b15e0411 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= Date: Fri, 16 Jun 2023 13:20:59 +0200 Subject: [PATCH] libpng: Enable intrinsics on x86/SSE2, ppc64/VSX, and all arm/NEON --- drivers/png/SCsub | 40 +- .../gdextension_build/SConstruct | 1 - .../gdextension_build/SConstruct | 1 - platform/ios/detect.py | 5 +- thirdparty/README.md | 4 +- .../libpng/intel/filter_sse2_intrinsics.c | 391 +++++++++ thirdparty/libpng/intel/intel_init.c | 52 ++ .../libpng/powerpc/filter_vsx_intrinsics.c | 768 ++++++++++++++++++ thirdparty/libpng/powerpc/powerpc_init.c | 126 +++ 9 files changed, 1361 insertions(+), 27 deletions(-) create mode 100644 thirdparty/libpng/intel/filter_sse2_intrinsics.c create mode 100644 thirdparty/libpng/intel/intel_init.c create mode 100644 thirdparty/libpng/powerpc/filter_vsx_intrinsics.c create mode 100644 thirdparty/libpng/powerpc/powerpc_init.c diff --git a/drivers/png/SCsub b/drivers/png/SCsub index fe8c8fa8cc8b..dd4777a19b4f 100644 --- a/drivers/png/SCsub +++ b/drivers/png/SCsub @@ -33,30 +33,30 @@ if env["builtin_libpng"]: # Needed for drivers includes and in platform/web. env.Prepend(CPPPATH=[thirdparty_dir]) - # Currently .ASM filter_neon.S does not compile on NT. - import os - - # Enable ARM NEON instructions on 32-bit Android to compile more optimized code. - use_neon = env["platform"] == "android" and env["arch"] == "arm32" and os.name != "nt" - if use_neon: - env_png.Append(CPPDEFINES=[("PNG_ARM_NEON_OPT", 2)]) - else: - env_png.Append(CPPDEFINES=[("PNG_ARM_NEON_OPT", 0)]) - env_thirdparty = env_png.Clone() env_thirdparty.disable_warnings() env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) - if use_neon: - env_neon = env_thirdparty.Clone() - if "S_compiler" in env: - env_neon["CC"] = env["S_compiler"] - neon_sources = [] - neon_sources.append(env_neon.Object(thirdparty_dir + "/arm/arm_init.c")) - neon_sources.append(env_neon.Object(thirdparty_dir + "/arm/filter_neon_intrinsics.c")) - neon_sources.append(env_neon.Object(thirdparty_dir + "/arm/filter_neon.S")) - neon_sources.append(env_neon.Object(thirdparty_dir + "/arm/palette_neon_intrinsics.c")) - thirdparty_obj += neon_sources + if env["arch"].startswith("arm"): + if env.msvc: # Can't compile assembly files with MSVC. 
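+            # PNG_ARM_NEON_OPT=0 disables the NEON code paths entirely,
+            # so these builds fall back to libpng's portable C filters.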
+ env_thirdparty.Append(CPPDEFINES=[("PNG_ARM_NEON_OPT"), 0]) + else: + env_neon = env_thirdparty.Clone() + if "S_compiler" in env: + env_neon["CC"] = env["S_compiler"] + neon_sources = [] + neon_sources.append(env_neon.Object(thirdparty_dir + "/arm/arm_init.c")) + neon_sources.append(env_neon.Object(thirdparty_dir + "/arm/filter_neon_intrinsics.c")) + neon_sources.append(env_neon.Object(thirdparty_dir + "/arm/filter_neon.S")) + neon_sources.append(env_neon.Object(thirdparty_dir + "/arm/palette_neon_intrinsics.c")) + thirdparty_obj += neon_sources + elif env["arch"].startswith("x86"): + env_thirdparty.Append(CPPDEFINES=["PNG_INTEL_SSE"]) + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_dir + "/intel/intel_init.c") + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_dir + "/intel/filter_sse2_intrinsics.c") + elif env["arch"] == "ppc64": + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_dir + "/powerpc/powerpc_init.c") + env_thirdparty.add_source_files(thirdparty_obj, thirdparty_dir + "/powerpc/filter_vsx_intrinsics.c") env.drivers_sources += thirdparty_obj diff --git a/modules/text_server_adv/gdextension_build/SConstruct b/modules/text_server_adv/gdextension_build/SConstruct index af9dae84e3fb..4b12a4e41d2f 100644 --- a/modules/text_server_adv/gdextension_build/SConstruct +++ b/modules/text_server_adv/gdextension_build/SConstruct @@ -262,7 +262,6 @@ if env["freetype_enabled"]: CPPDEFINES=[ "FT2_BUILD_LIBRARY", "FT_CONFIG_OPTION_USE_PNG", - ("PNG_ARM_NEON_OPT", 0), "FT_CONFIG_OPTION_SYSTEM_ZLIB", ] ) diff --git a/modules/text_server_fb/gdextension_build/SConstruct b/modules/text_server_fb/gdextension_build/SConstruct index 51a6ee06be9b..fcf897601977 100644 --- a/modules/text_server_fb/gdextension_build/SConstruct +++ b/modules/text_server_fb/gdextension_build/SConstruct @@ -257,7 +257,6 @@ if env["freetype_enabled"]: CPPDEFINES=[ "FT2_BUILD_LIBRARY", "FT_CONFIG_OPTION_USE_PNG", - ("PNG_ARM_NEON_OPT", 0), "FT_CONFIG_OPTION_SYSTEM_ZLIB", ] ) diff --git a/platform/ios/detect.py b/platform/ios/detect.py index f5501e3d85ff..a01f4b55db65 100644 --- a/platform/ios/detect.py +++ b/platform/ios/detect.py @@ -85,19 +85,18 @@ def configure(env: "Environment"): env["ENV"]["PATH"] = env["IOS_TOOLCHAIN_PATH"] + "/Developer/usr/bin/:" + env["ENV"]["PATH"] compiler_path = "$IOS_TOOLCHAIN_PATH/usr/bin/${ios_triple}" - s_compiler_path = "$IOS_TOOLCHAIN_PATH/Developer/usr/bin/" ccache_path = os.environ.get("CCACHE") if ccache_path is None: env["CC"] = compiler_path + "clang" env["CXX"] = compiler_path + "clang++" - env["S_compiler"] = s_compiler_path + "gcc" + env["S_compiler"] = compiler_path + "clang" else: # there aren't any ccache wrappers available for iOS, # to enable caching we need to prepend the path to the ccache binary env["CC"] = ccache_path + " " + compiler_path + "clang" env["CXX"] = ccache_path + " " + compiler_path + "clang++" - env["S_compiler"] = ccache_path + " " + s_compiler_path + "gcc" + env["S_compiler"] = ccache_path + " " + compiler_path + "clang" env["AR"] = compiler_path + "ar" env["RANLIB"] = compiler_path + "ranlib" diff --git a/thirdparty/README.md b/thirdparty/README.md index a918acbe77f9..3a98f676b472 100644 --- a/thirdparty/README.md +++ b/thirdparty/README.md @@ -315,14 +315,14 @@ Files extracted from upstream source: ## libpng - Upstream: http://libpng.org/pub/png/libpng.html -- Version: 1.6.38 (0a158f3506502dfa23edfc42790dfaed82efba17, 2022) +- Version: 1.6.39 (07b8803110da160b158ebfef872627da6c85cbdf, 2022) - License: libpng/zlib Files 
extracted from upstream source: - all .c and .h files of the main directory, except from `example.c` and `pngtest.c` -- the arm/ folder +- `arm/`, `intel/` and `powerpc/` folders - `scripts/pnglibconf.h.prebuilt` as `pnglibconf.h` - `LICENSE` diff --git a/thirdparty/libpng/intel/filter_sse2_intrinsics.c b/thirdparty/libpng/intel/filter_sse2_intrinsics.c new file mode 100644 index 000000000000..d3c0fe9e2d68 --- /dev/null +++ b/thirdparty/libpng/intel/filter_sse2_intrinsics.c @@ -0,0 +1,391 @@ + +/* filter_sse2_intrinsics.c - SSE2 optimized filter functions + * + * Copyright (c) 2018 Cosmin Truta + * Copyright (c) 2016-2017 Glenn Randers-Pehrson + * Written by Mike Klein and Matt Sarett + * Derived from arm/filter_neon_intrinsics.c + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ + +#include "../pngpriv.h" + +#ifdef PNG_READ_SUPPORTED + +#if PNG_INTEL_SSE_IMPLEMENTATION > 0 + +#include + +/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d). + * They're positioned like this: + * prev: c b + * row: a d + * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be + * whichever of a, b, or c is closest to p=a+b-c. + */ + +static __m128i load4(const void* p) { + int tmp; + memcpy(&tmp, p, sizeof(tmp)); + return _mm_cvtsi32_si128(tmp); +} + +static void store4(void* p, __m128i v) { + int tmp = _mm_cvtsi128_si32(v); + memcpy(p, &tmp, sizeof(int)); +} + +static __m128i load3(const void* p) { + png_uint_32 tmp = 0; + memcpy(&tmp, p, 3); + return _mm_cvtsi32_si128(tmp); +} + +static void store3(void* p, __m128i v) { + int tmp = _mm_cvtsi128_si32(v); + memcpy(p, &tmp, 3); +} + +void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row, + png_const_bytep prev) +{ + /* The Sub filter predicts each pixel as the previous pixel, a. + * There is no pixel to the left of the first pixel. It's encoded directly. + * That works with our main loop if we just say that left pixel was zero. + */ + size_t rb; + + __m128i a, d = _mm_setzero_si128(); + + png_debug(1, "in png_read_filter_row_sub3_sse2"); + + rb = row_info->rowbytes; + while (rb >= 4) { + a = d; d = load4(row); + d = _mm_add_epi8(d, a); + store3(row, d); + + row += 3; + rb -= 3; + } + if (rb > 0) { + a = d; d = load3(row); + d = _mm_add_epi8(d, a); + store3(row, d); + + row += 3; + rb -= 3; + } + PNG_UNUSED(prev) +} + +void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row, + png_const_bytep prev) +{ + /* The Sub filter predicts each pixel as the previous pixel, a. + * There is no pixel to the left of the first pixel. It's encoded directly. + * That works with our main loop if we just say that left pixel was zero. + */ + size_t rb; + + __m128i a, d = _mm_setzero_si128(); + + png_debug(1, "in png_read_filter_row_sub4_sse2"); + + rb = row_info->rowbytes+4; + while (rb > 4) { + a = d; d = load4(row); + d = _mm_add_epi8(d, a); + store4(row, d); + + row += 4; + rb -= 4; + } + PNG_UNUSED(prev) +} + +void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row, + png_const_bytep prev) +{ + /* The Avg filter predicts each pixel as the (truncated) average of a and b. + * There's no pixel to the left of the first pixel. Luckily, it's + * predicted to be half of the pixel above it. So again, this works + * perfectly with our loop if we make sure a starts at zero. 
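+    * (The loop below relies on the identity (a + b) >> 1 ==
+    *  _mm_avg_epu8(a, b) - ((a ^ b) & 1): _mm_avg_epu8 rounds up, and
+    *  (a ^ b) & 1 is 1 exactly when a + b is odd, i.e. when it rounded up.)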
+ */ + + size_t rb; + + const __m128i zero = _mm_setzero_si128(); + + __m128i b; + __m128i a, d = zero; + + png_debug(1, "in png_read_filter_row_avg3_sse2"); + rb = row_info->rowbytes; + while (rb >= 4) { + __m128i avg; + b = load4(prev); + a = d; d = load4(row ); + + /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ + avg = _mm_avg_epu8(a,b); + /* ...but we can fix it up by subtracting off 1 if it rounded up. */ + avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), + _mm_set1_epi8(1))); + d = _mm_add_epi8(d, avg); + store3(row, d); + + prev += 3; + row += 3; + rb -= 3; + } + if (rb > 0) { + __m128i avg; + b = load3(prev); + a = d; d = load3(row ); + + /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ + avg = _mm_avg_epu8(a,b); + /* ...but we can fix it up by subtracting off 1 if it rounded up. */ + avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), + _mm_set1_epi8(1))); + + d = _mm_add_epi8(d, avg); + store3(row, d); + + prev += 3; + row += 3; + rb -= 3; + } +} + +void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row, + png_const_bytep prev) +{ + /* The Avg filter predicts each pixel as the (truncated) average of a and b. + * There's no pixel to the left of the first pixel. Luckily, it's + * predicted to be half of the pixel above it. So again, this works + * perfectly with our loop if we make sure a starts at zero. + */ + size_t rb; + const __m128i zero = _mm_setzero_si128(); + __m128i b; + __m128i a, d = zero; + + png_debug(1, "in png_read_filter_row_avg4_sse2"); + + rb = row_info->rowbytes+4; + while (rb > 4) { + __m128i avg; + b = load4(prev); + a = d; d = load4(row ); + + /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */ + avg = _mm_avg_epu8(a,b); + /* ...but we can fix it up by subtracting off 1 if it rounded up. */ + avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), + _mm_set1_epi8(1))); + + d = _mm_add_epi8(d, avg); + store4(row, d); + + prev += 4; + row += 4; + rb -= 4; + } +} + +/* Returns |x| for 16-bit lanes. */ +static __m128i abs_i16(__m128i x) { +#if PNG_INTEL_SSE_IMPLEMENTATION >= 2 + return _mm_abs_epi16(x); +#else + /* Read this all as, return x<0 ? -x : x. + * To negate two's complement, you flip all the bits then add 1. + */ + __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128()); + + /* Flip negative lanes. */ + x = _mm_xor_si128(x, is_negative); + + /* +1 to negative lanes, else +0. */ + x = _mm_sub_epi16(x, is_negative); + return x; +#endif +} + +/* Bytewise c ? t : e. */ +static __m128i if_then_else(__m128i c, __m128i t, __m128i e) { +#if PNG_INTEL_SSE_IMPLEMENTATION >= 3 + return _mm_blendv_epi8(e,t,c); +#else + return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e)); +#endif +} + +void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row, + png_const_bytep prev) +{ + /* Paeth tries to predict pixel d using the pixel to the left of it, a, + * and two pixels from the previous row, b and c: + * prev: c b + * row: a d + * The Paeth function predicts d to be whichever of a, b, or c is nearest to + * p=a+b-c. + * + * The first pixel has no left context, and so uses an Up filter, p = b. + * This works naturally with our main loop's p = a+b-c if we force a and c + * to zero. + * Here we zero b and d, which become c and a respectively at the start of + * the loop. 
+ */ + size_t rb; + const __m128i zero = _mm_setzero_si128(); + __m128i c, b = zero, + a, d = zero; + + png_debug(1, "in png_read_filter_row_paeth3_sse2"); + + rb = row_info->rowbytes; + while (rb >= 4) { + /* It's easiest to do this math (particularly, deal with pc) with 16-bit + * intermediates. + */ + __m128i pa,pb,pc,smallest,nearest; + c = b; b = _mm_unpacklo_epi8(load4(prev), zero); + a = d; d = _mm_unpacklo_epi8(load4(row ), zero); + + /* (p-a) == (a+b-c - a) == (b-c) */ + + pa = _mm_sub_epi16(b,c); + + /* (p-b) == (a+b-c - b) == (a-c) */ + pb = _mm_sub_epi16(a,c); + + /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ + pc = _mm_add_epi16(pa,pb); + + pa = abs_i16(pa); /* |p-a| */ + pb = abs_i16(pb); /* |p-b| */ + pc = abs_i16(pc); /* |p-c| */ + + smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); + + /* Paeth breaks ties favoring a over b over c. */ + nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, + if_then_else(_mm_cmpeq_epi16(smallest, pb), b, + c)); + + /* Note `_epi8`: we need addition to wrap modulo 255. */ + d = _mm_add_epi8(d, nearest); + store3(row, _mm_packus_epi16(d,d)); + + prev += 3; + row += 3; + rb -= 3; + } + if (rb > 0) { + /* It's easiest to do this math (particularly, deal with pc) with 16-bit + * intermediates. + */ + __m128i pa,pb,pc,smallest,nearest; + c = b; b = _mm_unpacklo_epi8(load3(prev), zero); + a = d; d = _mm_unpacklo_epi8(load3(row ), zero); + + /* (p-a) == (a+b-c - a) == (b-c) */ + pa = _mm_sub_epi16(b,c); + + /* (p-b) == (a+b-c - b) == (a-c) */ + pb = _mm_sub_epi16(a,c); + + /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ + pc = _mm_add_epi16(pa,pb); + + pa = abs_i16(pa); /* |p-a| */ + pb = abs_i16(pb); /* |p-b| */ + pc = abs_i16(pc); /* |p-c| */ + + smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); + + /* Paeth breaks ties favoring a over b over c. */ + nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, + if_then_else(_mm_cmpeq_epi16(smallest, pb), b, + c)); + + /* Note `_epi8`: we need addition to wrap modulo 255. */ + d = _mm_add_epi8(d, nearest); + store3(row, _mm_packus_epi16(d,d)); + + prev += 3; + row += 3; + rb -= 3; + } +} + +void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row, + png_const_bytep prev) +{ + /* Paeth tries to predict pixel d using the pixel to the left of it, a, + * and two pixels from the previous row, b and c: + * prev: c b + * row: a d + * The Paeth function predicts d to be whichever of a, b, or c is nearest to + * p=a+b-c. + * + * The first pixel has no left context, and so uses an Up filter, p = b. + * This works naturally with our main loop's p = a+b-c if we force a and c + * to zero. + * Here we zero b and d, which become c and a respectively at the start of + * the loop. + */ + size_t rb; + const __m128i zero = _mm_setzero_si128(); + __m128i pa,pb,pc,smallest,nearest; + __m128i c, b = zero, + a, d = zero; + + png_debug(1, "in png_read_filter_row_paeth4_sse2"); + + rb = row_info->rowbytes+4; + while (rb > 4) { + /* It's easiest to do this math (particularly, deal with pc) with 16-bit + * intermediates. 
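+       * (Differences of bytes such as b - c range over -255..255, so they
+       *  need signed 16-bit lanes; _mm_packus_epi16 packs the result back
+       *  down to bytes at the end.)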
+ */ + c = b; b = _mm_unpacklo_epi8(load4(prev), zero); + a = d; d = _mm_unpacklo_epi8(load4(row ), zero); + + /* (p-a) == (a+b-c - a) == (b-c) */ + pa = _mm_sub_epi16(b,c); + + /* (p-b) == (a+b-c - b) == (a-c) */ + pb = _mm_sub_epi16(a,c); + + /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ + pc = _mm_add_epi16(pa,pb); + + pa = abs_i16(pa); /* |p-a| */ + pb = abs_i16(pb); /* |p-b| */ + pc = abs_i16(pc); /* |p-c| */ + + smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); + + /* Paeth breaks ties favoring a over b over c. */ + nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, + if_then_else(_mm_cmpeq_epi16(smallest, pb), b, + c)); + + /* Note `_epi8`: we need addition to wrap modulo 255. */ + d = _mm_add_epi8(d, nearest); + store4(row, _mm_packus_epi16(d,d)); + + prev += 4; + row += 4; + rb -= 4; + } +} + +#endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */ +#endif /* READ */ diff --git a/thirdparty/libpng/intel/intel_init.c b/thirdparty/libpng/intel/intel_init.c new file mode 100644 index 000000000000..2f8168b7c483 --- /dev/null +++ b/thirdparty/libpng/intel/intel_init.c @@ -0,0 +1,52 @@ + +/* intel_init.c - SSE2 optimized filter functions + * + * Copyright (c) 2018 Cosmin Truta + * Copyright (c) 2016-2017 Glenn Randers-Pehrson + * Written by Mike Klein and Matt Sarett, Google, Inc. + * Derived from arm/arm_init.c + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ + +#include "../pngpriv.h" + +#ifdef PNG_READ_SUPPORTED +#if PNG_INTEL_SSE_IMPLEMENTATION > 0 + +void +png_init_filter_functions_sse2(png_structp pp, unsigned int bpp) +{ + /* The techniques used to implement each of these filters in SSE operate on + * one pixel at a time. + * So they generally speed up 3bpp images about 3x, 4bpp images about 4x. + * They can scale up to 6 and 8 bpp images and down to 2 bpp images, + * but they'd not likely have any benefit for 1bpp images. + * Most of these can be implemented using only MMX and 64-bit registers, + * but they end up a bit slower than using the equally-ubiquitous SSE2. + */ + png_debug(1, "in png_init_filter_functions_sse2"); + if (bpp == 3) + { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_sse2; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_sse2; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = + png_read_filter_row_paeth3_sse2; + } + else if (bpp == 4) + { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_sse2; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_sse2; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = + png_read_filter_row_paeth4_sse2; + } + + /* No need optimize PNG_FILTER_VALUE_UP. The compiler should + * autovectorize. + */ +} + +#endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */ +#endif /* PNG_READ_SUPPORTED */ diff --git a/thirdparty/libpng/powerpc/filter_vsx_intrinsics.c b/thirdparty/libpng/powerpc/filter_vsx_intrinsics.c new file mode 100644 index 000000000000..01cf8800dc47 --- /dev/null +++ b/thirdparty/libpng/powerpc/filter_vsx_intrinsics.c @@ -0,0 +1,768 @@ +/* filter_vsx_intrinsics.c - PowerPC optimised filter functions + * + * Copyright (c) 2018 Cosmin Truta + * Copyright (c) 2017 Glenn Randers-Pehrson + * Written by Vadim Barkov, 2017. + * + * This code is released under the libpng license. 
+ * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ + +#include +#include +#include "../pngpriv.h" + +#ifdef PNG_READ_SUPPORTED + +/* This code requires -maltivec and -mvsx on the command line: */ +#if PNG_POWERPC_VSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */ + +#include + +#if PNG_POWERPC_VSX_OPT > 0 + +#ifndef __VSX__ +# error "This code requires VSX support (POWER7 and later). Please provide -mvsx compiler flag." +#endif + +#define vec_ld_unaligned(vec,data) vec = vec_vsx_ld(0,data) +#define vec_st_unaligned(vec,data) vec_vsx_st(vec,0,data) + + +/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d). + * They're positioned like this: + * prev: c b + * row: a d + * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be + * whichever of a, b, or c is closest to p=a+b-c. + * ( this is taken from ../intel/filter_sse2_intrinsics.c ) + */ + +#define vsx_declare_common_vars(row_info,row,prev_row,offset) \ + png_byte i;\ + png_bytep rp = row + offset;\ + png_const_bytep pp = prev_row;\ + size_t unaligned_top = 16 - (((size_t)rp % 16));\ + size_t istop;\ + if(unaligned_top == 16)\ + unaligned_top = 0;\ + istop = row_info->rowbytes;\ + if((unaligned_top < istop))\ + istop -= unaligned_top;\ + else{\ + unaligned_top = istop;\ + istop = 0;\ + } + +void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vsx_declare_common_vars(row_info,row,prev_row,0) + + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. + */ + for (i = 0; i < unaligned_top; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); + rp++; + } + + /* Using SIMD while we can */ + while( istop >= 16 ) + { + rp_vec = vec_ld(0,rp); + vec_ld_unaligned(pp_vec,pp); + + rp_vec = vec_add(rp_vec,pp_vec); + + vec_st(rp_vec,0,rp); + + pp += 16; + rp += 16; + istop -= 16; + } + + if(istop > 0) + { + /* If byte count of row is not divisible by 16 + * we will process remaining part as usual + */ + for (i = 0; i < istop; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); + rp++; + } +} + +} + +static const vector unsigned char VSX_LEFTSHIFTED1_4 = {16,16,16,16, 0, 1, 2, 3,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED2_4 = {16,16,16,16,16,16,16,16, 4, 5, 6, 7,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 8, 9,10,11}; + +static const vector unsigned char VSX_LEFTSHIFTED1_3 = {16,16,16, 0, 1, 2,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED2_3 = {16,16,16,16,16,16, 3, 4, 5,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 6, 7, 8,16,16,16,16}; +static const vector unsigned char VSX_LEFTSHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 9,10,11,16}; + +static const vector unsigned char VSX_NOT_SHIFTED1_4 = {16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED2_4 = {16,16,16,16,16,16,16,16, 8, 9,10,11,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,15}; + +static const vector unsigned char VSX_NOT_SHIFTED1_3 = {16,16,16, 3, 4, 5,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED2_3 = {16,16,16,16,16,16, 6, 7, 
8,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16}; +static const vector unsigned char VSX_NOT_SHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16}; + +static const vector unsigned char VSX_CHAR_ZERO = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; +#ifdef __LITTLE_ENDIAN__ + +static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = { 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = { 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {12,16,13,16,14,16,15,16,16,16,16,16,16,16,16,16}; + +static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 0, 2, 4, 6,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 0, 2, 4, 6,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4, 6}; + +static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = { 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = { 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = { 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {12,16,13,16,14,16,16,16,16,16,16,16,16,16,16,16}; + +static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 0, 2, 4,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 0, 2, 4,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 0, 2, 4,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4,16}; + +#elif defined(__BIG_ENDIAN__) + +static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = {16, 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = {16, 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {16,12,16,13,16,14,16,15,16,16,16,16,16,16,16,16}; + +static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 1, 3, 5, 7,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 1, 3, 5, 7,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5, 7}; + +static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = {16, 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = {16, 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = {16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {16,12,16,13,16,14,16,16,16,16,16,16,16,16,16,16}; + +static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 1, 3, 5,16,16,16,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 1, 3, 5,16,16,16,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 1, 3, 5,16,16,16,16}; +static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5,16}; + +#endif + +#define vsx_char_to_short(vec,offset,bpp) (vector unsigned 
short)vec_perm((vec),VSX_CHAR_ZERO,VSX_CHAR_TO_SHORT##offset##_##bpp) +#define vsx_short_to_char(vec,offset,bpp) vec_perm(((vector unsigned char)(vec)),VSX_CHAR_ZERO,VSX_SHORT_TO_CHAR##offset##_##bpp) + +#ifdef PNG_USE_ABS +# define vsx_abs(number) abs(number) +#else +# define vsx_abs(number) (number > 0) ? (number) : -(number) +#endif + +void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_byte bpp = 4; + + vector unsigned char rp_vec; + vector unsigned char part_vec; + + vsx_declare_common_vars(row_info,row,prev_row,bpp) + + PNG_UNUSED(pp) + + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. + */ + for (i = 0; i < unaligned_top; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } + + /* Using SIMD while we can */ + while( istop >= 16 ) + { + for(i=0;i < bpp ; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } + rp -= bpp; + + rp_vec = vec_ld(0,rp); + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4); + rp_vec = vec_add(rp_vec,part_vec); + + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4); + rp_vec = vec_add(rp_vec,part_vec); + + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4); + rp_vec = vec_add(rp_vec,part_vec); + + vec_st(rp_vec,0,rp); + + rp += 16; + istop -= 16; + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp - bpp))) & 0xff); + rp++; + } + +} + +void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_byte bpp = 3; + + vector unsigned char rp_vec; + vector unsigned char part_vec; + + vsx_declare_common_vars(row_info,row,prev_row,bpp) + + PNG_UNUSED(pp) + + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. 
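+    * (vec_ld ignores the low four bits of its address, so the scalar loop
+    *  below first advances rp to a 16-byte boundary.)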
+ */ + for (i = 0; i < unaligned_top; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } + + /* Using SIMD while we can */ + while( istop >= 16 ) + { + for(i=0;i < bpp ; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } + rp -= bpp; + + rp_vec = vec_ld(0,rp); + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3); + rp_vec = vec_add(rp_vec,part_vec); + + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3); + rp_vec = vec_add(rp_vec,part_vec); + + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3); + rp_vec = vec_add(rp_vec,part_vec); + + part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3); + rp_vec = vec_add(rp_vec,part_vec); + + vec_st(rp_vec,0,rp); + rp += 15; + istop -= 16; + + /* Since 16 % bpp = 16 % 3 = 1, last element of array must + * be proceeded manually + */ + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff); + rp++; + } +} + +void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_byte bpp = 4; + + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vector unsigned char pp_part_vec; + vector unsigned char rp_part_vec; + vector unsigned char avg_vec; + + vsx_declare_common_vars(row_info,row,prev_row,bpp) + rp -= bpp; + if(istop >= bpp) + istop -= bpp; + + for (i = 0; i < bpp; i++) + { + *rp = (png_byte)(((int)(*rp) + + ((int)(*pp++) / 2 )) & 0xff); + + rp++; + } + + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. + */ + for (i = 0; i < unaligned_top; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } + + /* Using SIMD while we can */ + while( istop >= 16 ) + { + for(i=0;i < bpp ; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } + rp -= bpp; + pp -= bpp; + + vec_ld_unaligned(pp_vec,pp); + rp_vec = vec_ld(0,rp); + + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_4); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_4); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_4); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + vec_st(rp_vec,0,rp); + + rp += 16; + pp += 16; + istop -= 16; + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } +} + +void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_byte bpp = 3; + + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vector unsigned char pp_part_vec; + vector unsigned char rp_part_vec; + vector unsigned char 
avg_vec; + + vsx_declare_common_vars(row_info,row,prev_row,bpp) + rp -= bpp; + if(istop >= bpp) + istop -= bpp; + + for (i = 0; i < bpp; i++) + { + *rp = (png_byte)(((int)(*rp) + + ((int)(*pp++) / 2 )) & 0xff); + + rp++; + } + + /* Altivec operations require 16-byte aligned data + * but input can be unaligned. So we calculate + * unaligned part as usual. + */ + for (i = 0; i < unaligned_top; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } + + /* Using SIMD while we can */ + while( istop >= 16 ) + { + for(i=0;i < bpp ; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } + rp -= bpp; + pp -= bpp; + + vec_ld_unaligned(pp_vec,pp); + rp_vec = vec_ld(0,rp); + + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_3); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_3); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_3); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3); + pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED4_3); + avg_vec = vec_avg(rp_part_vec,pp_part_vec); + avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1))); + rp_vec = vec_add(rp_vec,avg_vec); + + vec_st(rp_vec,0,rp); + + rp += 15; + pp += 15; + istop -= 16; + + /* Since 16 % bpp = 16 % 3 = 1, last element of array must + * be proceeded manually + */ + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + rp++; + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + *rp = (png_byte)(((int)(*rp) + + (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff); + + rp++; + } +} + +/* Bytewise c ? t : e. */ +#define if_then_else(c,t,e) vec_sel(e,t,c) + +#define vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) {\ + c = *(pp - bpp);\ + a = *(rp - bpp);\ + b = *pp++;\ + p = b - c;\ + pc = a - c;\ + pa = vsx_abs(p);\ + pb = vsx_abs(pc);\ + pc = vsx_abs(p + pc);\ + if (pb < pa) pa = pb, a = b;\ + if (pc < pa) a = c;\ + a += *rp;\ + *rp++ = (png_byte)a;\ + } + +void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_byte bpp = 4; + + int a, b, c, pa, pb, pc, p; + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vector unsigned short a_vec,b_vec,c_vec,nearest_vec; + vector signed short pa_vec,pb_vec,pc_vec,smallest_vec; + + vsx_declare_common_vars(row_info,row,prev_row,bpp) + rp -= bpp; + if(istop >= bpp) + istop -= bpp; + + /* Process the first pixel in the row completely (this is the same as 'up' + * because there is only one candidate predictor for the first row). 
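+    * (With no a or c available they are taken as zero, so p = a + b - c
+    *  reduces to b, i.e. the Up filter.)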
+ */ + for(i = 0; i < bpp ; i++) + { + *rp = (png_byte)( *rp + *pp); + rp++; + pp++; + } + + for(i = 0; i < unaligned_top ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + while( istop >= 16) + { + for(i = 0; i < bpp ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + rp -= bpp; + pp -= bpp; + rp_vec = vec_ld(0,rp); + vec_ld_unaligned(pp_vec,pp); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_4),1,4); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,4))); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_4),2,4); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,4))); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_4),3,4); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,4))); + + vec_st(rp_vec,0,rp); + + rp += 16; + pp += 16; + istop -= 16; + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } +} + +void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row, + png_const_bytep prev_row) +{ + png_byte bpp = 3; + + int a, b, c, pa, pb, pc, p; + vector unsigned char rp_vec; + vector unsigned char pp_vec; + vector unsigned short a_vec,b_vec,c_vec,nearest_vec; + vector signed short pa_vec,pb_vec,pc_vec,smallest_vec; + + vsx_declare_common_vars(row_info,row,prev_row,bpp) + rp -= bpp; + if(istop >= bpp) + istop -= bpp; + + /* Process the first pixel in the row completely (this is the same as 'up' + * because there is only one candidate predictor for the first row). 
+ */ + for(i = 0; i < bpp ; i++) + { + *rp = (png_byte)( *rp + *pp); + rp++; + pp++; + } + + for(i = 0; i < unaligned_top ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + while( istop >= 16) + { + for(i = 0; i < bpp ; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + rp -= bpp; + pp -= bpp; + rp_vec = vec_ld(0,rp); + vec_ld_unaligned(pp_vec,pp); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_3),1,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,3))); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_3),2,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,3))); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_3),3,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,3))); + + a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3); + b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED4_3),4,3); + c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3); + pa_vec = (vector signed short) vec_sub(b_vec,c_vec); + pb_vec = (vector signed short) vec_sub(a_vec , c_vec); + pc_vec = vec_add(pa_vec,pb_vec); + pa_vec = vec_abs(pa_vec); + pb_vec = vec_abs(pb_vec); + pc_vec = vec_abs(pc_vec); + smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec)); + nearest_vec = if_then_else( + vec_cmpeq(pa_vec,smallest_vec), + a_vec, + if_then_else( + vec_cmpeq(pb_vec,smallest_vec), + b_vec, + c_vec + ) + ); + rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,4,3))); + + vec_st(rp_vec,0,rp); + + rp += 15; + pp += 15; + istop -= 16; + + /* Since 16 % bpp = 16 % 3 = 1, 
last element of array must + * be proceeded manually + */ + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } + + if(istop > 0) + for (i = 0; i < istop % 16; i++) + { + vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) + } +} + +#endif /* PNG_POWERPC_VSX_OPT > 0 */ +#endif /* PNG_POWERPC_VSX_IMPLEMENTATION == 1 (intrinsics) */ +#endif /* READ */ diff --git a/thirdparty/libpng/powerpc/powerpc_init.c b/thirdparty/libpng/powerpc/powerpc_init.c new file mode 100644 index 000000000000..54426c558e55 --- /dev/null +++ b/thirdparty/libpng/powerpc/powerpc_init.c @@ -0,0 +1,126 @@ + +/* powerpc_init.c - POWERPC optimised filter functions + * + * Copyright (c) 2018 Cosmin Truta + * Copyright (c) 2017 Glenn Randers-Pehrson + * Written by Vadim Barkov, 2017. + * + * This code is released under the libpng license. + * For conditions of distribution and use, see the disclaimer + * and license in png.h + */ + +/* Below, after checking __linux__, various non-C90 POSIX 1003.1 functions are + * called. + */ +#define _POSIX_SOURCE 1 + +#include +#include "../pngpriv.h" + +#ifdef PNG_READ_SUPPORTED + +#if PNG_POWERPC_VSX_OPT > 0 +#ifdef PNG_POWERPC_VSX_CHECK_SUPPORTED /* Do run-time checks */ +/* WARNING: it is strongly recommended that you do not build libpng with + * run-time checks for CPU features if at all possible. In the case of the PowerPC + * VSX instructions there is no processor-specific way of detecting the + * presence of the required support, therefore run-time detection is extremely + * OS specific. + * + * You may set the macro PNG_POWERPC_VSX_FILE to the file name of file containing + * a fragment of C source code which defines the png_have_vsx function. There + * are a number of implementations in contrib/powerpc-vsx, but the only one that + * has partial support is contrib/powerpc-vsx/linux.c - a generic Linux + * implementation which reads /proc/cpufino. + */ +#ifndef PNG_POWERPC_VSX_FILE +# ifdef __linux__ +# define PNG_POWERPC_VSX_FILE "contrib/powerpc-vsx/linux_aux.c" +# endif +#endif + +#ifdef PNG_POWERPC_VSX_FILE + +#include /* for sig_atomic_t */ +static int png_have_vsx(png_structp png_ptr); +#include PNG_POWERPC_VSX_FILE + +#else /* PNG_POWERPC_VSX_FILE */ +# error "PNG_POWERPC_VSX_FILE undefined: no support for run-time POWERPC VSX checks" +#endif /* PNG_POWERPC_VSX_FILE */ +#endif /* PNG_POWERPC_VSX_CHECK_SUPPORTED */ + +void +png_init_filter_functions_vsx(png_structp pp, unsigned int bpp) +{ + /* The switch statement is compiled in for POWERPC_VSX_API, the call to + * png_have_vsx is compiled in for POWERPC_VSX_CHECK. If both are defined + * the check is only performed if the API has not set the PowerPC option on + * or off explicitly. In this case the check controls what happens. + */ + +#ifdef PNG_POWERPC_VSX_API_SUPPORTED + switch ((pp->options >> PNG_POWERPC_VSX) & 3) + { + case PNG_OPTION_UNSET: + /* Allow the run-time check to execute if it has been enabled - + * thus both API and CHECK can be turned on. If it isn't supported + * this case will fall through to the 'default' below, which just + * returns. 
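+          * (png_set_option() stores each option in two bits of pp->options,
+          *  hence the shift by PNG_POWERPC_VSX and the mask with 3 above.)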
+ */ +#endif /* PNG_POWERPC_VSX_API_SUPPORTED */ +#ifdef PNG_POWERPC_VSX_CHECK_SUPPORTED + { + static volatile sig_atomic_t no_vsx = -1; /* not checked */ + + if (no_vsx < 0) + no_vsx = !png_have_vsx(pp); + + if (no_vsx) + return; + } +#ifdef PNG_POWERPC_VSX_API_SUPPORTED + break; +#endif +#endif /* PNG_POWERPC_VSX_CHECK_SUPPORTED */ + +#ifdef PNG_POWERPC_VSX_API_SUPPORTED + default: /* OFF or INVALID */ + return; + + case PNG_OPTION_ON: + /* Option turned on */ + break; + } +#endif + + /* IMPORTANT: any new internal functions used here must be declared using + * PNG_INTERNAL_FUNCTION in ../pngpriv.h. This is required so that the + * 'prefix' option to configure works: + * + * ./configure --with-libpng-prefix=foobar_ + * + * Verify you have got this right by running the above command, doing a build + * and examining pngprefix.h; it must contain a #define for every external + * function you add. (Notice that this happens automatically for the + * initialization function.) + */ + pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_vsx; + + if (bpp == 3) + { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_vsx; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_vsx; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth3_vsx; + } + + else if (bpp == 4) + { + pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub4_vsx; + pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_vsx; + pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = png_read_filter_row_paeth4_vsx; + } +} +#endif /* PNG_POWERPC_VSX_OPT > 0 */ +#endif /* READ */
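
For reviewers comparing the SIMD paths above against the specification, the scalar form of the same PNG read (reconstruction) filters is sketched below. This is an illustrative reference only, not part of the patch or of libpng; `unfilter_row` and `paeth` are hypothetical names, and the filter codes 1-4 correspond to PNG_FILTER_VALUE_SUB, UP, AVG and PAETH.

#include <stddef.h>
#include <stdlib.h>

/* Scalar reference for the PNG reconstruction filters. For each byte,
 * a is the byte bpp positions to the left, b the byte above, and c the
 * byte above-left; missing neighbours are treated as zero. */
static unsigned char paeth(unsigned char a, unsigned char b, unsigned char c)
{
   int p  = (int)a + (int)b - (int)c;
   int pa = abs(p - (int)a), pb = abs(p - (int)b), pc = abs(p - (int)c);
   if (pa <= pb && pa <= pc) return a;  /* ties favour a, then b, then c */
   if (pb <= pc) return b;
   return c;
}

static void unfilter_row(int filter, unsigned char *row,
    const unsigned char *prev, size_t rowbytes, size_t bpp)
{
   size_t i;
   for (i = 0; i < rowbytes; i++)
   {
      unsigned char a = (i >= bpp) ? row[i - bpp] : 0;
      unsigned char b = (prev != NULL) ? prev[i] : 0;
      unsigned char c = (prev != NULL && i >= bpp) ? prev[i - bpp] : 0;
      switch (filter)
      {
         case 1: row[i] = (unsigned char)(row[i] + a);              break; /* Sub */
         case 2: row[i] = (unsigned char)(row[i] + b);              break; /* Up */
         case 3: row[i] = (unsigned char)(row[i] + (a + b) / 2);    break; /* Avg (truncating) */
         case 4: row[i] = (unsigned char)(row[i] + paeth(a, b, c)); break; /* Paeth */
      }
   }
}

The SIMD variants added by this patch must produce byte-for-byte identical output to this scalar form; libpng's own C fallbacks in pngrutil.c remain the authoritative implementation.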