Update libwebp to 0.6.1

* lossless performance and compression improvements + a new 'cruncher' mode (-m 6 -q 100)
* ARM performance improvements with clang (15-20% w/ndk r15c)
* webp-js: emscripten/webassembly based javascript decoder
* miscellaneous bug & build fixes
This commit is contained in:
volzhs 2017-12-12 02:11:11 +09:00
parent 64d104756c
commit 043103fe6a
162 changed files with 7580 additions and 6214 deletions

View file

@ -26,9 +26,6 @@ if env['builtin_libwebp']:
"dsp/alpha_processing_neon.c",
"dsp/alpha_processing_sse2.c",
"dsp/alpha_processing_sse41.c",
"dsp/argb.c",
"dsp/argb_mips_dsp_r2.c",
"dsp/argb_sse2.c",
"dsp/cost.c",
"dsp/cost_mips32.c",
"dsp/cost_mips_dsp_r2.c",
@ -36,6 +33,9 @@ if env['builtin_libwebp']:
"dsp/cpu.c",
"dsp/dec.c",
"dsp/dec_clip_tables.c",
"dsp/ssim.c",
"dsp/ssim_sse2.c",
"dsp/yuv_neon.c",
"dsp/dec_mips32.c",
"dsp/dec_mips_dsp_r2.c",
"dsp/dec_msa.c",
@ -84,6 +84,7 @@ if env['builtin_libwebp']:
"dsp/yuv_sse2.c",
"enc/alpha_enc.c",
"enc/analysis_enc.c",
"enc/backward_references_cost_enc.c",
"enc/backward_references_enc.c",
"enc/config_enc.c",
"enc/cost_enc.c",
@ -122,10 +123,10 @@ if env['builtin_libwebp']:
"utils/thread_utils.c",
"utils/utils.c",
]
thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources]
thirdparty_sources = [thirdparty_dir + "src/" + file for file in thirdparty_sources]
env_webp.add_source_files(env.modules_sources, thirdparty_sources)
env_webp.Append(CPPPATH=[thirdparty_dir])
env_webp.Append(CPPPATH=[thirdparty_dir, thirdparty_dir + "src/"])
# Godot source files
env_webp.add_source_files(env.modules_sources, "*.cpp")

View file

@ -183,7 +183,7 @@ TODO.
## libwebp
- Upstream: https://chromium.googlesource.com/webm/libwebp/
- Version: 0.6.0
- Version: 0.6.1
- License: BSD-3-Clause
Files extracted from upstream source:

View file

@ -1,68 +0,0 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARGB making functions.
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
}
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, uint32_t* out) {
int i;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
}
}
static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out) {
int i, offset = 0;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
offset += step;
}
}
void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
const uint8_t*, int, uint32_t*);
void (*VP8PackRGB)(const uint8_t*, const uint8_t*, const uint8_t*,
int, int, uint32_t*);
extern void VP8EncDspARGBInitMIPSdspR2(void);
extern void VP8EncDspARGBInitSSE2(void);
static volatile VP8CPUInfo argb_last_cpuinfo_used =
(VP8CPUInfo)&argb_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInit(void) {
if (argb_last_cpuinfo_used == VP8GetCPUInfo) return;
VP8PackARGB = PackARGB;
VP8PackRGB = PackRGB;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8EncDspARGBInitSSE2();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8EncDspARGBInitMIPSdspR2();
}
#endif
}
argb_last_cpuinfo_used = VP8GetCPUInfo;
}

View file

@ -1,110 +0,0 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARGB making functions (mips version).
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, uint32_t* out) {
int temp0, temp1, temp2, temp3, offset;
const int rest = len & 1;
const uint32_t* const loop_end = out + len - rest;
const int step = 4;
__asm__ volatile (
"xor %[offset], %[offset], %[offset] \n\t"
"beq %[loop_end], %[out], 0f \n\t"
"2: \n\t"
"lbux %[temp0], %[offset](%[a]) \n\t"
"lbux %[temp1], %[offset](%[r]) \n\t"
"lbux %[temp2], %[offset](%[g]) \n\t"
"lbux %[temp3], %[offset](%[b]) \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"ins %[temp3], %[temp2], 16, 16 \n\t"
"addiu %[out], %[out], 4 \n\t"
"precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
"sw %[temp0], -4(%[out]) \n\t"
"addu %[offset], %[offset], %[step] \n\t"
"bne %[loop_end], %[out], 2b \n\t"
"0: \n\t"
"beq %[rest], $zero, 1f \n\t"
"lbux %[temp0], %[offset](%[a]) \n\t"
"lbux %[temp1], %[offset](%[r]) \n\t"
"lbux %[temp2], %[offset](%[g]) \n\t"
"lbux %[temp3], %[offset](%[b]) \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"ins %[temp3], %[temp2], 16, 16 \n\t"
"precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
"sw %[temp0], 0(%[out]) \n\t"
"1: \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
: [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
[loop_end]"r"(loop_end), [rest]"r"(rest)
: "memory"
);
}
static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out) {
int temp0, temp1, temp2, offset;
const int rest = len & 1;
const int a = 0xff;
const uint32_t* const loop_end = out + len - rest;
__asm__ volatile (
"xor %[offset], %[offset], %[offset] \n\t"
"beq %[loop_end], %[out], 0f \n\t"
"2: \n\t"
"lbux %[temp0], %[offset](%[r]) \n\t"
"lbux %[temp1], %[offset](%[g]) \n\t"
"lbux %[temp2], %[offset](%[b]) \n\t"
"ins %[temp0], %[a], 16, 16 \n\t"
"ins %[temp2], %[temp1], 16, 16 \n\t"
"addiu %[out], %[out], 4 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
"sw %[temp0], -4(%[out]) \n\t"
"addu %[offset], %[offset], %[step] \n\t"
"bne %[loop_end], %[out], 2b \n\t"
"0: \n\t"
"beq %[rest], $zero, 1f \n\t"
"lbux %[temp0], %[offset](%[r]) \n\t"
"lbux %[temp1], %[offset](%[g]) \n\t"
"lbux %[temp2], %[offset](%[b]) \n\t"
"ins %[temp0], %[a], 16, 16 \n\t"
"ins %[temp2], %[temp1], 16, 16 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
"sw %[temp0], 0(%[out]) \n\t"
"1: \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[offset]"=&r"(offset), [out]"+&r"(out)
: [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
[loop_end]"r"(loop_end), [rest]"r"(rest)
: "memory"
);
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspARGBInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitMIPSdspR2(void) {
VP8PackARGB = PackARGB;
VP8PackRGB = PackRGB;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(VP8EncDspARGBInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

View file

@ -1,67 +0,0 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARGB making functions (SSE2 version).
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include <string.h>
static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
}
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, uint32_t* out) {
if (g == r + 1) { // RGBA input order. Need to swap R and B.
int i = 0;
const int len_max = len & ~3; // max length processed in main loop
const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
assert(b == r + 2);
assert(a == r + 3);
for (; i < len_max; i += 4) {
const __m128i A = _mm_loadu_si128((const __m128i*)(r + 4 * i));
const __m128i B = _mm_and_si128(A, red_blue_mask); // R 0 B 0
const __m128i C = _mm_andnot_si128(red_blue_mask, A); // 0 G 0 A
const __m128i D = _mm_shufflelo_epi16(B, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i E = _mm_shufflehi_epi16(D, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i F = _mm_or_si128(E, C);
_mm_storeu_si128((__m128i*)(out + i), F);
}
for (; i < len; ++i) {
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
}
} else {
assert(g == b + 1);
assert(r == b + 2);
assert(a == b + 3);
memcpy(out, b, len * 4);
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspARGBInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitSSE2(void) {
VP8PackARGB = PackARGB;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8EncDspARGBInitSSE2)
#endif // WEBP_USE_SSE2

File diff suppressed because it is too large Load diff

View file

@ -12,13 +12,13 @@
// Author: Skal (pascal.massimino@gmail.com)
#include <stdlib.h>
#include "./alphai_dec.h"
#include "./vp8i_dec.h"
#include "./vp8li_dec.h"
#include "../dsp/dsp.h"
#include "../utils/quant_levels_dec_utils.h"
#include "../utils/utils.h"
#include "../webp/format_constants.h"
#include "src/dec/alphai_dec.h"
#include "src/dec/vp8i_dec.h"
#include "src/dec/vp8li_dec.h"
#include "src/dsp/dsp.h"
#include "src/utils/quant_levels_dec_utils.h"
#include "src/utils/utils.h"
#include "src/webp/format_constants.h"
//------------------------------------------------------------------------------
// ALPHDecoder object.

View file

@ -11,11 +11,11 @@
//
// Author: Urvang (urvang@google.com)
#ifndef WEBP_DEC_ALPHAI_H_
#define WEBP_DEC_ALPHAI_H_
#ifndef WEBP_DEC_ALPHAI_DEC_H_
#define WEBP_DEC_ALPHAI_DEC_H_
#include "./webpi_dec.h"
#include "../utils/filters_utils.h"
#include "src/dec/webpi_dec.h"
#include "src/utils/filters_utils.h"
#ifdef __cplusplus
extern "C" {
@ -51,4 +51,4 @@ void WebPDeallocateAlphaMemory(VP8Decoder* const dec);
} // extern "C"
#endif
#endif /* WEBP_DEC_ALPHAI_H_ */
#endif /* WEBP_DEC_ALPHAI_DEC_H_ */

View file

@ -13,15 +13,15 @@
#include <stdlib.h>
#include "./vp8i_dec.h"
#include "./webpi_dec.h"
#include "../utils/utils.h"
#include "src/dec/vp8i_dec.h"
#include "src/dec/webpi_dec.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------
// WebPDecBuffer
// Number of bytes per pixel for the different color-spaces.
static const int kModeBpp[MODE_LAST] = {
static const uint8_t kModeBpp[MODE_LAST] = {
3, 4, 3, 4, 4, 2, 2,
4, 4, 4, 2, // pre-multiplied modes
1, 1 };
@ -36,7 +36,7 @@ static int IsValidColorspace(int webp_csp_mode) {
// strictly speaking, the very last (or first, if flipped) row
// doesn't require padding.
#define MIN_BUFFER_SIZE(WIDTH, HEIGHT, STRIDE) \
(uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH)
((uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH))
static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
int ok = 1;
@ -98,9 +98,14 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
uint64_t uv_size = 0, a_size = 0, total_size;
// We need memory and it hasn't been allocated yet.
// => initialize output buffer, now that dimensions are known.
const int stride = w * kModeBpp[mode];
const uint64_t size = (uint64_t)stride * h;
int stride;
uint64_t size;
if ((uint64_t)w * kModeBpp[mode] >= (1ull << 32)) {
return VP8_STATUS_INVALID_PARAM;
}
stride = w * kModeBpp[mode];
size = (uint64_t)stride * h;
if (!WebPIsRGBMode(mode)) {
uv_stride = (w + 1) / 2;
uv_size = (uint64_t)uv_stride * ((h + 1) / 2);
@ -169,11 +174,11 @@ VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
return VP8_STATUS_OK;
}
VP8StatusCode WebPAllocateDecBuffer(int w, int h,
VP8StatusCode WebPAllocateDecBuffer(int width, int height,
const WebPDecoderOptions* const options,
WebPDecBuffer* const out) {
WebPDecBuffer* const buffer) {
VP8StatusCode status;
if (out == NULL || w <= 0 || h <= 0) {
if (buffer == NULL || width <= 0 || height <= 0) {
return VP8_STATUS_INVALID_PARAM;
}
if (options != NULL) { // First, apply options if there is any.
@ -182,33 +187,39 @@ VP8StatusCode WebPAllocateDecBuffer(int w, int h,
const int ch = options->crop_height;
const int x = options->crop_left & ~1;
const int y = options->crop_top & ~1;
if (x < 0 || y < 0 || cw <= 0 || ch <= 0 || x + cw > w || y + ch > h) {
if (x < 0 || y < 0 || cw <= 0 || ch <= 0 ||
x + cw > width || y + ch > height) {
return VP8_STATUS_INVALID_PARAM; // out of frame boundary.
}
w = cw;
h = ch;
width = cw;
height = ch;
}
if (options->use_scaling) {
#if !defined(WEBP_REDUCE_SIZE)
int scaled_width = options->scaled_width;
int scaled_height = options->scaled_height;
if (!WebPRescalerGetScaledDimensions(
w, h, &scaled_width, &scaled_height)) {
width, height, &scaled_width, &scaled_height)) {
return VP8_STATUS_INVALID_PARAM;
}
w = scaled_width;
h = scaled_height;
width = scaled_width;
height = scaled_height;
#else
return VP8_STATUS_INVALID_PARAM; // rescaling not supported
#endif
}
}
out->width = w;
out->height = h;
buffer->width = width;
buffer->height = height;
// Then, allocate buffer for real.
status = AllocateBuffer(out);
status = AllocateBuffer(buffer);
if (status != VP8_STATUS_OK) return status;
// Use the stride trick if vertical flip is needed.
if (options != NULL && options->flip) {
status = WebPFlipBuffer(out);
status = WebPFlipBuffer(buffer);
}
return status;
}

View file

@ -11,8 +11,8 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_DEC_COMMON_H_
#define WEBP_DEC_COMMON_H_
#ifndef WEBP_DEC_COMMON_DEC_H_
#define WEBP_DEC_COMMON_DEC_H_
// intra prediction modes
enum { B_DC_PRED = 0, // 4x4 modes
@ -51,4 +51,4 @@ enum { MB_FEATURE_TREE_PROBS = 3,
NUM_PROBAS = 11
};
#endif // WEBP_DEC_COMMON_H_
#endif // WEBP_DEC_COMMON_DEC_H_

View file

@ -12,13 +12,13 @@
// Author: Skal (pascal.massimino@gmail.com)
#include <stdlib.h>
#include "./vp8i_dec.h"
#include "../utils/utils.h"
#include "src/dec/vp8i_dec.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------
// Main reconstruction function.
static const int kScan[16] = {
static const uint16_t kScan[16] = {
0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
@ -320,7 +320,7 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
#define MIN_DITHER_AMP 4
#define DITHER_AMP_TAB_SIZE 12
static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
static const uint8_t kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
// roughly, it's dqm->uv_mat_[1]
8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
};
@ -728,7 +728,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
}
mem = (uint8_t*)dec->mem_;
dec->intra_t_ = (uint8_t*)mem;
dec->intra_t_ = mem;
mem += intra_pred_mode_size;
dec->yuv_t_ = (VP8TopSamples*)mem;
@ -750,7 +750,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
mem = (uint8_t*)WEBP_ALIGN(mem);
assert((yuv_size & WEBP_ALIGN_CST) == 0);
dec->yuv_b_ = (uint8_t*)mem;
dec->yuv_b_ = mem;
mem += yuv_size;
dec->mb_data_ = (VP8MBData*)mem;
@ -766,7 +766,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
const int extra_rows = kFilterExtraRows[dec->filter_type_];
const int extra_y = extra_rows * dec->cache_y_stride_;
const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
dec->cache_y_ = ((uint8_t*)mem) + extra_y;
dec->cache_y_ = mem + extra_y;
dec->cache_u_ = dec->cache_y_
+ 16 * num_caches * dec->cache_y_stride_ + extra_uv;
dec->cache_v_ = dec->cache_u_
@ -776,7 +776,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
mem += cache_size;
// alpha plane
dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
dec->alpha_plane_ = alpha_size ? mem : NULL;
mem += alpha_size;
assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);

View file

@ -15,10 +15,10 @@
#include <string.h>
#include <stdlib.h>
#include "./alphai_dec.h"
#include "./webpi_dec.h"
#include "./vp8i_dec.h"
#include "../utils/utils.h"
#include "src/dec/alphai_dec.h"
#include "src/dec/webpi_dec.h"
#include "src/dec/vp8i_dec.h"
#include "src/utils/utils.h"
// In append mode, buffer allocations increase as multiples of this value.
// Needs to be a power of 2.
@ -673,12 +673,12 @@ void WebPIDelete(WebPIDecoder* idec) {
//------------------------------------------------------------------------------
// Wrapper toward WebPINewDecoder
WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE csp, uint8_t* output_buffer,
size_t output_buffer_size, int output_stride) {
const int is_external_memory = (output_buffer != NULL) ? 1 : 0;
WebPIDecoder* idec;
if (mode >= MODE_YUV) return NULL;
if (csp >= MODE_YUV) return NULL;
if (is_external_memory == 0) { // Overwrite parameters to sane values.
output_buffer_size = 0;
output_stride = 0;
@ -689,7 +689,7 @@ WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
}
idec = WebPINewDecoder(NULL);
if (idec == NULL) return NULL;
idec->output_.colorspace = mode;
idec->output_.colorspace = csp;
idec->output_.is_external_memory = is_external_memory;
idec->output_.u.RGBA.rgba = output_buffer;
idec->output_.u.RGBA.stride = output_stride;

View file

@ -13,11 +13,11 @@
#include <assert.h>
#include <stdlib.h>
#include "../dec/vp8i_dec.h"
#include "./webpi_dec.h"
#include "../dsp/dsp.h"
#include "../dsp/yuv.h"
#include "../utils/utils.h"
#include "src/dec/vp8i_dec.h"
#include "src/dec/webpi_dec.h"
#include "src/dsp/dsp.h"
#include "src/dsp/yuv.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------
// Main YUV<->RGB conversion functions
@ -212,7 +212,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
int num_rows;
const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
uint8_t* alpha_dst = base_rgba;
#else
uint8_t* alpha_dst = base_rgba + 1;
@ -241,6 +241,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
//------------------------------------------------------------------------------
// YUV rescaling (no final RGB conversion needed)
#if !defined(WEBP_REDUCE_SIZE)
static int Rescale(const uint8_t* src, int src_stride,
int new_lines, WebPRescaler* const wrk) {
int num_lines_out = 0;
@ -431,7 +432,7 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
int max_lines_out) {
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
uint8_t* alpha_dst = base_rgba;
#else
uint8_t* alpha_dst = base_rgba + 1;
@ -541,6 +542,8 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
return 1;
}
#endif // WEBP_REDUCE_SIZE
//------------------------------------------------------------------------------
// Default custom functions
@ -561,10 +564,14 @@ static int CustomSetup(VP8Io* io) {
WebPInitUpsamplers();
}
if (io->use_scaling) {
#if !defined(WEBP_REDUCE_SIZE)
const int ok = is_rgb ? InitRGBRescaler(io, p) : InitYUVRescaler(io, p);
if (!ok) {
return 0; // memory error
}
#else
return 0; // rescaling support not compiled
#endif
} else {
if (is_rgb) {
WebPInitSamplers();
@ -598,9 +605,6 @@ static int CustomSetup(VP8Io* io) {
}
}
if (is_rgb) {
VP8YUVInit();
}
return 1;
}

View file

@ -11,7 +11,7 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./vp8i_dec.h"
#include "src/dec/vp8i_dec.h"
static WEBP_INLINE int clip(int v, int M) {
return v < 0 ? 0 : v > M ? M : v;

View file

@ -11,15 +11,19 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./vp8i_dec.h"
#include "../utils/bit_reader_inl_utils.h"
#include "src/dec/vp8i_dec.h"
#include "src/utils/bit_reader_inl_utils.h"
#if !defined(USE_GENERIC_TREE)
#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__)
// using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
#define USE_GENERIC_TREE
#define USE_GENERIC_TREE 1 // ALTERNATE_CODE
#else
#define USE_GENERIC_TREE 0
#endif
#endif // USE_GENERIC_TREE
#ifdef USE_GENERIC_TREE
#if (USE_GENERIC_TREE == 1)
static const int8_t kYModesIntra4[18] = {
-B_DC_PRED, 1,
-B_TM_PRED, 2,
@ -317,7 +321,7 @@ static void ParseIntraMode(VP8BitReader* const br,
int x;
for (x = 0; x < 4; ++x) {
const uint8_t* const prob = kBModesProba[top[x]][ymode];
#ifdef USE_GENERIC_TREE
#if (USE_GENERIC_TREE == 1)
// Generic tree-parsing
int i = kYModesIntra4[VP8GetBit(br, prob[0])];
while (i > 0) {
@ -335,7 +339,7 @@ static void ParseIntraMode(VP8BitReader* const br,
(!VP8GetBit(br, prob[6]) ? B_LD_PRED :
(!VP8GetBit(br, prob[7]) ? B_VL_PRED :
(!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
#endif // USE_GENERIC_TREE
#endif // USE_GENERIC_TREE
top[x] = ymode;
}
memcpy(modes, top, 4 * sizeof(*top));
@ -498,7 +502,7 @@ static const uint8_t
// Paragraph 9.9
static const int kBands[16 + 1] = {
static const uint8_t kBands[16 + 1] = {
0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
0 // extra entry as sentinel
};

View file

@ -13,12 +13,12 @@
#include <stdlib.h>
#include "./alphai_dec.h"
#include "./vp8i_dec.h"
#include "./vp8li_dec.h"
#include "./webpi_dec.h"
#include "../utils/bit_reader_inl_utils.h"
#include "../utils/utils.h"
#include "src/dec/alphai_dec.h"
#include "src/dec/vp8i_dec.h"
#include "src/dec/vp8li_dec.h"
#include "src/dec/webpi_dec.h"
#include "src/utils/bit_reader_inl_utils.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------

View file

@ -11,10 +11,10 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_WEBP_DECODE_VP8_H_
#define WEBP_WEBP_DECODE_VP8_H_
#ifndef WEBP_DEC_VP8_DEC_H_
#define WEBP_DEC_VP8_DEC_H_
#include "../webp/decode.h"
#include "src/webp/decode.h"
#ifdef __cplusplus
extern "C" {
@ -33,7 +33,7 @@ extern "C" {
// /* customize io's functions (setup()/put()/teardown()) if needed. */
//
// VP8Decoder* dec = VP8New();
// bool ok = VP8Decode(dec);
// int ok = VP8Decode(dec, &io);
// if (!ok) printf("Error: %s\n", VP8StatusMessage(dec));
// VP8Delete(dec);
// return ok;
@ -157,24 +157,24 @@ void VP8Delete(VP8Decoder* const dec);
// Miscellaneous VP8/VP8L bitstream probing functions.
// Returns true if the next 3 bytes in data contain the VP8 signature.
WEBP_EXTERN(int) VP8CheckSignature(const uint8_t* const data, size_t data_size);
WEBP_EXTERN int VP8CheckSignature(const uint8_t* const data, size_t data_size);
// Validates the VP8 data-header and retrieves basic header information viz
// width and height. Returns 0 in case of formatting error. *width/*height
// can be passed NULL.
WEBP_EXTERN(int) VP8GetInfo(
WEBP_EXTERN int VP8GetInfo(
const uint8_t* data,
size_t data_size, // data available so far
size_t chunk_size, // total data size expected in the chunk
int* const width, int* const height);
// Returns true if the next byte(s) in data is a VP8L signature.
WEBP_EXTERN(int) VP8LCheckSignature(const uint8_t* const data, size_t size);
WEBP_EXTERN int VP8LCheckSignature(const uint8_t* const data, size_t size);
// Validates the VP8L data-header and retrieves basic header information viz
// width, height and alpha. Returns 0 in case of formatting error.
// width/height/has_alpha can be passed NULL.
WEBP_EXTERN(int) VP8LGetInfo(
WEBP_EXTERN int VP8LGetInfo(
const uint8_t* data, size_t data_size, // data available so far
int* const width, int* const height, int* const has_alpha);
@ -182,4 +182,4 @@ WEBP_EXTERN(int) VP8LGetInfo(
} // extern "C"
#endif
#endif /* WEBP_WEBP_DECODE_VP8_H_ */
#endif /* WEBP_DEC_VP8_DEC_H_ */

View file

@ -11,16 +11,16 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_DEC_VP8I_H_
#define WEBP_DEC_VP8I_H_
#ifndef WEBP_DEC_VP8I_DEC_H_
#define WEBP_DEC_VP8I_DEC_H_
#include <string.h> // for memcpy()
#include "./common_dec.h"
#include "./vp8li_dec.h"
#include "../utils/bit_reader_utils.h"
#include "../utils/random_utils.h"
#include "../utils/thread_utils.h"
#include "../dsp/dsp.h"
#include "src/dec/common_dec.h"
#include "src/dec/vp8li_dec.h"
#include "src/utils/bit_reader_utils.h"
#include "src/utils/random_utils.h"
#include "src/utils/thread_utils.h"
#include "src/dsp/dsp.h"
#ifdef __cplusplus
extern "C" {
@ -32,7 +32,7 @@ extern "C" {
// version numbers
#define DEC_MAJ_VERSION 0
#define DEC_MIN_VERSION 6
#define DEC_REV_VERSION 0
#define DEC_REV_VERSION 1
// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
// Constraints are: We need to store one 16x16 block of luma samples (y),
@ -57,7 +57,6 @@ extern "C" {
// '|' = left sample, '-' = top sample, '+' = top-left sample
// 't' = extra top-right sample for 4x4 modes
#define YUV_SIZE (BPS * 17 + BPS * 9)
#define Y_SIZE (BPS * 17)
#define Y_OFF (BPS * 1 + 8)
#define U_OFF (Y_OFF + BPS * 16 + BPS)
#define V_OFF (U_OFF + 16)
@ -317,4 +316,4 @@ const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
} // extern "C"
#endif
#endif /* WEBP_DEC_VP8I_H_ */
#endif /* WEBP_DEC_VP8I_DEC_H_ */

View file

@ -14,22 +14,22 @@
#include <stdlib.h>
#include "./alphai_dec.h"
#include "./vp8li_dec.h"
#include "../dsp/dsp.h"
#include "../dsp/lossless.h"
#include "../dsp/lossless_common.h"
#include "../dsp/yuv.h"
#include "../utils/endian_inl_utils.h"
#include "../utils/huffman_utils.h"
#include "../utils/utils.h"
#include "src/dec/alphai_dec.h"
#include "src/dec/vp8li_dec.h"
#include "src/dsp/dsp.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#include "src/dsp/yuv.h"
#include "src/utils/endian_inl_utils.h"
#include "src/utils/huffman_utils.h"
#include "src/utils/utils.h"
#define NUM_ARGB_CACHE_ROWS 16
static const int kCodeLengthLiterals = 16;
static const int kCodeLengthRepeatCode = 16;
static const int kCodeLengthExtraBits[3] = { 2, 3, 7 };
static const int kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };
static const uint8_t kCodeLengthExtraBits[3] = { 2, 3, 7 };
static const uint8_t kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };
// -----------------------------------------------------------------------------
// Five Huffman codes are used at each meta code:
@ -86,7 +86,7 @@ static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
// All values computed for 8-bit first level lookup with Mark Adler's tool:
// http://www.hdfgroup.org/ftp/lib-external/zlib/zlib-1.2.5/examples/enough.c
#define FIXED_TABLE_SIZE (630 * 3 + 410)
static const int kTableSize[12] = {
static const uint16_t kTableSize[12] = {
FIXED_TABLE_SIZE + 654,
FIXED_TABLE_SIZE + 656,
FIXED_TABLE_SIZE + 658,
@ -485,6 +485,7 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
//------------------------------------------------------------------------------
// Scaling.
#if !defined(WEBP_REDUCE_SIZE)
static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
const int num_channels = 4;
const int in_width = io->mb_w;
@ -516,10 +517,13 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
out_width, out_height, 0, num_channels, work);
return 1;
}
#endif // WEBP_REDUCE_SIZE
//------------------------------------------------------------------------------
// Export to ARGB
#if !defined(WEBP_REDUCE_SIZE)
// We have special "export" function since we need to convert from BGRA
static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
int rgba_stride, uint8_t* const rgba) {
@ -561,6 +565,8 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
return num_lines_out;
}
#endif // WEBP_REDUCE_SIZE
// Emit rows without any scaling.
static int EmitRows(WEBP_CSP_MODE colorspace,
const uint8_t* row_in, int in_stride,
@ -746,9 +752,12 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
if (WebPIsRGBMode(output->colorspace)) { // convert to RGBA
const WebPRGBABuffer* const buf = &output->u.RGBA;
uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
const int num_rows_out = io->use_scaling ?
const int num_rows_out =
#if !defined(WEBP_REDUCE_SIZE)
io->use_scaling ?
EmitRescaledRowsRGBA(dec, rows_data, in_stride, io->mb_h,
rgba, buf->stride) :
#endif // WEBP_REDUCE_SIZE
EmitRows(output->colorspace, rows_data, in_stride,
io->mb_w, io->mb_h, rgba, buf->stride);
// Update 'last_out_row_'.
@ -1012,12 +1021,13 @@ static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data,
ok = 0;
goto End;
}
assert(br->eos_ == VP8LIsEndOfStream(br));
br->eos_ = VP8LIsEndOfStream(br);
}
// Process the remaining rows corresponding to last row-block.
ExtractPalettedAlphaRows(dec, row > last_row ? last_row : row);
End:
br->eos_ = VP8LIsEndOfStream(br);
if (!ok || (br->eos_ && pos < end)) {
ok = 0;
dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED
@ -1090,11 +1100,12 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
VP8LFillBitWindow(br);
if (htree_group->use_packed_table) {
code = ReadPackedSymbols(htree_group, br, src);
if (VP8LIsEndOfStream(br)) break;
if (code == PACKED_NON_LITERAL_CODE) goto AdvanceByOne;
} else {
code = ReadSymbol(htree_group->htrees[GREEN], br);
}
if (br->eos_) break; // early out
if (VP8LIsEndOfStream(br)) break;
if (code < NUM_LITERAL_CODES) { // Literal
if (htree_group->is_trivial_literal) {
*src = htree_group->literal_arb | (code << 8);
@ -1104,7 +1115,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
VP8LFillBitWindow(br);
blue = ReadSymbol(htree_group->htrees[BLUE], br);
alpha = ReadSymbol(htree_group->htrees[ALPHA], br);
if (br->eos_) break;
if (VP8LIsEndOfStream(br)) break;
*src = ((uint32_t)alpha << 24) | (red << 16) | (code << 8) | blue;
}
AdvanceByOne:
@ -1132,7 +1143,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
VP8LFillBitWindow(br);
dist_code = GetCopyDistance(dist_symbol, br);
dist = PlaneCodeToDistance(width, dist_code);
if (br->eos_) break;
if (VP8LIsEndOfStream(br)) break;
if (src - data < (ptrdiff_t)dist || src_end - src < (ptrdiff_t)length) {
goto Error;
} else {
@ -1169,9 +1180,9 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
} else { // Not reached
goto Error;
}
assert(br->eos_ == VP8LIsEndOfStream(br));
}
br->eos_ = VP8LIsEndOfStream(br);
if (dec->incremental_ && br->eos_ && src < src_end) {
RestoreState(dec);
} else if (!br->eos_) {
@ -1630,12 +1641,19 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
if (!AllocateInternalBuffers32b(dec, io->width)) goto Err;
#if !defined(WEBP_REDUCE_SIZE)
if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;
if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) {
// need the alpha-multiply functions for premultiplied output or rescaling
WebPInitAlphaProcessing();
}
#else
if (io->use_scaling) {
dec->status_ = VP8_STATUS_INVALID_PARAM;
goto Err;
}
#endif
if (!WebPIsRGBMode(dec->output_->colorspace)) {
WebPInitConvertARGBToYUV();
if (dec->output_->u.YUVA.a != NULL) WebPInitAlphaProcessing();

View file

@ -12,14 +12,14 @@
// Author: Skal (pascal.massimino@gmail.com)
// Vikas Arora(vikaas.arora@gmail.com)
#ifndef WEBP_DEC_VP8LI_H_
#define WEBP_DEC_VP8LI_H_
#ifndef WEBP_DEC_VP8LI_DEC_H_
#define WEBP_DEC_VP8LI_DEC_H_
#include <string.h> // for memcpy()
#include "./webpi_dec.h"
#include "../utils/bit_reader_utils.h"
#include "../utils/color_cache_utils.h"
#include "../utils/huffman_utils.h"
#include "src/dec/webpi_dec.h"
#include "src/utils/bit_reader_utils.h"
#include "src/utils/color_cache_utils.h"
#include "src/utils/huffman_utils.h"
#ifdef __cplusplus
extern "C" {
@ -132,4 +132,4 @@ void VP8LDelete(VP8LDecoder* const dec);
} // extern "C"
#endif
#endif /* WEBP_DEC_VP8LI_H_ */
#endif /* WEBP_DEC_VP8LI_DEC_H_ */

View file

@ -13,11 +13,11 @@
#include <stdlib.h>
#include "./vp8i_dec.h"
#include "./vp8li_dec.h"
#include "./webpi_dec.h"
#include "../utils/utils.h"
#include "../webp/mux_types.h" // ALPHA_FLAG
#include "src/dec/vp8i_dec.h"
#include "src/dec/vp8li_dec.h"
#include "src/dec/webpi_dec.h"
#include "src/utils/utils.h"
#include "src/webp/mux_types.h" // ALPHA_FLAG
//------------------------------------------------------------------------------
// RIFF layout is:
@ -421,7 +421,9 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
NULL, NULL, NULL, &has_animation,
NULL, headers);
if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) {
// TODO(jzern): full support of animation frames will require API additions.
// The WebPDemux API + libwebp can be used to decode individual
// uncomposited frames or the WebPAnimDecoder can be used to fully
// reconstruct them (see webp/demux.h).
if (has_animation) {
status = VP8_STATUS_UNSUPPORTED_FEATURE;
}

View file

@ -11,15 +11,15 @@
//
// Author: somnath@google.com (Somnath Banerjee)
#ifndef WEBP_DEC_WEBPI_H_
#define WEBP_DEC_WEBPI_H_
#ifndef WEBP_DEC_WEBPI_DEC_H_
#define WEBP_DEC_WEBPI_DEC_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "../utils/rescaler_utils.h"
#include "./vp8_dec.h"
#include "src/utils/rescaler_utils.h"
#include "src/dec/vp8_dec.h"
//------------------------------------------------------------------------------
// WebPDecParams: Decoding output parameters. Transient internal object.
@ -130,4 +130,4 @@ int WebPAvoidSlowMemory(const WebPDecBuffer* const output,
} // extern "C"
#endif
#endif /* WEBP_DEC_WEBPI_H_ */
#endif /* WEBP_DEC_WEBPI_DEC_H_ */

View file

@ -11,15 +11,15 @@
//
#ifdef HAVE_CONFIG_H
#include "../webp/config.h"
#include "src/webp/config.h"
#endif
#include <assert.h>
#include <string.h>
#include "../utils/utils.h"
#include "../webp/decode.h"
#include "../webp/demux.h"
#include "src/utils/utils.h"
#include "src/webp/decode.h"
#include "src/webp/demux.h"
#define NUM_CHANNELS 4

View file

@ -11,21 +11,21 @@
//
#ifdef HAVE_CONFIG_H
#include "../webp/config.h"
#include "src/webp/config.h"
#endif
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "../utils/utils.h"
#include "../webp/decode.h" // WebPGetFeatures
#include "../webp/demux.h"
#include "../webp/format_constants.h"
#include "src/utils/utils.h"
#include "src/webp/decode.h" // WebPGetFeatures
#include "src/webp/demux.h"
#include "src/webp/format_constants.h"
#define DMUX_MAJ_VERSION 0
#define DMUX_MIN_VERSION 3
#define DMUX_REV_VERSION 2
#define DMUX_REV_VERSION 3
typedef struct {
size_t start_; // start location of the data
@ -205,12 +205,14 @@ static void SetFrameInfo(size_t start_offset, size_t size,
frame->complete_ = complete;
}
// Store image bearing chunks to 'frame'.
// Store image bearing chunks to 'frame'. 'min_size' is an optional size
// requirement, it may be zero.
static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
MemBuffer* const mem, Frame* const frame) {
int alpha_chunks = 0;
int image_chunks = 0;
int done = (MemDataSize(mem) < min_size);
int done = (MemDataSize(mem) < CHUNK_HEADER_SIZE ||
MemDataSize(mem) < min_size);
ParseStatus status = PARSE_OK;
if (done) return PARSE_NEED_MORE_DATA;
@ -401,9 +403,9 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(*frame));
if (frame == NULL) return PARSE_ERROR;
// For the single image case we allow parsing of a partial frame, but we need
// at least CHUNK_HEADER_SIZE for parsing.
status = StoreFrame(1, CHUNK_HEADER_SIZE, &dmux->mem_, frame);
// For the single image case we allow parsing of a partial frame, so no
// minimum size is imposed here.
status = StoreFrame(1, 0, &dmux->mem_, frame);
if (status != PARSE_ERROR) {
const int has_alpha = !!(dmux->feature_flags_ & ALPHA_FLAG);
// Clear any alpha when the alpha flag is missing.

View file

@ -12,10 +12,13 @@
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include "./dsp.h"
#include "src/dsp/dsp.h"
// Tables can be faster on some platform but incur some extra binary size (~2k).
// #define USE_TABLES_FOR_ALPHA_MULT
#if !defined(USE_TABLES_FOR_ALPHA_MULT)
#define USE_TABLES_FOR_ALPHA_MULT 0 // ALTERNATE_CODE
#endif
// -----------------------------------------------------------------------------
@ -29,7 +32,7 @@ static uint32_t Mult(uint8_t x, uint32_t mult) {
return v;
}
#ifdef USE_TABLES_FOR_ALPHA_MULT
#if (USE_TABLES_FOR_ALPHA_MULT == 1)
static const uint32_t kMultTables[2][256] = {
{ // (255u << MFIX) / alpha
@ -132,9 +135,9 @@ static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
return inverse ? (255u << MFIX) / a : a * KINV_255;
}
#endif // USE_TABLES_FOR_ALPHA_MULT
#endif // USE_TABLES_FOR_ALPHA_MULT
void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
const uint32_t argb = ptr[x];
@ -154,8 +157,8 @@ void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
}
}
void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse) {
void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
const uint32_t a = alpha[x];
@ -217,8 +220,9 @@ void WebPMultRows(uint8_t* ptr, int stride,
#define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24)
#endif
static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
int w, int h, int stride) {
#if !WEBP_NEON_OMIT_C_CODE
static void ApplyAlphaMultiply_C(uint8_t* rgba, int alpha_first,
int w, int h, int stride) {
while (h-- > 0) {
uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
@ -235,6 +239,7 @@ static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
rgba += stride;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
#undef MULTIPLIER
#undef PREMULTIPLY
@ -254,9 +259,9 @@ static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
return (x * m) >> 16;
}
static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
int w, int h, int stride,
int rg_byte_pos /* 0 or 1 */) {
static WEBP_INLINE void ApplyAlphaMultiply4444_C(uint8_t* rgba4444,
int w, int h, int stride,
int rg_byte_pos /* 0 or 1 */) {
while (h-- > 0) {
int i;
for (i = 0; i < w; ++i) {
@ -275,15 +280,16 @@ static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
}
#undef MULTIPLIER
static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
int w, int h, int stride) {
#ifdef WEBP_SWAP_16BIT_CSP
ApplyAlphaMultiply4444(rgba4444, w, h, stride, 1);
static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
int w, int h, int stride) {
#if (WEBP_SWAP_16BIT_CSP == 1)
ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 1);
#else
ApplyAlphaMultiply4444(rgba4444, w, h, stride, 0);
ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 0);
#endif
}
#if !WEBP_NEON_OMIT_C_CODE
static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
@ -338,6 +344,36 @@ static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
int i;
for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
static int HasAlpha8b_C(const uint8_t* src, int length) {
while (length-- > 0) if (*src++ != 0xff) return 1;
return 0;
}
static int HasAlpha32b_C(const uint8_t* src, int length) {
int x;
for (x = 0; length-- > 0; x += 4) if (src[x] != 0xff) return 1;
return 0;
}
//------------------------------------------------------------------------------
// Simple channel manipulations.
static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
}
static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out) {
int i, offset = 0;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
offset += step;
}
}
void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
@ -345,6 +381,11 @@ int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out);
int (*WebPHasAlpha8b)(const uint8_t* src, int length);
int (*WebPHasAlpha32b)(const uint8_t* src, int length);
//------------------------------------------------------------------------------
// Init function
@ -360,15 +401,21 @@ static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
WebPMultARGBRow = WebPMultARGBRowC;
WebPMultRow = WebPMultRowC;
WebPApplyAlphaMultiply = ApplyAlphaMultiply;
WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
WebPMultARGBRow = WebPMultARGBRow_C;
WebPMultRow = WebPMultRow_C;
WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b_C;
WebPPackRGB = PackRGB_C;
#if !WEBP_NEON_OMIT_C_CODE
WebPApplyAlphaMultiply = ApplyAlphaMultiply_C;
WebPDispatchAlpha = DispatchAlpha_C;
WebPDispatchAlphaToGreen = DispatchAlphaToGreen_C;
WebPExtractAlpha = ExtractAlpha_C;
WebPExtractGreen = ExtractGreen_C;
#endif
WebPHasAlpha8b = HasAlpha8b_C;
WebPHasAlpha32b = HasAlpha32b_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@ -382,16 +429,31 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
#endif
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
WebPInitAlphaProcessingNEON();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPInitAlphaProcessingMIPSdspR2();
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitAlphaProcessingNEON();
}
#endif
assert(WebPMultARGBRow != NULL);
assert(WebPMultRow != NULL);
assert(WebPApplyAlphaMultiply != NULL);
assert(WebPApplyAlphaMultiply4444 != NULL);
assert(WebPDispatchAlpha != NULL);
assert(WebPDispatchAlphaToGreen != NULL);
assert(WebPExtractAlpha != NULL);
assert(WebPExtractGreen != NULL);
assert(WebPPackRGB != NULL);
assert(WebPHasAlpha8b != NULL);
assert(WebPHasAlpha32b != NULL);
alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
}

View file

@ -12,13 +12,13 @@
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
static int DispatchAlpha_MIPSdspR2(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
uint32_t alpha_mask = 0xffffffff;
int i, j, temp0;
@ -79,7 +79,8 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
return (alpha_mask != 0xff);
}
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
static void MultARGBRow_MIPSdspR2(uint32_t* const ptr, int width,
int inverse) {
int x;
const uint32_t c_00ffffff = 0x00ffffffu;
const uint32_t c_ff000000 = 0xff000000u;
@ -124,14 +125,54 @@ static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
}
}
static void PackRGB_MIPSdspR2(const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, int step,
uint32_t* out) {
int temp0, temp1, temp2, offset;
const int rest = len & 1;
const int a = 0xff;
const uint32_t* const loop_end = out + len - rest;
__asm__ volatile (
"xor %[offset], %[offset], %[offset] \n\t"
"beq %[loop_end], %[out], 0f \n\t"
"2: \n\t"
"lbux %[temp0], %[offset](%[r]) \n\t"
"lbux %[temp1], %[offset](%[g]) \n\t"
"lbux %[temp2], %[offset](%[b]) \n\t"
"ins %[temp0], %[a], 16, 16 \n\t"
"ins %[temp2], %[temp1], 16, 16 \n\t"
"addiu %[out], %[out], 4 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
"sw %[temp0], -4(%[out]) \n\t"
"addu %[offset], %[offset], %[step] \n\t"
"bne %[loop_end], %[out], 2b \n\t"
"0: \n\t"
"beq %[rest], $zero, 1f \n\t"
"lbux %[temp0], %[offset](%[r]) \n\t"
"lbux %[temp1], %[offset](%[g]) \n\t"
"lbux %[temp2], %[offset](%[b]) \n\t"
"ins %[temp0], %[a], 16, 16 \n\t"
"ins %[temp2], %[temp1], 16, 16 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
"sw %[temp0], 0(%[out]) \n\t"
"1: \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[offset]"=&r"(offset), [out]"+&r"(out)
: [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
[loop_end]"r"(loop_end), [rest]"r"(rest)
: "memory"
);
}
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitAlphaProcessingMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
WebPDispatchAlpha = DispatchAlpha;
WebPMultARGBRow = MultARGBRow;
WebPDispatchAlpha = DispatchAlpha_MIPSdspR2;
WebPMultARGBRow = MultARGBRow_MIPSdspR2;
WebPPackRGB = PackRGB_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2

View file

@ -11,11 +11,11 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include "./neon.h"
#include "src/dsp/neon.h"
//------------------------------------------------------------------------------

View file

@ -11,16 +11,16 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
//------------------------------------------------------------------------------
static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
@ -72,9 +72,9 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
return (alpha_and != 0xff);
}
static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint32_t* dst, int dst_stride) {
static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint32_t* dst, int dst_stride) {
int i, j;
const __m128i zero = _mm_setzero_si128();
const int limit = width & ~15;
@ -98,9 +98,9 @@ static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
}
}
static int ExtractAlpha(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
@ -210,6 +210,61 @@ static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
#undef MULTIPLIER
#undef PREMULTIPLY
//------------------------------------------------------------------------------
// Alpha detection
static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
const __m128i all_0xff = _mm_set1_epi8(0xff);
int i = 0;
for (; i + 16 <= length; i += 16) {
const __m128i v = _mm_loadu_si128((const __m128i*)(src + i));
const __m128i bits = _mm_cmpeq_epi8(v, all_0xff);
const int mask = _mm_movemask_epi8(bits);
if (mask != 0xffff) return 1;
}
for (; i < length; ++i) if (src[i] != 0xff) return 1;
return 0;
}
static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
const __m128i alpha_mask = _mm_set1_epi32(0xff);
const __m128i all_0xff = _mm_set1_epi8(0xff);
int i = 0;
// We don't know if we can access the last 3 bytes after the last alpha
// value 'src[4 * length - 4]' (because we don't know if alpha is the first
// or the last byte of the quadruplet). Hence the '-3' protection below.
length = length * 4 - 3; // size in bytes
for (; i + 64 <= length; i += 64) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
const __m128i a2 = _mm_loadu_si128((const __m128i*)(src + i + 32));
const __m128i a3 = _mm_loadu_si128((const __m128i*)(src + i + 48));
const __m128i b0 = _mm_and_si128(a0, alpha_mask);
const __m128i b1 = _mm_and_si128(a1, alpha_mask);
const __m128i b2 = _mm_and_si128(a2, alpha_mask);
const __m128i b3 = _mm_and_si128(a3, alpha_mask);
const __m128i c0 = _mm_packs_epi32(b0, b1);
const __m128i c1 = _mm_packs_epi32(b2, b3);
const __m128i d = _mm_packus_epi16(c0, c1);
const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
const int mask = _mm_movemask_epi8(bits);
if (mask != 0xffff) return 1;
}
for (; i + 32 <= length; i += 32) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
const __m128i b0 = _mm_and_si128(a0, alpha_mask);
const __m128i b1 = _mm_and_si128(a1, alpha_mask);
const __m128i c = _mm_packs_epi32(b0, b1);
const __m128i d = _mm_packus_epi16(c, c);
const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
const int mask = _mm_movemask_epi8(bits);
if (mask != 0xffff) return 1;
}
for (; i <= length; i += 4) if (src[i] != 0xff) return 1;
return 0;
}
// -----------------------------------------------------------------------------
// Apply alpha value to rows
@ -238,7 +293,7 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
}
}
width -= x;
if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
}
static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
@ -261,7 +316,7 @@ static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
}
}
width -= x;
if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
if (width > 0) WebPMultRow_C(ptr + x, alpha + x, width, inverse);
}
//------------------------------------------------------------------------------
@ -273,9 +328,12 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
WebPMultARGBRow = MultARGBRow_SSE2;
WebPMultRow = MultRow_SSE2;
WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;
WebPDispatchAlpha = DispatchAlpha;
WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
WebPExtractAlpha = ExtractAlpha;
WebPDispatchAlpha = DispatchAlpha_SSE2;
WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2;
WebPExtractAlpha = ExtractAlpha_SSE2;
WebPHasAlpha8b = HasAlpha8b_SSE2;
WebPHasAlpha32b = HasAlpha32b_SSE2;
}
#else // !WEBP_USE_SSE2

View file

@ -11,7 +11,7 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41)
@ -19,9 +19,9 @@
//------------------------------------------------------------------------------
static int ExtractAlpha(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
@ -82,7 +82,7 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
extern void WebPInitAlphaProcessingSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE41(void) {
WebPExtractAlpha = ExtractAlpha;
WebPExtractAlpha = ExtractAlpha_SSE41;
}
#else // !WEBP_USE_SSE41

View file

@ -9,8 +9,8 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "../enc/cost_enc.h"
#include "src/dsp/dsp.h"
#include "src/enc/cost_enc.h"
//------------------------------------------------------------------------------
// Boolean-cost cost table
@ -319,7 +319,7 @@ const uint8_t VP8EncBands[16 + 1] = {
//------------------------------------------------------------------------------
// Mode costs
static int GetResidualCost(int ctx0, const VP8Residual* const res) {
static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
const int p0 = res->prob[n][ctx0][0];
@ -354,8 +354,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
return cost;
}
static void SetResidualCoeffs(const int16_t* const coeffs,
VP8Residual* const res) {
static void SetResidualCoeffs_C(const int16_t* const coeffs,
VP8Residual* const res) {
int n;
res->last = -1;
assert(res->first == 0 || coeffs[0] == 0);
@ -384,8 +384,8 @@ static volatile VP8CPUInfo cost_last_cpuinfo_used =
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
if (cost_last_cpuinfo_used == VP8GetCPUInfo) return;
VP8GetResidualCost = GetResidualCost;
VP8SetResidualCoeffs = SetResidualCoeffs;
VP8GetResidualCost = GetResidualCost_C;
VP8SetResidualCoeffs = SetResidualCoeffs_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {

View file

@ -9,13 +9,13 @@
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "../enc/cost_enc.h"
#include "src/enc/cost_enc.h"
static int GetResidualCost(int ctx0, const VP8Residual* const res) {
static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
int temp0, temp1;
int v_reg, ctx_reg;
int n = res->first;
@ -96,8 +96,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
return cost;
}
static void SetResidualCoeffs(const int16_t* const coeffs,
VP8Residual* const res) {
static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
VP8Residual* const res) {
const int16_t* p_coeffs = (int16_t*)coeffs;
int temp0, temp1, temp2, n, n1;
assert(res->first == 0 || coeffs[0] == 0);
@ -143,8 +143,8 @@ static void SetResidualCoeffs(const int16_t* const coeffs,
extern void VP8EncDspCostInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) {
VP8GetResidualCost = GetResidualCost;
VP8SetResidualCoeffs = SetResidualCoeffs;
VP8GetResidualCost = GetResidualCost_MIPS32;
VP8SetResidualCoeffs = SetResidualCoeffs_MIPS32;
}
#else // !WEBP_USE_MIPS32

View file

@ -9,13 +9,13 @@
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "../enc/cost_enc.h"
#include "src/enc/cost_enc.h"
static int GetResidualCost(int ctx0, const VP8Residual* const res) {
static int GetResidualCost_MIPSdspR2(int ctx0, const VP8Residual* const res) {
int temp0, temp1;
int v_reg, ctx_reg;
int n = res->first;
@ -97,7 +97,7 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
extern void VP8EncDspCostInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) {
VP8GetResidualCost = GetResidualCost;
VP8GetResidualCost = GetResidualCost_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2

View file

@ -11,19 +11,19 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
#include "../enc/cost_enc.h"
#include "../enc/vp8i_enc.h"
#include "../utils/utils.h"
#include "src/enc/cost_enc.h"
#include "src/enc/vp8i_enc.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------
static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
VP8Residual* const res) {
static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
VP8Residual* const res) {
const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
// Use SSE2 to compare 16 values with a single instruction.
@ -42,7 +42,7 @@ static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
res->coeffs = coeffs;
}
static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
static int GetResidualCost_SSE2(int ctx0, const VP8Residual* const res) {
uint8_t levels[16], ctxs[16];
uint16_t abs_levels[16];
int n = res->first;
@ -108,8 +108,8 @@ static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
extern void VP8EncDspCostInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) {
VP8SetResidualCoeffs = SetResidualCoeffsSSE2;
VP8GetResidualCost = GetResidualCostSSE2;
VP8SetResidualCoeffs = SetResidualCoeffs_SSE2;
VP8GetResidualCost = GetResidualCost_SSE2;
}
#else // !WEBP_USE_SSE2

View file

@ -11,7 +11,7 @@
//
// Author: Christian Duvivier (cduvivier@google.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_HAVE_NEON_RTCD)
#include <stdio.h>
@ -143,7 +143,7 @@ static int x86CPUInfo(CPUFeature feature) {
return !!(cpu_info[2] & (1 << 0));
}
if (feature == kSlowSSSE3) {
if (is_intel && (cpu_info[2] & (1 << 0))) { // SSSE3?
if (is_intel && (cpu_info[2] & (1 << 9))) { // SSSE3?
return CheckSlowModel(cpu_info[0]);
}
return 0;

View file

@ -11,9 +11,11 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "../dec/vp8i_dec.h"
#include "../utils/utils.h"
#include <assert.h>
#include "src/dsp/dsp.h"
#include "src/dec/vp8i_dec.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------
@ -25,7 +27,7 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
// Transforms (Paragraph 14.4)
#define STORE(x, y, v) \
dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
dst[(x) + (y) * BPS] = clip_8b(dst[(x) + (y) * BPS] + ((v) >> 3))
#define STORE2(y, dc, d, c) do { \
const int DC = (dc); \
@ -38,7 +40,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
#define MUL1(a) ((((a) * 20091) >> 16) + (a))
#define MUL2(a) (((a) * 35468) >> 16)
static void TransformOne(const int16_t* in, uint8_t* dst) {
#if !WEBP_NEON_OMIT_C_CODE
static void TransformOne_C(const int16_t* in, uint8_t* dst) {
int C[4 * 4], *tmp;
int i;
tmp = C;
@ -78,7 +81,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
}
// Simplified transform when only in[0], in[1] and in[4] are non-zero
static void TransformAC3(const int16_t* in, uint8_t* dst) {
static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
const int a = in[0] + 4;
const int c4 = MUL2(in[4]);
const int d4 = MUL1(in[4]);
@ -93,19 +96,21 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
#undef MUL2
#undef STORE2
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
TransformOne(in, dst);
static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
TransformOne_C(in, dst);
if (do_two) {
TransformOne(in + 16, dst + 4);
TransformOne_C(in + 16, dst + 4);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void TransformUV(const int16_t* in, uint8_t* dst) {
static void TransformUV_C(const int16_t* in, uint8_t* dst) {
VP8Transform(in + 0 * 16, dst, 1);
VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
}
static void TransformDC(const int16_t* in, uint8_t* dst) {
#if !WEBP_NEON_OMIT_C_CODE
static void TransformDC_C(const int16_t* in, uint8_t* dst) {
const int DC = in[0] + 4;
int i, j;
for (j = 0; j < 4; ++j) {
@ -114,8 +119,9 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
}
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void TransformDCUV(const int16_t* in, uint8_t* dst) {
static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
@ -127,7 +133,8 @@ static void TransformDCUV(const int16_t* in, uint8_t* dst) {
//------------------------------------------------------------------------------
// Paragraph 14.3
static void TransformWHT(const int16_t* in, int16_t* out) {
#if !WEBP_NEON_OMIT_C_CODE
static void TransformWHT_C(const int16_t* in, int16_t* out) {
int tmp[16];
int i;
for (i = 0; i < 4; ++i) {
@ -153,6 +160,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
out += 64;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
@ -161,6 +169,7 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
#define DST(x, y) dst[(x) + (y) * BPS]
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
const uint8_t* top = dst - BPS;
const uint8_t* const clip0 = VP8kclip1 - top[-1];
@ -174,21 +183,21 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
dst += BPS;
}
}
static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
static void TM16(uint8_t* dst) { TrueMotion(dst, 16); }
static void TM4_C(uint8_t* dst) { TrueMotion(dst, 4); }
static void TM8uv_C(uint8_t* dst) { TrueMotion(dst, 8); }
static void TM16_C(uint8_t* dst) { TrueMotion(dst, 16); }
//------------------------------------------------------------------------------
// 16x16
static void VE16(uint8_t* dst) { // vertical
static void VE16_C(uint8_t* dst) { // vertical
int j;
for (j = 0; j < 16; ++j) {
memcpy(dst + j * BPS, dst - BPS, 16);
}
}
static void HE16(uint8_t* dst) { // horizontal
static void HE16_C(uint8_t* dst) { // horizontal
int j;
for (j = 16; j > 0; --j) {
memset(dst, dst[-1], 16);
@ -203,7 +212,7 @@ static WEBP_INLINE void Put16(int v, uint8_t* dst) {
}
}
static void DC16(uint8_t* dst) { // DC
static void DC16_C(uint8_t* dst) { // DC
int DC = 16;
int j;
for (j = 0; j < 16; ++j) {
@ -212,7 +221,7 @@ static void DC16(uint8_t* dst) { // DC
Put16(DC >> 5, dst);
}
static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
static void DC16NoTop_C(uint8_t* dst) { // DC with top samples not available
int DC = 8;
int j;
for (j = 0; j < 16; ++j) {
@ -221,7 +230,7 @@ static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
Put16(DC >> 4, dst);
}
static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
static void DC16NoLeft_C(uint8_t* dst) { // DC with left samples not available
int DC = 8;
int i;
for (i = 0; i < 16; ++i) {
@ -230,9 +239,10 @@ static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
Put16(DC >> 4, dst);
}
static void DC16NoTopLeft(uint8_t* dst) { // DC with no top and left samples
static void DC16NoTopLeft_C(uint8_t* dst) { // DC with no top and left samples
Put16(0x80, dst);
}
#endif // !WEBP_NEON_OMIT_C_CODE
VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
@ -242,7 +252,8 @@ VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
static void VE4(uint8_t* dst) { // vertical
#if !WEBP_NEON_OMIT_C_CODE
static void VE4_C(uint8_t* dst) { // vertical
const uint8_t* top = dst - BPS;
const uint8_t vals[4] = {
AVG3(top[-1], top[0], top[1]),
@ -255,8 +266,9 @@ static void VE4(uint8_t* dst) { // vertical
memcpy(dst + i * BPS, vals, sizeof(vals));
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void HE4(uint8_t* dst) { // horizontal
static void HE4_C(uint8_t* dst) { // horizontal
const int A = dst[-1 - BPS];
const int B = dst[-1];
const int C = dst[-1 + BPS];
@ -268,7 +280,8 @@ static void HE4(uint8_t* dst) { // horizontal
WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E));
}
static void DC4(uint8_t* dst) { // DC
#if !WEBP_NEON_OMIT_C_CODE
static void DC4_C(uint8_t* dst) { // DC
uint32_t dc = 4;
int i;
for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
@ -276,7 +289,7 @@ static void DC4(uint8_t* dst) { // DC
for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
}
static void RD4(uint8_t* dst) { // Down-right
static void RD4_C(uint8_t* dst) { // Down-right
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@ -295,7 +308,7 @@ static void RD4(uint8_t* dst) { // Down-right
DST(3, 0) = AVG3(D, C, B);
}
static void LD4(uint8_t* dst) { // Down-Left
static void LD4_C(uint8_t* dst) { // Down-Left
const int A = dst[0 - BPS];
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
@ -312,8 +325,9 @@ static void LD4(uint8_t* dst) { // Down-Left
DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
DST(3, 3) = AVG3(G, H, H);
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void VR4(uint8_t* dst) { // Vertical-Right
static void VR4_C(uint8_t* dst) { // Vertical-Right
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@ -335,7 +349,7 @@ static void VR4(uint8_t* dst) { // Vertical-Right
DST(3, 1) = AVG3(B, C, D);
}
static void VL4(uint8_t* dst) { // Vertical-Left
static void VL4_C(uint8_t* dst) { // Vertical-Left
const int A = dst[0 - BPS];
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
@ -357,7 +371,7 @@ static void VL4(uint8_t* dst) { // Vertical-Left
DST(3, 3) = AVG3(F, G, H);
}
static void HU4(uint8_t* dst) { // Horizontal-Up
static void HU4_C(uint8_t* dst) { // Horizontal-Up
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@ -372,7 +386,7 @@ static void HU4(uint8_t* dst) { // Horizontal-Up
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
static void HD4(uint8_t* dst) { // Horizontal-Down
static void HD4_C(uint8_t* dst) { // Horizontal-Down
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@ -404,14 +418,15 @@ VP8PredFunc VP8PredLuma4[NUM_BMODES];
//------------------------------------------------------------------------------
// Chroma
static void VE8uv(uint8_t* dst) { // vertical
#if !WEBP_NEON_OMIT_C_CODE
static void VE8uv_C(uint8_t* dst) { // vertical
int j;
for (j = 0; j < 8; ++j) {
memcpy(dst + j * BPS, dst - BPS, 8);
}
}
static void HE8uv(uint8_t* dst) { // horizontal
static void HE8uv_C(uint8_t* dst) { // horizontal
int j;
for (j = 0; j < 8; ++j) {
memset(dst, dst[-1], 8);
@ -427,7 +442,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
}
}
static void DC8uv(uint8_t* dst) { // DC
static void DC8uv_C(uint8_t* dst) { // DC
int dc0 = 8;
int i;
for (i = 0; i < 8; ++i) {
@ -436,7 +451,7 @@ static void DC8uv(uint8_t* dst) { // DC
Put8x8uv(dc0 >> 4, dst);
}
static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
static void DC8uvNoLeft_C(uint8_t* dst) { // DC with no left samples
int dc0 = 4;
int i;
for (i = 0; i < 8; ++i) {
@ -445,7 +460,7 @@ static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
Put8x8uv(dc0 >> 3, dst);
}
static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
static void DC8uvNoTop_C(uint8_t* dst) { // DC with no top samples
int dc0 = 4;
int i;
for (i = 0; i < 8; ++i) {
@ -454,17 +469,19 @@ static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
Put8x8uv(dc0 >> 3, dst);
}
static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
static void DC8uvNoTopLeft_C(uint8_t* dst) { // DC with nothing
Put8x8uv(0x80, dst);
}
#endif // !WEBP_NEON_OMIT_C_CODE
VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
//------------------------------------------------------------------------------
// Edge filtering functions
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
// 4 pixels in, 2 pixels out
static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
static WEBP_INLINE void DoFilter2_C(uint8_t* p, int step) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1]; // in [-893,892]
const int a1 = VP8ksclip2[(a + 4) >> 3]; // in [-16,15]
@ -474,7 +491,7 @@ static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
}
// 4 pixels in, 4 pixels out
static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
static WEBP_INLINE void DoFilter4_C(uint8_t* p, int step) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
const int a = 3 * (q0 - p0);
const int a1 = VP8ksclip2[(a + 4) >> 3];
@ -487,7 +504,7 @@ static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
}
// 6 pixels in, 6 pixels out
static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
static WEBP_INLINE void DoFilter6_C(uint8_t* p, int step) {
const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
const int q0 = p[0], q1 = p[step], q2 = p[2*step];
const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
@ -503,18 +520,22 @@ static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
p[ 2*step] = VP8kclip1[q2 - a3];
}
static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
static WEBP_INLINE int Hev(const uint8_t* p, int step, int thresh) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int NeedsFilter_C(const uint8_t* p, int step, int t) {
const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t);
}
#endif // !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int needs_filter2(const uint8_t* p,
int step, int t, int it) {
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static WEBP_INLINE int NeedsFilter2_C(const uint8_t* p,
int step, int t, int it) {
const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step];
const int p0 = p[-step], q0 = p[0];
const int q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
@ -523,140 +544,159 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it &&
VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it;
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
#if !WEBP_NEON_OMIT_C_CODE
static void SimpleVFilter16_C(uint8_t* p, int stride, int thresh) {
int i;
const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
if (needs_filter(p + i, stride, thresh2)) {
do_filter2(p + i, stride);
if (NeedsFilter_C(p + i, stride, thresh2)) {
DoFilter2_C(p + i, stride);
}
}
}
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
static void SimpleHFilter16_C(uint8_t* p, int stride, int thresh) {
int i;
const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
if (needs_filter(p + i * stride, 1, thresh2)) {
do_filter2(p + i * stride, 1);
if (NeedsFilter_C(p + i * stride, 1, thresh2)) {
DoFilter2_C(p + i * stride, 1);
}
}
}
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
static void SimpleVFilter16i_C(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
SimpleVFilter16(p, stride, thresh);
SimpleVFilter16_C(p, stride, thresh);
}
}
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
static void SimpleHFilter16i_C(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
SimpleHFilter16(p, stride, thresh);
SimpleHFilter16_C(p, stride, thresh);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)
static WEBP_INLINE void FilterLoop26(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static WEBP_INLINE void FilterLoop26_C(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh,
int hev_thresh) {
const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
if (needs_filter2(p, hstride, thresh2, ithresh)) {
if (hev(p, hstride, hev_thresh)) {
do_filter2(p, hstride);
if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
if (Hev(p, hstride, hev_thresh)) {
DoFilter2_C(p, hstride);
} else {
do_filter6(p, hstride);
DoFilter6_C(p, hstride);
}
}
p += vstride;
}
}
static WEBP_INLINE void FilterLoop24(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
static WEBP_INLINE void FilterLoop24_C(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh,
int hev_thresh) {
const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
if (needs_filter2(p, hstride, thresh2, ithresh)) {
if (hev(p, hstride, hev_thresh)) {
do_filter2(p, hstride);
if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
if (Hev(p, hstride, hev_thresh)) {
DoFilter2_C(p, hstride);
} else {
do_filter4(p, hstride);
DoFilter4_C(p, hstride);
}
}
p += vstride;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
#if !WEBP_NEON_OMIT_C_CODE
// on macroblock edges
static void VFilter16(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
static void VFilter16_C(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
static void HFilter16(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
static void HFilter16_C(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
// on three inner edges
static void VFilter16i(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter16i_C(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
FilterLoop24_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void HFilter16i(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static void HFilter16i_C(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
FilterLoop24_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
#if !WEBP_NEON_OMIT_C_CODE
// 8-pixels wide variant, for chroma filtering
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
static void VFilter8_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static void HFilter8_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
#if !WEBP_NEON_OMIT_C_CODE
static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
//------------------------------------------------------------------------------
static void DitherCombine8x8(const uint8_t* dither, uint8_t* dst,
int dst_stride) {
static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst,
int dst_stride) {
int i, j;
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i) {
@ -709,54 +749,66 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
VP8InitClipTables();
VP8TransformWHT = TransformWHT;
VP8Transform = TransformTwo;
VP8TransformUV = TransformUV;
VP8TransformDC = TransformDC;
VP8TransformDCUV = TransformDCUV;
VP8TransformAC3 = TransformAC3;
#if !WEBP_NEON_OMIT_C_CODE
VP8TransformWHT = TransformWHT_C;
VP8Transform = TransformTwo_C;
VP8TransformDC = TransformDC_C;
VP8TransformAC3 = TransformAC3_C;
#endif
VP8TransformUV = TransformUV_C;
VP8TransformDCUV = TransformDCUV_C;
VP8VFilter16 = VFilter16;
VP8HFilter16 = HFilter16;
VP8VFilter8 = VFilter8;
VP8HFilter8 = HFilter8;
VP8VFilter16i = VFilter16i;
VP8HFilter16i = HFilter16i;
VP8VFilter8i = VFilter8i;
VP8HFilter8i = HFilter8i;
VP8SimpleVFilter16 = SimpleVFilter16;
VP8SimpleHFilter16 = SimpleHFilter16;
VP8SimpleVFilter16i = SimpleVFilter16i;
VP8SimpleHFilter16i = SimpleHFilter16i;
#if !WEBP_NEON_OMIT_C_CODE
VP8VFilter16 = VFilter16_C;
VP8VFilter16i = VFilter16i_C;
VP8HFilter16 = HFilter16_C;
VP8VFilter8 = VFilter8_C;
VP8VFilter8i = VFilter8i_C;
VP8SimpleVFilter16 = SimpleVFilter16_C;
VP8SimpleHFilter16 = SimpleHFilter16_C;
VP8SimpleVFilter16i = SimpleVFilter16i_C;
VP8SimpleHFilter16i = SimpleHFilter16i_C;
#endif
VP8PredLuma4[0] = DC4;
VP8PredLuma4[1] = TM4;
VP8PredLuma4[2] = VE4;
VP8PredLuma4[3] = HE4;
VP8PredLuma4[4] = RD4;
VP8PredLuma4[5] = VR4;
VP8PredLuma4[6] = LD4;
VP8PredLuma4[7] = VL4;
VP8PredLuma4[8] = HD4;
VP8PredLuma4[9] = HU4;
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
VP8HFilter16i = HFilter16i_C;
VP8HFilter8 = HFilter8_C;
VP8HFilter8i = HFilter8i_C;
#endif
VP8PredLuma16[0] = DC16;
VP8PredLuma16[1] = TM16;
VP8PredLuma16[2] = VE16;
VP8PredLuma16[3] = HE16;
VP8PredLuma16[4] = DC16NoTop;
VP8PredLuma16[5] = DC16NoLeft;
VP8PredLuma16[6] = DC16NoTopLeft;
#if !WEBP_NEON_OMIT_C_CODE
VP8PredLuma4[0] = DC4_C;
VP8PredLuma4[1] = TM4_C;
VP8PredLuma4[2] = VE4_C;
VP8PredLuma4[4] = RD4_C;
VP8PredLuma4[6] = LD4_C;
#endif
VP8PredChroma8[0] = DC8uv;
VP8PredChroma8[1] = TM8uv;
VP8PredChroma8[2] = VE8uv;
VP8PredChroma8[3] = HE8uv;
VP8PredChroma8[4] = DC8uvNoTop;
VP8PredChroma8[5] = DC8uvNoLeft;
VP8PredChroma8[6] = DC8uvNoTopLeft;
VP8PredLuma4[3] = HE4_C;
VP8PredLuma4[5] = VR4_C;
VP8PredLuma4[7] = VL4_C;
VP8PredLuma4[8] = HD4_C;
VP8PredLuma4[9] = HU4_C;
VP8DitherCombine8x8 = DitherCombine8x8;
#if !WEBP_NEON_OMIT_C_CODE
VP8PredLuma16[0] = DC16_C;
VP8PredLuma16[1] = TM16_C;
VP8PredLuma16[2] = VE16_C;
VP8PredLuma16[3] = HE16_C;
VP8PredLuma16[4] = DC16NoTop_C;
VP8PredLuma16[5] = DC16NoLeft_C;
VP8PredLuma16[6] = DC16NoTopLeft_C;
VP8PredChroma8[0] = DC8uv_C;
VP8PredChroma8[1] = TM8uv_C;
VP8PredChroma8[2] = VE8uv_C;
VP8PredChroma8[3] = HE8uv_C;
VP8PredChroma8[4] = DC8uvNoTop_C;
VP8PredChroma8[5] = DC8uvNoLeft_C;
VP8PredChroma8[6] = DC8uvNoTopLeft_C;
#endif
VP8DitherCombine8x8 = DitherCombine8x8_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@ -770,11 +822,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
#endif
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
VP8DspInitNEON();
}
#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8DspInitMIPS32();
@ -791,5 +838,57 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8DspInitNEON();
}
#endif
assert(VP8TransformWHT != NULL);
assert(VP8Transform != NULL);
assert(VP8TransformDC != NULL);
assert(VP8TransformAC3 != NULL);
assert(VP8TransformUV != NULL);
assert(VP8TransformDCUV != NULL);
assert(VP8VFilter16 != NULL);
assert(VP8HFilter16 != NULL);
assert(VP8VFilter8 != NULL);
assert(VP8HFilter8 != NULL);
assert(VP8VFilter16i != NULL);
assert(VP8HFilter16i != NULL);
assert(VP8VFilter8i != NULL);
assert(VP8HFilter8i != NULL);
assert(VP8SimpleVFilter16 != NULL);
assert(VP8SimpleHFilter16 != NULL);
assert(VP8SimpleVFilter16i != NULL);
assert(VP8SimpleHFilter16i != NULL);
assert(VP8PredLuma4[0] != NULL);
assert(VP8PredLuma4[1] != NULL);
assert(VP8PredLuma4[2] != NULL);
assert(VP8PredLuma4[3] != NULL);
assert(VP8PredLuma4[4] != NULL);
assert(VP8PredLuma4[5] != NULL);
assert(VP8PredLuma4[6] != NULL);
assert(VP8PredLuma4[7] != NULL);
assert(VP8PredLuma4[8] != NULL);
assert(VP8PredLuma4[9] != NULL);
assert(VP8PredLuma16[0] != NULL);
assert(VP8PredLuma16[1] != NULL);
assert(VP8PredLuma16[2] != NULL);
assert(VP8PredLuma16[3] != NULL);
assert(VP8PredLuma16[4] != NULL);
assert(VP8PredLuma16[5] != NULL);
assert(VP8PredLuma16[6] != NULL);
assert(VP8PredChroma8[0] != NULL);
assert(VP8PredChroma8[1] != NULL);
assert(VP8PredChroma8[2] != NULL);
assert(VP8PredChroma8[3] != NULL);
assert(VP8PredChroma8[4] != NULL);
assert(VP8PredChroma8[5] != NULL);
assert(VP8PredChroma8[6] != NULL);
assert(VP8DitherCombine8x8 != NULL);
dec_last_cpuinfo_used = VP8GetCPUInfo;
}

View file

@ -11,11 +11,14 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#define USE_STATIC_TABLES // undefine to have run-time table initialization
// define to 0 to have run-time table initialization
#if !defined(USE_STATIC_TABLES)
#define USE_STATIC_TABLES 1 // ALTERNATE_CODE
#endif
#ifdef USE_STATIC_TABLES
#if (USE_STATIC_TABLES == 1)
static const uint8_t abs0[255 + 255 + 1] = {
0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4,
@ -337,7 +340,7 @@ static uint8_t clip1[255 + 511 + 1];
// and make sure it's set to true _last_ (so as to be thread-safe)
static volatile int tables_ok = 0;
#endif
#endif // USE_STATIC_TABLES
const int8_t* const VP8ksclip1 = (const int8_t*)&sclip1[1020];
const int8_t* const VP8ksclip2 = (const int8_t*)&sclip2[112];
@ -345,7 +348,7 @@ const uint8_t* const VP8kclip1 = &clip1[255];
const uint8_t* const VP8kabs0 = &abs0[255];
WEBP_TSAN_IGNORE_FUNCTION void VP8InitClipTables(void) {
#if !defined(USE_STATIC_TABLES)
#if (USE_STATIC_TABLES == 0)
int i;
if (!tables_ok) {
for (i = -255; i <= 255; ++i) {

View file

@ -12,11 +12,11 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "./mips_macro.h"
#include "src/dsp/mips_macro.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;

View file

@ -12,11 +12,11 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "./mips_macro.h"
#include "src/dsp/mips_macro.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;

View file

@ -12,11 +12,11 @@
// Author(s): Prashant Patil (prashant.patil@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
#include "./msa_macro.h"
#include "src/dsp/msa_macro.h"
//------------------------------------------------------------------------------
// Transforms
@ -222,6 +222,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
const v16i8 cnst4b = __msa_ldi_b(4); \
const v16i8 cnst3b = __msa_ldi_b(3); \
const v8i16 cnst9h = __msa_ldi_h(9); \
const v8i16 cnst63h = __msa_ldi_h(63); \
\
FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m); \
filt = __msa_subs_s_b(p1_m, q1_m); \
@ -241,9 +242,9 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \
/* update q2/p2 */ \
temp0 = filt_r * cnst9h; \
temp1 = ADDVI_H(temp0, 63); \
temp1 = temp0 + cnst63h; \
temp2 = filt_l * cnst9h; \
temp3 = ADDVI_H(temp2, 63); \
temp3 = temp2 + cnst63h; \
FILT2(q2_m, p2_m, q2, p2); \
/* update q1/p1 */ \
temp1 = temp1 + temp0; \
@ -708,7 +709,7 @@ static void VE4(uint8_t* dst) { // vertical
const uint32_t val0 = LW(ptop + 0);
const uint32_t val1 = LW(ptop + 4);
uint32_t out;
v16u8 A, B, C, AC, B2, R;
v16u8 A = { 0 }, B, C, AC, B2, R;
INSERT_W2_UB(val0, val1, A);
B = SLDI_UB(A, A, 1);
@ -725,7 +726,7 @@ static void RD4(uint8_t* dst) { // Down-right
uint32_t val0 = LW(ptop + 0);
uint32_t val1 = LW(ptop + 4);
uint32_t val2, val3;
v16u8 A, B, C, AC, B2, R, A1;
v16u8 A, B, C, AC, B2, R, A1 = { 0 };
INSERT_W2_UB(val0, val1, A1);
A = SLDI_UB(A1, A1, 12);
@ -753,7 +754,7 @@ static void LD4(uint8_t* dst) { // Down-Left
uint32_t val0 = LW(ptop + 0);
uint32_t val1 = LW(ptop + 4);
uint32_t val2, val3;
v16u8 A, B, C, AC, B2, R;
v16u8 A = { 0 }, B, C, AC, B2, R;
INSERT_W2_UB(val0, val1, A);
B = SLDI_UB(A, A, 1);

File diff suppressed because it is too large Load diff

View file

@ -12,23 +12,25 @@
// Author: somnath@google.com (Somnath Banerjee)
// cduvivier@google.com (Christian Duvivier)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
// The 3-coeff sparse transform in SSE2 is not really faster than the plain-C
// one it seems => disable it by default. Uncomment the following to enable:
// #define USE_TRANSFORM_AC3
#if !defined(USE_TRANSFORM_AC3)
#define USE_TRANSFORM_AC3 0 // ALTERNATE_CODE
#endif
#include <emmintrin.h>
#include "./common_sse2.h"
#include "../dec/vp8i_dec.h"
#include "../utils/utils.h"
#include "src/dsp/common_sse2.h"
#include "src/dec/vp8i_dec.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@ -193,7 +195,7 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
}
}
#if defined(USE_TRANSFORM_AC3)
#if (USE_TRANSFORM_AC3 == 1)
#define MUL(a, b) (((a) * (b)) >> 16)
static void TransformAC3(const int16_t* in, uint8_t* dst) {
static const int kC1 = 20091 + (1 << 16);
@ -248,7 +250,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
_mm_subs_epu8((p), (q)))
// Shift each byte of "x" by 3 bits while preserving by the sign bit.
static WEBP_INLINE void SignedShift8b(__m128i* const x) {
static WEBP_INLINE void SignedShift8b_SSE2(__m128i* const x) {
const __m128i zero = _mm_setzero_si128();
const __m128i lo_0 = _mm_unpacklo_epi8(zero, *x);
const __m128i hi_0 = _mm_unpackhi_epi8(zero, *x);
@ -258,8 +260,8 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) {
}
#define FLIP_SIGN_BIT2(a, b) { \
a = _mm_xor_si128(a, sign_bit); \
b = _mm_xor_si128(b, sign_bit); \
(a) = _mm_xor_si128(a, sign_bit); \
(b) = _mm_xor_si128(b, sign_bit); \
}
#define FLIP_SIGN_BIT4(a, b, c, d) { \
@ -268,11 +270,11 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) {
}
// input/output is uint8_t
static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
int hev_thresh, __m128i* const not_hev) {
static WEBP_INLINE void GetNotHEV_SSE2(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
int hev_thresh, __m128i* const not_hev) {
const __m128i zero = _mm_setzero_si128();
const __m128i t_1 = MM_ABS(*p1, *p0);
const __m128i t_2 = MM_ABS(*q1, *q0);
@ -285,11 +287,11 @@ static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
}
// input pixels are int8_t
static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
__m128i* const delta) {
static WEBP_INLINE void GetBaseDelta_SSE2(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
__m128i* const delta) {
// beware of addition order, for saturation!
const __m128i p1_q1 = _mm_subs_epi8(*p1, *q1); // p1 - q1
const __m128i q0_p0 = _mm_subs_epi8(*q0, *p0); // q0 - p0
@ -300,15 +302,16 @@ static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
}
// input and output are int8_t
static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
const __m128i* const fl) {
static WEBP_INLINE void DoSimpleFilter_SSE2(__m128i* const p0,
__m128i* const q0,
const __m128i* const fl) {
const __m128i k3 = _mm_set1_epi8(3);
const __m128i k4 = _mm_set1_epi8(4);
__m128i v3 = _mm_adds_epi8(*fl, k3);
__m128i v4 = _mm_adds_epi8(*fl, k4);
SignedShift8b(&v4); // v4 >> 3
SignedShift8b(&v3); // v3 >> 3
SignedShift8b_SSE2(&v4); // v4 >> 3
SignedShift8b_SSE2(&v3); // v3 >> 3
*q0 = _mm_subs_epi8(*q0, v4); // q0 -= v4
*p0 = _mm_adds_epi8(*p0, v3); // p0 += v3
}
@ -317,9 +320,9 @@ static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
// Update operations:
// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
// Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
const __m128i* const a0_lo,
const __m128i* const a0_hi) {
static WEBP_INLINE void Update2Pixels_SSE2(__m128i* const pi, __m128i* const qi,
const __m128i* const a0_lo,
const __m128i* const a0_hi) {
const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
@ -330,11 +333,11 @@ static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
}
// input pixels are uint8_t
static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
int thresh, __m128i* const mask) {
static WEBP_INLINE void NeedsFilter_SSE2(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
int thresh, __m128i* const mask) {
const __m128i m_thresh = _mm_set1_epi8(thresh);
const __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1)
const __m128i kFE = _mm_set1_epi8(0xFE);
@ -353,28 +356,29 @@ static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
// Edge filtering functions
// Applies filter on 2 pixels (p0 and q0)
static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0,
__m128i* const q0, __m128i* const q1,
int thresh) {
static WEBP_INLINE void DoFilter2_SSE2(__m128i* const p1, __m128i* const p0,
__m128i* const q0, __m128i* const q1,
int thresh) {
__m128i a, mask;
const __m128i sign_bit = _mm_set1_epi8(0x80);
// convert p1/q1 to int8_t (for GetBaseDelta)
// convert p1/q1 to int8_t (for GetBaseDelta_SSE2)
const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
const __m128i q1s = _mm_xor_si128(*q1, sign_bit);
NeedsFilter(p1, p0, q0, q1, thresh, &mask);
NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &mask);
FLIP_SIGN_BIT2(*p0, *q0);
GetBaseDelta(&p1s, p0, q0, &q1s, &a);
GetBaseDelta_SSE2(&p1s, p0, q0, &q1s, &a);
a = _mm_and_si128(a, mask); // mask filter values we don't care about
DoSimpleFilter(p0, q0, &a);
DoSimpleFilter_SSE2(p0, q0, &a);
FLIP_SIGN_BIT2(*p0, *q0);
}
// Applies filter on 4 pixels (p1, p0, q0 and q1)
static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
__m128i* const q0, __m128i* const q1,
const __m128i* const mask, int hev_thresh) {
static WEBP_INLINE void DoFilter4_SSE2(__m128i* const p1, __m128i* const p0,
__m128i* const q0, __m128i* const q1,
const __m128i* const mask,
int hev_thresh) {
const __m128i zero = _mm_setzero_si128();
const __m128i sign_bit = _mm_set1_epi8(0x80);
const __m128i k64 = _mm_set1_epi8(64);
@ -384,7 +388,7 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
__m128i t1, t2, t3;
// compute hev mask
GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, &not_hev);
// convert to signed values
FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
@ -399,8 +403,8 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
t2 = _mm_adds_epi8(t1, k3); // 3 * (q0 - p0) + hev(p1 - q1) + 3
t3 = _mm_adds_epi8(t1, k4); // 3 * (q0 - p0) + hev(p1 - q1) + 4
SignedShift8b(&t2); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
SignedShift8b(&t3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
SignedShift8b_SSE2(&t2); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
SignedShift8b_SSE2(&t3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
*p0 = _mm_adds_epi8(*p0, t2); // p0 += t2
*q0 = _mm_subs_epi8(*q0, t3); // q0 -= t3
FLIP_SIGN_BIT2(*p0, *q0);
@ -417,25 +421,26 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
}
// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
__m128i* const p0, __m128i* const q0,
__m128i* const q1, __m128i* const q2,
const __m128i* const mask, int hev_thresh) {
static WEBP_INLINE void DoFilter6_SSE2(__m128i* const p2, __m128i* const p1,
__m128i* const p0, __m128i* const q0,
__m128i* const q1, __m128i* const q2,
const __m128i* const mask,
int hev_thresh) {
const __m128i zero = _mm_setzero_si128();
const __m128i sign_bit = _mm_set1_epi8(0x80);
__m128i a, not_hev;
// compute hev mask
GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, &not_hev);
FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
FLIP_SIGN_BIT2(*p2, *q2);
GetBaseDelta(p1, p0, q0, q1, &a);
GetBaseDelta_SSE2(p1, p0, q0, q1, &a);
{ // do simple filter on pixels with hev
const __m128i m = _mm_andnot_si128(not_hev, *mask);
const __m128i f = _mm_and_si128(a, m);
DoSimpleFilter(p0, q0, &f);
DoSimpleFilter_SSE2(p0, q0, &f);
}
{ // do strong filter on pixels with not hev
@ -460,15 +465,15 @@ static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo); // Filter * 27 + 63
const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi); // Filter * 27 + 63
Update2Pixels(p2, q2, &a2_lo, &a2_hi);
Update2Pixels(p1, q1, &a1_lo, &a1_hi);
Update2Pixels(p0, q0, &a0_lo, &a0_hi);
Update2Pixels_SSE2(p2, q2, &a2_lo, &a2_hi);
Update2Pixels_SSE2(p1, q1, &a1_lo, &a1_hi);
Update2Pixels_SSE2(p0, q0, &a0_lo, &a0_hi);
}
}
// reads 8 rows across a vertical edge.
static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
__m128i* const p, __m128i* const q) {
static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride,
__m128i* const p, __m128i* const q) {
// A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00
// A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10
const __m128i A0 = _mm_set_epi32(
@ -494,11 +499,11 @@ static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
*q = _mm_unpackhi_epi32(C0, C1);
}
static WEBP_INLINE void Load16x4(const uint8_t* const r0,
const uint8_t* const r8,
int stride,
__m128i* const p1, __m128i* const p0,
__m128i* const q0, __m128i* const q1) {
static WEBP_INLINE void Load16x4_SSE2(const uint8_t* const r0,
const uint8_t* const r8,
int stride,
__m128i* const p1, __m128i* const p0,
__m128i* const q0, __m128i* const q1) {
// Assume the pixels around the edge (|) are numbered as follows
// 00 01 | 02 03
// 10 11 | 12 13
@ -514,8 +519,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
// q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
// p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
// q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
Load8x4(r0, stride, p1, q0);
Load8x4(r8, stride, p0, q1);
Load8x4_SSE2(r0, stride, p1, q0);
Load8x4_SSE2(r8, stride, p0, q1);
{
// p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
@ -531,7 +536,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
}
}
static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
static WEBP_INLINE void Store4x4_SSE2(__m128i* const x,
uint8_t* dst, int stride) {
int i;
for (i = 0; i < 4; ++i, dst += stride) {
WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x));
@ -540,12 +546,12 @@ static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
}
// Transpose back and store
static WEBP_INLINE void Store16x4(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
uint8_t* r0, uint8_t* r8,
int stride) {
static WEBP_INLINE void Store16x4_SSE2(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
uint8_t* r0, uint8_t* r8,
int stride) {
__m128i t1, p1_s, p0_s, q0_s, q1_s;
// p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
@ -572,55 +578,55 @@ static WEBP_INLINE void Store16x4(const __m128i* const p1,
p1_s = _mm_unpacklo_epi16(t1, q1_s);
q1_s = _mm_unpackhi_epi16(t1, q1_s);
Store4x4(&p0_s, r0, stride);
Store4x4_SSE2(&p0_s, r0, stride);
r0 += 4 * stride;
Store4x4(&q0_s, r0, stride);
Store4x4_SSE2(&q0_s, r0, stride);
Store4x4(&p1_s, r8, stride);
Store4x4_SSE2(&p1_s, r8, stride);
r8 += 4 * stride;
Store4x4(&q1_s, r8, stride);
Store4x4_SSE2(&q1_s, r8, stride);
}
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
static void SimpleVFilter16_SSE2(uint8_t* p, int stride, int thresh) {
// Load
__m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);
__m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]);
__m128i q0 = _mm_loadu_si128((__m128i*)&p[0]);
__m128i q1 = _mm_loadu_si128((__m128i*)&p[stride]);
DoFilter2(&p1, &p0, &q0, &q1, thresh);
DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh);
// Store
_mm_storeu_si128((__m128i*)&p[-stride], p0);
_mm_storeu_si128((__m128i*)&p[0], q0);
}
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
static void SimpleHFilter16_SSE2(uint8_t* p, int stride, int thresh) {
__m128i p1, p0, q0, q1;
p -= 2; // beginning of p1
Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
DoFilter2(&p1, &p0, &q0, &q1, thresh);
Store16x4(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
Load16x4_SSE2(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh);
Store16x4_SSE2(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
}
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
static void SimpleVFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
SimpleVFilter16(p, stride, thresh);
SimpleVFilter16_SSE2(p, stride, thresh);
}
}
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
SimpleHFilter16(p, stride, thresh);
SimpleHFilter16_SSE2(p, stride, thresh);
}
}
@ -628,60 +634,60 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
// Complex In-loop filtering (Paragraph 15.3)
#define MAX_DIFF1(p3, p2, p1, p0, m) do { \
m = MM_ABS(p1, p0); \
m = _mm_max_epu8(m, MM_ABS(p3, p2)); \
m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
(m) = MM_ABS(p1, p0); \
(m) = _mm_max_epu8(m, MM_ABS(p3, p2)); \
(m) = _mm_max_epu8(m, MM_ABS(p2, p1)); \
} while (0)
#define MAX_DIFF2(p3, p2, p1, p0, m) do { \
m = _mm_max_epu8(m, MM_ABS(p1, p0)); \
m = _mm_max_epu8(m, MM_ABS(p3, p2)); \
m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
(m) = _mm_max_epu8(m, MM_ABS(p1, p0)); \
(m) = _mm_max_epu8(m, MM_ABS(p3, p2)); \
(m) = _mm_max_epu8(m, MM_ABS(p2, p1)); \
} while (0)
#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) { \
e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]); \
e2 = _mm_loadu_si128((__m128i*)&(p)[1 * stride]); \
e3 = _mm_loadu_si128((__m128i*)&(p)[2 * stride]); \
e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]); \
(e1) = _mm_loadu_si128((__m128i*)&(p)[0 * (stride)]); \
(e2) = _mm_loadu_si128((__m128i*)&(p)[1 * (stride)]); \
(e3) = _mm_loadu_si128((__m128i*)&(p)[2 * (stride)]); \
(e4) = _mm_loadu_si128((__m128i*)&(p)[3 * (stride)]); \
}
#define LOADUV_H_EDGE(p, u, v, stride) do { \
const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]); \
const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]); \
p = _mm_unpacklo_epi64(U, V); \
(p) = _mm_unpacklo_epi64(U, V); \
} while (0)
#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) { \
LOADUV_H_EDGE(e1, u, v, 0 * stride); \
LOADUV_H_EDGE(e2, u, v, 1 * stride); \
LOADUV_H_EDGE(e3, u, v, 2 * stride); \
LOADUV_H_EDGE(e4, u, v, 3 * stride); \
LOADUV_H_EDGE(e1, u, v, 0 * (stride)); \
LOADUV_H_EDGE(e2, u, v, 1 * (stride)); \
LOADUV_H_EDGE(e3, u, v, 2 * (stride)); \
LOADUV_H_EDGE(e4, u, v, 3 * (stride)); \
}
#define STOREUV(p, u, v, stride) { \
_mm_storel_epi64((__m128i*)&u[(stride)], p); \
p = _mm_srli_si128(p, 8); \
_mm_storel_epi64((__m128i*)&v[(stride)], p); \
_mm_storel_epi64((__m128i*)&(u)[(stride)], p); \
(p) = _mm_srli_si128(p, 8); \
_mm_storel_epi64((__m128i*)&(v)[(stride)], p); \
}
static WEBP_INLINE void ComplexMask(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
int thresh, int ithresh,
__m128i* const mask) {
static WEBP_INLINE void ComplexMask_SSE2(const __m128i* const p1,
const __m128i* const p0,
const __m128i* const q0,
const __m128i* const q1,
int thresh, int ithresh,
__m128i* const mask) {
const __m128i it = _mm_set1_epi8(ithresh);
const __m128i diff = _mm_subs_epu8(*mask, it);
const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128());
__m128i filter_mask;
NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &filter_mask);
*mask = _mm_and_si128(thresh_mask, filter_mask);
}
// on macroblock edges
static void VFilter16(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter16_SSE2(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i t1;
__m128i mask;
__m128i p2, p1, p0, q0, q1, q2;
@ -694,8 +700,8 @@ static void VFilter16(uint8_t* p, int stride,
LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
MAX_DIFF2(t1, q2, q1, q0, mask);
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
// Store
_mm_storeu_si128((__m128i*)&p[-3 * stride], p2);
@ -706,28 +712,28 @@ static void VFilter16(uint8_t* p, int stride,
_mm_storeu_si128((__m128i*)&p[+2 * stride], q2);
}
static void HFilter16(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter16_SSE2(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
uint8_t* const b = p - 4;
Load16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0); // p3, p2, p1, p0
Load16x4_SSE2(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);
MAX_DIFF1(p3, p2, p1, p0, mask);
Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3
Load16x4_SSE2(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);
MAX_DIFF2(q3, q2, q1, q0, mask);
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
Store16x4(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
Store16x4(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
Store16x4_SSE2(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
Store16x4_SSE2(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
}
// on three inner edges
static void VFilter16i(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter16i_SSE2(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
__m128i p3, p2, p1, p0; // loop invariants
@ -744,8 +750,8 @@ static void VFilter16i(uint8_t* p, int stride,
// p3 and p2 are not just temporary variables here: they will be
// re-used for next span. And q2/q3 will become p1/p0 accordingly.
ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh);
// Store
_mm_storeu_si128((__m128i*)&b[0 * stride], p1);
@ -759,12 +765,12 @@ static void VFilter16i(uint8_t* p, int stride,
}
}
static void HFilter16i(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter16i_SSE2(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
__m128i p3, p2, p1, p0; // loop invariants
Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0); // prologue
Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0); // prologue
for (k = 3; k > 0; --k) {
__m128i mask, tmp1, tmp2;
@ -773,13 +779,13 @@ static void HFilter16i(uint8_t* p, int stride,
p += 4; // beginning of q0 (and next span)
MAX_DIFF1(p3, p2, p1, p0, mask); // compute partial mask
Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh);
Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
Store16x4_SSE2(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
// rotate samples
p1 = tmp1;
@ -788,8 +794,8 @@ static void HFilter16i(uint8_t* p, int stride,
}
// 8-pixels wide variant, for chroma filtering
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, p2, p1, p0, q0, q1, q2;
@ -801,8 +807,8 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride,
LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1);
MAX_DIFF2(t1, q2, q1, q0, mask);
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
// Store
STOREUV(p2, u, v, -3 * stride);
@ -813,28 +819,28 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride,
STOREUV(q2, u, v, 2 * stride);
}
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
uint8_t* const tu = u - 4;
uint8_t* const tv = v - 4;
Load16x4(tu, tv, stride, &p3, &p2, &p1, &p0); // p3, p2, p1, p0
Load16x4_SSE2(tu, tv, stride, &p3, &p2, &p1, &p0);
MAX_DIFF1(p3, p2, p1, p0, mask);
Load16x4(u, v, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3
Load16x4_SSE2(u, v, stride, &q0, &q1, &q2, &q3);
MAX_DIFF2(q3, q2, q1, q0, mask);
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
Store16x4(&p3, &p2, &p1, &p0, tu, tv, stride);
Store16x4(&q0, &q1, &q2, &q3, u, v, stride);
Store16x4_SSE2(&p3, &p2, &p1, &p0, tu, tv, stride);
Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride);
}
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;
@ -849,8 +855,8 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
MAX_DIFF2(t2, t1, q1, q0, mask);
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh);
// Store
STOREUV(p1, u, v, -2 * stride);
@ -859,24 +865,24 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
STOREUV(q1, u, v, 1 * stride);
}
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
__m128i mask;
__m128i t1, t2, p1, p0, q0, q1;
Load16x4(u, v, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0
Load16x4_SSE2(u, v, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0
MAX_DIFF1(t2, t1, p1, p0, mask);
u += 4; // beginning of q0
v += 4;
Load16x4(u, v, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3
Load16x4_SSE2(u, v, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3
MAX_DIFF2(t2, t1, q1, q0, mask);
ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh);
u -= 2; // beginning of p1
v -= 2;
Store16x4(&p1, &p0, &q0, &q1, u, v, stride);
Store16x4_SSE2(&p1, &p0, &q0, &q1, u, v, stride);
}
//------------------------------------------------------------------------------
@ -893,7 +899,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
// where: AC = (a + b + 1) >> 1, BC = (b + c + 1) >> 1
// and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
static void VE4(uint8_t* dst) { // vertical
static void VE4_SSE2(uint8_t* dst) { // vertical
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@ -909,7 +915,7 @@ static void VE4(uint8_t* dst) { // vertical
}
}
static void LD4(uint8_t* dst) { // Down-Left
static void LD4_SSE2(uint8_t* dst) { // Down-Left
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@ -925,7 +931,7 @@ static void LD4(uint8_t* dst) { // Down-Left
WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
}
static void VR4(uint8_t* dst) { // Vertical-Right
static void VR4_SSE2(uint8_t* dst) { // Vertical-Right
const __m128i one = _mm_set1_epi8(1);
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
@ -950,7 +956,7 @@ static void VR4(uint8_t* dst) { // Vertical-Right
DST(0, 3) = AVG3(K, J, I);
}
static void VL4(uint8_t* dst) { // Vertical-Left
static void VL4_SSE2(uint8_t* dst) { // Vertical-Left
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
@ -975,7 +981,7 @@ static void VL4(uint8_t* dst) { // Vertical-Left
DST(3, 3) = (extra_out >> 8) & 0xff;
}
static void RD4(uint8_t* dst) { // Down-right
static void RD4_SSE2(uint8_t* dst) { // Down-right
const __m128i one = _mm_set1_epi8(1);
const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
const __m128i ____XABCD = _mm_slli_si128(XABCD, 4);
@ -1004,7 +1010,7 @@ static void RD4(uint8_t* dst) { // Down-right
//------------------------------------------------------------------------------
// Luma 16x16
static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) {
const uint8_t* top = dst - BPS;
const __m128i zero = _mm_setzero_si128();
int y;
@ -1041,11 +1047,11 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
}
}
static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
static void TM16(uint8_t* dst) { TrueMotion(dst, 16); }
static void TM4_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 4); }
static void TM8uv_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 8); }
static void TM16_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 16); }
static void VE16(uint8_t* dst) {
static void VE16_SSE2(uint8_t* dst) {
const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
int j;
for (j = 0; j < 16; ++j) {
@ -1053,7 +1059,7 @@ static void VE16(uint8_t* dst) {
}
}
static void HE16(uint8_t* dst) { // horizontal
static void HE16_SSE2(uint8_t* dst) { // horizontal
int j;
for (j = 16; j > 0; --j) {
const __m128i values = _mm_set1_epi8(dst[-1]);
@ -1062,7 +1068,7 @@ static void HE16(uint8_t* dst) { // horizontal
}
}
static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
int j;
const __m128i values = _mm_set1_epi8(v);
for (j = 0; j < 16; ++j) {
@ -1070,7 +1076,7 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
}
}
static void DC16(uint8_t* dst) { // DC
static void DC16_SSE2(uint8_t* dst) { // DC
const __m128i zero = _mm_setzero_si128();
const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
const __m128i sad8x2 = _mm_sad_epu8(top, zero);
@ -1083,37 +1089,37 @@ static void DC16(uint8_t* dst) { // DC
}
{
const int DC = _mm_cvtsi128_si32(sum) + left + 16;
Put16(DC >> 5, dst);
Put16_SSE2(DC >> 5, dst);
}
}
static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
static void DC16NoTop_SSE2(uint8_t* dst) { // DC with top samples unavailable
int DC = 8;
int j;
for (j = 0; j < 16; ++j) {
DC += dst[-1 + j * BPS];
}
Put16(DC >> 4, dst);
Put16_SSE2(DC >> 4, dst);
}
static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
static void DC16NoLeft_SSE2(uint8_t* dst) { // DC with left samples unavailable
const __m128i zero = _mm_setzero_si128();
const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
const __m128i sad8x2 = _mm_sad_epu8(top, zero);
// sum the two sads: sad8x2[0:1] + sad8x2[8:9]
const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
const int DC = _mm_cvtsi128_si32(sum) + 8;
Put16(DC >> 4, dst);
Put16_SSE2(DC >> 4, dst);
}
static void DC16NoTopLeft(uint8_t* dst) { // DC with no top and left samples
Put16(0x80, dst);
static void DC16NoTopLeft_SSE2(uint8_t* dst) { // DC with no top & left samples
Put16_SSE2(0x80, dst);
}
//------------------------------------------------------------------------------
// Chroma
static void VE8uv(uint8_t* dst) { // vertical
static void VE8uv_SSE2(uint8_t* dst) { // vertical
int j;
const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
for (j = 0; j < 8; ++j) {
@ -1121,17 +1127,8 @@ static void VE8uv(uint8_t* dst) { // vertical
}
}
static void HE8uv(uint8_t* dst) { // horizontal
int j;
for (j = 0; j < 8; ++j) {
const __m128i values = _mm_set1_epi8(dst[-1]);
_mm_storel_epi64((__m128i*)dst, values);
dst += BPS;
}
}
// helper for chroma-DC predictions
static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
int j;
const __m128i values = _mm_set1_epi8(v);
for (j = 0; j < 8; ++j) {
@ -1139,7 +1136,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
}
}
static void DC8uv(uint8_t* dst) { // DC
static void DC8uv_SSE2(uint8_t* dst) { // DC
const __m128i zero = _mm_setzero_si128();
const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
const __m128i sum = _mm_sad_epu8(top, zero);
@ -1150,29 +1147,29 @@ static void DC8uv(uint8_t* dst) { // DC
}
{
const int DC = _mm_cvtsi128_si32(sum) + left + 8;
Put8x8uv(DC >> 4, dst);
Put8x8uv_SSE2(DC >> 4, dst);
}
}
static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
static void DC8uvNoLeft_SSE2(uint8_t* dst) { // DC with no left samples
const __m128i zero = _mm_setzero_si128();
const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
const __m128i sum = _mm_sad_epu8(top, zero);
const int DC = _mm_cvtsi128_si32(sum) + 4;
Put8x8uv(DC >> 3, dst);
Put8x8uv_SSE2(DC >> 3, dst);
}
static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
static void DC8uvNoTop_SSE2(uint8_t* dst) { // DC with no top samples
int dc0 = 4;
int i;
for (i = 0; i < 8; ++i) {
dc0 += dst[-1 + i * BPS];
}
Put8x8uv(dc0 >> 3, dst);
Put8x8uv_SSE2(dc0 >> 3, dst);
}
static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
Put8x8uv(0x80, dst);
static void DC8uvNoTopLeft_SSE2(uint8_t* dst) { // DC with nothing
Put8x8uv_SSE2(0x80, dst);
}
//------------------------------------------------------------------------------
@ -1181,47 +1178,46 @@ static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
extern void VP8DspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) {
VP8Transform = Transform;
#if defined(USE_TRANSFORM_AC3)
VP8TransformAC3 = TransformAC3;
VP8Transform = Transform_SSE2;
#if (USE_TRANSFORM_AC3 == 1)
VP8TransformAC3 = TransformAC3_SSE2;
#endif
VP8VFilter16 = VFilter16;
VP8HFilter16 = HFilter16;
VP8VFilter8 = VFilter8;
VP8HFilter8 = HFilter8;
VP8VFilter16i = VFilter16i;
VP8HFilter16i = HFilter16i;
VP8VFilter8i = VFilter8i;
VP8HFilter8i = HFilter8i;
VP8VFilter16 = VFilter16_SSE2;
VP8HFilter16 = HFilter16_SSE2;
VP8VFilter8 = VFilter8_SSE2;
VP8HFilter8 = HFilter8_SSE2;
VP8VFilter16i = VFilter16i_SSE2;
VP8HFilter16i = HFilter16i_SSE2;
VP8VFilter8i = VFilter8i_SSE2;
VP8HFilter8i = HFilter8i_SSE2;
VP8SimpleVFilter16 = SimpleVFilter16;
VP8SimpleHFilter16 = SimpleHFilter16;
VP8SimpleVFilter16i = SimpleVFilter16i;
VP8SimpleHFilter16i = SimpleHFilter16i;
VP8SimpleVFilter16 = SimpleVFilter16_SSE2;
VP8SimpleHFilter16 = SimpleHFilter16_SSE2;
VP8SimpleVFilter16i = SimpleVFilter16i_SSE2;
VP8SimpleHFilter16i = SimpleHFilter16i_SSE2;
VP8PredLuma4[1] = TM4;
VP8PredLuma4[2] = VE4;
VP8PredLuma4[4] = RD4;
VP8PredLuma4[5] = VR4;
VP8PredLuma4[6] = LD4;
VP8PredLuma4[7] = VL4;
VP8PredLuma4[1] = TM4_SSE2;
VP8PredLuma4[2] = VE4_SSE2;
VP8PredLuma4[4] = RD4_SSE2;
VP8PredLuma4[5] = VR4_SSE2;
VP8PredLuma4[6] = LD4_SSE2;
VP8PredLuma4[7] = VL4_SSE2;
VP8PredLuma16[0] = DC16;
VP8PredLuma16[1] = TM16;
VP8PredLuma16[2] = VE16;
VP8PredLuma16[3] = HE16;
VP8PredLuma16[4] = DC16NoTop;
VP8PredLuma16[5] = DC16NoLeft;
VP8PredLuma16[6] = DC16NoTopLeft;
VP8PredLuma16[0] = DC16_SSE2;
VP8PredLuma16[1] = TM16_SSE2;
VP8PredLuma16[2] = VE16_SSE2;
VP8PredLuma16[3] = HE16_SSE2;
VP8PredLuma16[4] = DC16NoTop_SSE2;
VP8PredLuma16[5] = DC16NoLeft_SSE2;
VP8PredLuma16[6] = DC16NoTopLeft_SSE2;
VP8PredChroma8[0] = DC8uv;
VP8PredChroma8[1] = TM8uv;
VP8PredChroma8[2] = VE8uv;
VP8PredChroma8[3] = HE8uv;
VP8PredChroma8[4] = DC8uvNoTop;
VP8PredChroma8[5] = DC8uvNoLeft;
VP8PredChroma8[6] = DC8uvNoTopLeft;
VP8PredChroma8[0] = DC8uv_SSE2;
VP8PredChroma8[1] = TM8uv_SSE2;
VP8PredChroma8[2] = VE8uv_SSE2;
VP8PredChroma8[4] = DC8uvNoTop_SSE2;
VP8PredChroma8[5] = DC8uvNoLeft_SSE2;
VP8PredChroma8[6] = DC8uvNoTopLeft_SSE2;
}
#else // !WEBP_USE_SSE2

View file

@ -11,15 +11,15 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41)
#include <smmintrin.h>
#include "../dec/vp8i_dec.h"
#include "../utils/utils.h"
#include "src/dec/vp8i_dec.h"
#include "src/utils/utils.h"
static void HE16(uint8_t* dst) { // horizontal
static void HE16_SSE41(uint8_t* dst) { // horizontal
int j;
const __m128i kShuffle3 = _mm_set1_epi8(3);
for (j = 16; j > 0; --j) {
@ -36,7 +36,7 @@ static void HE16(uint8_t* dst) { // horizontal
extern void VP8DspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE41(void) {
VP8PredLuma16[3] = HE16;
VP8PredLuma16[3] = HE16_SSE41;
}
#else // !WEBP_USE_SSE41

View file

@ -15,10 +15,10 @@
#define WEBP_DSP_DSP_H_
#ifdef HAVE_CONFIG_H
#include "../webp/config.h"
#include "src/webp/config.h"
#endif
#include "../webp/types.h"
#include "src/webp/types.h"
#ifdef __cplusplus
extern "C" {
@ -38,10 +38,22 @@ extern "C" {
# define LOCAL_GCC_PREREQ(maj, min) 0
#endif
#if defined(__clang__)
# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
# define LOCAL_CLANG_PREREQ(maj, min) \
(LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
#else
# define LOCAL_CLANG_VERSION 0
# define LOCAL_CLANG_PREREQ(maj, min) 0
#endif
#ifndef __has_builtin
# define __has_builtin(x) 0
#endif
// for now, none of the optimizations below are available in emscripten
#if !defined(EMSCRIPTEN)
#if defined(_MSC_VER) && _MSC_VER > 1310 && \
(defined(_M_X64) || defined(_M_IX86))
#define WEBP_MSC_SSE2 // Visual C++ SSE2 targets
@ -68,18 +80,20 @@ extern "C" {
#define WEBP_USE_AVX2
#endif
#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
#define WEBP_ANDROID_NEON // Android targets that might support NEON
#endif
// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
// inline assembly would need to be modified for use with Native Client.
#if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \
#if (defined(__ARM_NEON__) || \
defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
!defined(__native_client__)
#define WEBP_USE_NEON
#endif
#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
#define WEBP_ANDROID_NEON // Android targets that may have NEON
#define WEBP_USE_NEON
#endif
#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
#define WEBP_USE_NEON
#define WEBP_USE_INTRINSICS
@ -90,7 +104,7 @@ extern "C" {
#define WEBP_USE_MIPS32
#if (__mips_isa_rev >= 2)
#define WEBP_USE_MIPS32_R2
#if defined(__mips_dspr2) || (__mips_dsp_rev >= 2)
#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
#define WEBP_USE_MIPS_DSP_R2
#endif
#endif
@ -100,6 +114,24 @@ extern "C" {
#define WEBP_USE_MSA
#endif
#endif /* EMSCRIPTEN */
#ifndef WEBP_DSP_OMIT_C_CODE
#define WEBP_DSP_OMIT_C_CODE 1
#endif
#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
#define WEBP_NEON_OMIT_C_CODE 1
#else
#define WEBP_NEON_OMIT_C_CODE 0
#endif
#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
#define WEBP_NEON_WORK_AROUND_GCC 1
#else
#define WEBP_NEON_WORK_AROUND_GCC 0
#endif
// This macro prevents thread_sanitizer from reporting known concurrent writes.
#define WEBP_TSAN_IGNORE_FUNCTION
#if defined(__has_feature)
@ -129,6 +161,11 @@ extern "C" {
#endif
#endif
// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
#if !defined(WEBP_SWAP_16BIT_CSP)
#define WEBP_SWAP_16BIT_CSP 0
#endif
typedef enum {
kSSE2,
kSSE3,
@ -143,7 +180,7 @@ typedef enum {
} CPUFeature;
// returns true if the CPU supports the feature.
typedef int (*VP8CPUInfo)(CPUFeature feature);
WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo;
WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
//------------------------------------------------------------------------------
// Init stub generator
@ -271,6 +308,7 @@ typedef double (*VP8SSIMGetClippedFunc)(const uint8_t* src1, int stride1,
int xo, int yo, // center position
int W, int H); // plane dimension
#if !defined(WEBP_REDUCE_SIZE)
// This version is called with the guarantee that you can load 8 bytes and
// 8 rows at offset src1 and src2
typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
@ -278,10 +316,13 @@ typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
extern VP8SSIMGetFunc VP8SSIMGet; // unclipped / unchecked
extern VP8SSIMGetClippedFunc VP8SSIMGetClipped; // with clipping
#endif
#if !defined(WEBP_DISABLE_STATS)
typedef uint32_t (*VP8AccumulateSSEFunc)(const uint8_t* src1,
const uint8_t* src2, int len);
extern VP8AccumulateSSEFunc VP8AccumulateSSE;
#endif
// must be called before using any of the above directly
void VP8SSIMDspInit(void);
@ -462,12 +503,12 @@ extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
// Plain-C implementation, as fall-back.
extern void WebPRescalerImportRowExpandC(struct WebPRescaler* const wrk,
const uint8_t* src);
extern void WebPRescalerImportRowShrinkC(struct WebPRescaler* const wrk,
const uint8_t* src);
extern void WebPRescalerExportRowExpandC(struct WebPRescaler* const wrk);
extern void WebPRescalerExportRowShrinkC(struct WebPRescaler* const wrk);
extern void WebPRescalerImportRowExpand_C(struct WebPRescaler* const wrk,
const uint8_t* src);
extern void WebPRescalerImportRowShrink_C(struct WebPRescaler* const wrk,
const uint8_t* src);
extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk);
extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk);
// Main entry calls:
extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
@ -533,25 +574,22 @@ void WebPMultRows(uint8_t* ptr, int stride,
int width, int num_rows, int inverse);
// Plain-C versions, used as fallback by some implementations.
void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse);
void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse);
void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse);
void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out);
// This function returns true if src[i] contains a value different from 0xff.
extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
// This function returns true if src[4*i] contains a value different from 0xff.
extern int (*WebPHasAlpha32b)(const uint8_t* src, int length);
// To be called first before using the above.
void WebPInitAlphaProcessing(void);
// ARGB packing function: a/r/g/b input is rgba or bgra order.
extern void (*VP8PackARGB)(const uint8_t* a, const uint8_t* r,
const uint8_t* g, const uint8_t* b, int len,
uint32_t* out);
// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
extern void (*VP8PackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out);
// To be called first before using the above.
void VP8EncDspARGBInit(void);
//------------------------------------------------------------------------------
// Filter functions

View file

@ -14,16 +14,18 @@
#include <assert.h>
#include <stdlib.h> // for abs()
#include "./dsp.h"
#include "../enc/vp8i_enc.h"
#include "src/dsp/dsp.h"
#include "src/enc/vp8i_enc.h"
static WEBP_INLINE uint8_t clip_8b(int v) {
return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
}
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int clip_max(int v, int max) {
return (v > max) ? max : v;
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
@ -56,9 +58,10 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
histo->last_non_zero = last_non_zero;
}
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
#if !WEBP_NEON_OMIT_C_CODE
static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
@ -76,6 +79,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
}
VP8SetHistogramData(distribution, histo);
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// run-time tables (~4k)
@ -100,6 +104,8 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
#if !WEBP_NEON_OMIT_C_CODE
#define STORE(x, y, v) \
dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
@ -140,15 +146,15 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
}
}
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
int i;
int tmp[16];
for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
@ -176,13 +182,16 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
VP8FTransform(src, ref, out);
VP8FTransform(src + 4, ref + 4, out + 16);
}
static void FTransformWHT(const int16_t* in, int16_t* out) {
#if !WEBP_NEON_OMIT_C_CODE
static void FTransformWHT_C(const int16_t* in, int16_t* out) {
// input is 12b signed
int32_t tmp[16];
int i;
@ -211,6 +220,7 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
out[12 + i] = b3 >> 1;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
#undef MUL
#undef STORE
@ -303,8 +313,8 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
// U block
DCMode(C8DC8 + dst, left, top, 8, 8, 4);
VerticalPred(C8VE8 + dst, top, 8);
@ -323,8 +333,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
static void Intra16Preds(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
static void Intra16Preds_C(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
DCMode(I16DC16 + dst, left, top, 16, 16, 5);
VerticalPred(I16VE16 + dst, top, 16);
HorizontalPred(I16HE16 + dst, left, 16);
@ -507,7 +517,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
// Left samples are top[-5 .. -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
@ -523,6 +533,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
//------------------------------------------------------------------------------
// Metric
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
int w, int h) {
int count = 0;
@ -538,20 +549,21 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
return count;
}
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 16, 16);
}
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 16, 8);
}
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 8, 8);
}
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 4, 4);
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
int k, x, y;
for (k = 0; k < 4; ++k) {
uint32_t avg = 0;
@ -571,6 +583,7 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
#if !WEBP_NEON_OMIT_C_CODE
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
@ -608,24 +621,25 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
return sum;
}
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int sum1 = TTransform(a, w);
const int sum2 = TTransform(b, w);
return abs(sum2 - sum1) >> 5;
}
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4(a + x + y, b + x + y, w);
D += Disto4x4_C(a + x + y, b + x + y, w);
}
}
return D;
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Quantization
@ -636,8 +650,8 @@ static const uint8_t kZigzag[16] = {
};
// Simple quantization
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
int last = -1;
int n;
for (n = 0; n < 16; ++n) {
@ -662,13 +676,15 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return (last >= 0);
}
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Block copy
@ -682,148 +698,14 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
}
}
static void Copy4x4(const uint8_t* src, uint8_t* dst) {
static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
Copy(src, dst, 4, 4);
}
static void Copy16x8(const uint8_t* src, uint8_t* dst) {
static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
Copy(src, dst, 16, 8);
}
//------------------------------------------------------------------------------
// SSIM / PSNR
// hat-shaped filter. Sum of coefficients is equal to 16.
static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
1, 2, 3, 4, 3, 2, 1
};
static const uint32_t kWeightSum = 16 * 16; // sum{kWeight}^2
static WEBP_INLINE double SSIMCalculation(
const VP8DistoStats* const stats, uint32_t N /*num samples*/) {
const uint32_t w2 = N * N;
const uint32_t C1 = 20 * w2;
const uint32_t C2 = 60 * w2;
const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6
const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
if (xmxm + ymym >= C3) {
const int64_t xmym = (int64_t)stats->xm * stats->ym;
const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative
const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
const uint64_t syy = (uint64_t)stats->yym * N - ymym;
// we descale by 8 to prevent overflow during the fnum/fden multiply.
const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
const uint64_t den_S = (sxx + syy + C2) >> 8;
const uint64_t fnum = (2 * xmym + C1) * num_S;
const uint64_t fden = (xmxm + ymym + C1) * den_S;
const double r = (double)fnum / fden;
assert(r >= 0. && r <= 1.0);
return r;
}
return 1.; // area is too dark to contribute meaningfully
}
double VP8SSIMFromStats(const VP8DistoStats* const stats) {
return SSIMCalculation(stats, kWeightSum);
}
double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
return SSIMCalculation(stats, stats->w);
}
static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2,
int xo, int yo, int W, int H) {
VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
: yo + VP8_SSIM_KERNEL;
const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
: xo + VP8_SSIM_KERNEL;
int x, y;
src1 += ymin * stride1;
src2 += ymin * stride2;
for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
for (x = xmin; x <= xmax; ++x) {
const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
* kWeight[VP8_SSIM_KERNEL + y - yo];
const uint32_t s1 = src1[x];
const uint32_t s2 = src2[x];
stats.w += w;
stats.xm += w * s1;
stats.ym += w * s2;
stats.xxm += w * s1 * s1;
stats.xym += w * s1 * s2;
stats.yym += w * s2 * s2;
}
}
return VP8SSIMFromStatsClipped(&stats);
}
static double SSIMGet_C(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2) {
VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
int x, y;
for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
const uint32_t w = kWeight[x] * kWeight[y];
const uint32_t s1 = src1[x];
const uint32_t s2 = src2[x];
stats.xm += w * s1;
stats.ym += w * s2;
stats.xxm += w * s1 * s1;
stats.xym += w * s1 * s2;
stats.yym += w * s2 * s2;
}
}
return VP8SSIMFromStats(&stats);
}
//------------------------------------------------------------------------------
static uint32_t AccumulateSSE(const uint8_t* src1,
const uint8_t* src2, int len) {
int i;
uint32_t sse2 = 0;
assert(len <= 65535); // to ensure that accumulation fits within uint32_t
for (i = 0; i < len; ++i) {
const int32_t diff = src1[i] - src2[i];
sse2 += diff * diff;
}
return sse2;
}
//------------------------------------------------------------------------------
VP8SSIMGetFunc VP8SSIMGet;
VP8SSIMGetClippedFunc VP8SSIMGetClipped;
VP8AccumulateSSEFunc VP8AccumulateSSE;
extern void VP8SSIMDspInitSSE2(void);
static volatile VP8CPUInfo ssim_last_cpuinfo_used =
(VP8CPUInfo)&ssim_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
VP8SSIMGetClipped = SSIMGetClipped_C;
VP8SSIMGet = SSIMGet_C;
VP8AccumulateSSE = AccumulateSSE;
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8SSIMDspInitSSE2();
}
#endif
}
ssim_last_cpuinfo_used = VP8GetCPUInfo;
}
//------------------------------------------------------------------------------
// Initialization
@ -868,26 +750,32 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
InitTables();
// default C implementations
VP8CollectHistogram = CollectHistogram;
VP8ITransform = ITransform;
VP8FTransform = FTransform;
VP8FTransform2 = FTransform2;
VP8FTransformWHT = FTransformWHT;
VP8EncPredLuma4 = Intra4Preds;
VP8EncPredLuma16 = Intra16Preds;
VP8EncPredChroma8 = IntraChromaPreds;
VP8SSE16x16 = SSE16x16;
VP8SSE8x8 = SSE8x8;
VP8SSE16x8 = SSE16x8;
VP8SSE4x4 = SSE4x4;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8Mean16x4 = Mean16x4;
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
VP8EncQuantizeBlockWHT = QuantizeBlock;
VP8Copy4x4 = Copy4x4;
VP8Copy16x8 = Copy16x8;
#if !WEBP_NEON_OMIT_C_CODE
VP8ITransform = ITransform_C;
VP8FTransform = FTransform_C;
VP8FTransformWHT = FTransformWHT_C;
VP8TDisto4x4 = Disto4x4_C;
VP8TDisto16x16 = Disto16x16_C;
VP8CollectHistogram = CollectHistogram_C;
VP8SSE16x16 = SSE16x16_C;
VP8SSE16x8 = SSE16x8_C;
VP8SSE8x8 = SSE8x8_C;
VP8SSE4x4 = SSE4x4_C;
#endif
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
VP8EncQuantizeBlock = QuantizeBlock_C;
VP8EncQuantize2Blocks = Quantize2Blocks_C;
#endif
VP8FTransform2 = FTransform2_C;
VP8EncPredLuma4 = Intra4Preds_C;
VP8EncPredLuma16 = Intra16Preds_C;
VP8EncPredChroma8 = IntraChromaPreds_C;
VP8Mean16x4 = Mean16x4_C;
VP8EncQuantizeBlockWHT = QuantizeBlock_C;
VP8Copy4x4 = Copy4x4_C;
VP8Copy16x8 = Copy16x8_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@ -906,11 +794,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
VP8EncDspInitAVX2();
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
VP8EncDspInitNEON();
}
#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8EncDspInitMIPS32();
@ -927,5 +810,34 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8EncDspInitNEON();
}
#endif
assert(VP8ITransform != NULL);
assert(VP8FTransform != NULL);
assert(VP8FTransformWHT != NULL);
assert(VP8TDisto4x4 != NULL);
assert(VP8TDisto16x16 != NULL);
assert(VP8CollectHistogram != NULL);
assert(VP8SSE16x16 != NULL);
assert(VP8SSE16x8 != NULL);
assert(VP8SSE8x8 != NULL);
assert(VP8SSE4x4 != NULL);
assert(VP8EncQuantizeBlock != NULL);
assert(VP8EncQuantize2Blocks != NULL);
assert(VP8FTransform2 != NULL);
assert(VP8EncPredLuma4 != NULL);
assert(VP8EncPredLuma16 != NULL);
assert(VP8EncPredChroma8 != NULL);
assert(VP8Mean16x4 != NULL);
assert(VP8EncQuantizeBlockWHT != NULL);
assert(VP8Copy4x4 != NULL);
assert(VP8Copy16x8 != NULL);
enc_last_cpuinfo_used = VP8GetCPUInfo;
}

View file

@ -9,7 +9,7 @@
//
// AVX2 version of speed-critical encoding functions.
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_AVX2)

View file

@ -13,13 +13,13 @@
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
// Slobodan Prijic (slobodan.prijic@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "./mips_macro.h"
#include "../enc/vp8i_enc.h"
#include "../enc/cost_enc.h"
#include "src/dsp/mips_macro.h"
#include "src/enc/vp8i_enc.h"
#include "src/enc/cost_enc.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
@ -113,8 +113,9 @@ static const int kC2 = 35468;
"sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
// Does one or two inverse transforms.
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
uint8_t* dst) {
static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
const int16_t* in,
uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
@ -144,11 +145,11 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
);
}
static void ITransform(const uint8_t* ref, const int16_t* in,
uint8_t* dst, int do_two) {
ITransformOne(ref, in, dst);
static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
uint8_t* dst, int do_two) {
ITransformOne_MIPS32(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
}
}
@ -187,8 +188,8 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
"sh %[temp5], " #J "(%[ppin]) \n\t" \
"sh %[level], " #N "(%[pout]) \n\t"
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
int temp0, temp1, temp2, temp3, temp4, temp5;
int sign, coeff, level, i;
int max_level = MAX_LEVEL;
@ -238,11 +239,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return 0;
}
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
nz = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
@ -361,8 +362,8 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
"msub %[temp6], %[temp0] \n\t" \
"msub %[temp7], %[temp1] \n\t"
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int tmp[32];
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@ -396,13 +397,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4(a + x + y, b + x + y, w);
D += Disto4x4_MIPS32(a + x + y, b + x + y, w);
}
}
return D;
@ -478,7 +479,8 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
"sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
"sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
int temp17, temp18, temp19, temp20;
@ -539,7 +541,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
GET_SSE_INNER(C, C + 1, C + 2, C + 3) \
GET_SSE_INNER(D, D + 1, D + 2, D + 3)
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@ -573,7 +575,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@ -599,7 +601,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@ -621,7 +623,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@ -651,17 +653,20 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
extern void VP8EncDspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
VP8ITransform = ITransform;
VP8FTransform = FTransform;
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8ITransform = ITransform_MIPS32;
VP8FTransform = FTransform_MIPS32;
VP8EncQuantizeBlock = QuantizeBlock_MIPS32;
VP8EncQuantize2Blocks = Quantize2Blocks_MIPS32;
VP8TDisto4x4 = Disto4x4_MIPS32;
VP8TDisto16x16 = Disto16x16_MIPS32;
#if !defined(WORK_AROUND_GCC)
VP8SSE16x16 = SSE16x16;
VP8SSE8x8 = SSE8x8;
VP8SSE16x8 = SSE16x8;
VP8SSE4x4 = SSE4x4;
VP8SSE16x16 = SSE16x16_MIPS32;
VP8SSE8x8 = SSE8x8_MIPS32;
VP8SSE16x8 = SSE16x8_MIPS32;
VP8SSE4x4 = SSE4x4_MIPS32;
#endif
}

View file

@ -12,13 +12,13 @@
// Author(s): Darko Laus (darko.laus@imgtec.com)
// Mirko Raus (mirko.raus@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "./mips_macro.h"
#include "../enc/cost_enc.h"
#include "../enc/vp8i_enc.h"
#include "src/dsp/mips_macro.h"
#include "src/enc/cost_enc.h"
#include "src/enc/vp8i_enc.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
@ -141,7 +141,8 @@ static const int kC2 = 35468;
"sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
"sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
const int c2217 = 2217;
const int c5352 = 5352;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@ -238,16 +239,16 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
);
}
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in,
uint8_t* dst, int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
@ -313,13 +314,14 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
return abs(temp3 - temp17) >> 5;
}
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_MIPSdspR2(const uint8_t* const a,
const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4(a + x + y, b + x + y, w);
D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w);
}
}
return D;
@ -1011,8 +1013,8 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
// U block
DCMode8(C8DC8 + dst, left, top);
VerticalPred8(C8VE8 + dst, top);
@ -1031,8 +1033,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
static void Intra16Preds(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
static void Intra16Preds_MIPSdspR2(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
DCMode16(I16DC16 + dst, left, top);
VerticalPred16(I16VE16 + dst, top);
HorizontalPred16(I16HE16 + dst, left);
@ -1041,7 +1043,7 @@ static void Intra16Preds(uint8_t* dst,
// Left samples are top[-5 .. -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
@ -1077,7 +1079,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
GET_SSE_INNER(C) \
GET_SSE_INNER(D)
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3;
__asm__ volatile (
@ -1107,7 +1109,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3;
__asm__ volatile (
@ -1129,7 +1131,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3;
__asm__ volatile (
@ -1147,7 +1149,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
return count;
}
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3;
__asm__ volatile (
@ -1270,8 +1272,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
"usw $0, " #J "(%[ppin]) \n\t" \
"3: \n\t"
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
int temp0, temp1, temp2, temp3, temp4, temp5,temp6;
int sign, coeff, level;
int max_level = MAX_LEVEL;
@ -1311,11 +1313,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return (ret != 0);
}
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
nz = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
@ -1358,7 +1360,7 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
"usw %[" #TEMP4 "], " #C "(%[out]) \n\t" \
"usw %[" #TEMP6 "], " #D "(%[out]) \n\t"
static void FTransformWHT(const int16_t* in, int16_t* out) {
static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8, temp9;
@ -1450,9 +1452,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
"addiu %[temp8], %[temp8], 1 \n\t" \
"sw %[temp8], 0(%[temp3]) \n\t"
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH;
@ -1484,23 +1486,28 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
extern void VP8EncDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
VP8FTransform = FTransform;
VP8ITransform = ITransform;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8EncPredLuma16 = Intra16Preds;
VP8EncPredChroma8 = IntraChromaPreds;
VP8EncPredLuma4 = Intra4Preds;
VP8FTransform = FTransform_MIPSdspR2;
VP8FTransformWHT = FTransformWHT_MIPSdspR2;
VP8ITransform = ITransform_MIPSdspR2;
VP8TDisto4x4 = Disto4x4_MIPSdspR2;
VP8TDisto16x16 = Disto16x16_MIPSdspR2;
VP8EncPredLuma16 = Intra16Preds_MIPSdspR2;
VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2;
VP8EncPredLuma4 = Intra4Preds_MIPSdspR2;
#if !defined(WORK_AROUND_GCC)
VP8SSE16x16 = SSE16x16;
VP8SSE8x8 = SSE8x8;
VP8SSE16x8 = SSE16x8;
VP8SSE4x4 = SSE4x4;
VP8SSE16x16 = SSE16x16_MIPSdspR2;
VP8SSE8x8 = SSE8x8_MIPSdspR2;
VP8SSE16x8 = SSE16x8_MIPSdspR2;
VP8SSE4x4 = SSE4x4_MIPSdspR2;
#endif
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
VP8FTransformWHT = FTransformWHT;
VP8CollectHistogram = CollectHistogram;
VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2;
VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2;
VP8CollectHistogram = CollectHistogram_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2

View file

@ -11,13 +11,13 @@
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
#include <stdlib.h>
#include "./msa_macro.h"
#include "../enc/vp8i_enc.h"
#include "src/dsp/msa_macro.h"
#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Transforms
@ -69,20 +69,21 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
}
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
uint64_t out0, out1, out2, out3;
uint32_t in0, in1, in2, in3;
v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
v8i16 t0, t1, t2, t3;
v16u8 srcl0, srcl1, src0, src1;
v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
@ -130,7 +131,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
SD4(out0, out1, out2, out3, out, 8);
}
static void FTransformWHT(const int16_t* in, int16_t* out) {
static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
v8i16 in0 = { 0 };
v8i16 in1 = { 0 };
v8i16 tmp0, tmp1, tmp2, tmp3;
@ -167,10 +168,10 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
ST_SH2(out0, out1, out, 8);
}
static int TTransform(const uint8_t* in, const uint16_t* w) {
static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
int sum;
uint32_t in0_m, in1_m, in2_m, in3_m;
v16i8 src0;
v16i8 src0 = { 0 };
v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
v4i32 dst0, dst1;
const v16i8 zero = { 0 };
@ -199,20 +200,20 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
return sum;
}
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int sum1 = TTransform(a, w);
const int sum2 = TTransform(b, w);
static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int sum1 = TTransform_MSA(a, w);
const int sum2 = TTransform_MSA(b, w);
return abs(sum2 - sum1) >> 5;
}
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4(a + x + y, b + x + y, w);
D += Disto4x4_MSA(a + x + y, b + x + y, w);
}
}
return D;
@ -221,9 +222,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
//------------------------------------------------------------------------------
// Histogram
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
@ -259,8 +260,9 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
const v16u8 A1 = { 0 };
const uint64_t val_m = LD(top - 1);
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
const v16u8 B = SLDI_UB(A, A, 1);
const v16u8 C = SLDI_UB(A, A, 2);
const v16u8 AC = __msa_ave_u_b(A, C);
@ -292,8 +294,9 @@ static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
}
static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
const v16u8 A2 = { 0 };
const uint64_t val_m = LD(top - 5);
const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);
const v16u8 B = SLDI_UB(A, A, 1);
const v16u8 C = SLDI_UB(A, A, 2);
@ -311,8 +314,9 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
}
static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
const v16u8 A1 = { 0 };
const uint64_t val_m = LD(top);
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
const v16u8 B = SLDI_UB(A, A, 1);
const v16u8 C1 = SLDI_UB(A, A, 2);
const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);
@ -427,7 +431,7 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
#undef AVG3
#undef AVG2
static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
@ -544,8 +548,8 @@ static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
STORE16x16(out, dst);
}
static void Intra16Preds(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
static void Intra16Preds_MSA(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
DCMode16x16(I16DC16 + dst, left, top);
VerticalPred16x16(I16VE16 + dst, top);
HorizontalPred16x16(I16HE16 + dst, left);
@ -645,7 +649,7 @@ static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
uint64_t out;
v16u8 src;
v16u8 src = { 0 };
if (top != NULL && left != NULL) {
const uint64_t left_m = LD(left);
const uint64_t top_m = LD(top);
@ -666,8 +670,8 @@ static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
STORE8x8(out, dst);
}
static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
// U block
DCMode8x8(C8DC8 + dst, left, top);
VerticalPred8x8(C8VE8 + dst, top);
@ -708,7 +712,7 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \
} while (0)
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -735,7 +739,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
return sum;
}
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -754,7 +758,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
return sum;
}
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@ -774,10 +778,10 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
return sum;
}
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum = 0;
uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
v16u8 src, ref, tmp0, tmp1;
v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
v8i16 diff0, diff1;
v4i32 out0, out1;
@ -796,8 +800,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
//------------------------------------------------------------------------------
// Quantization
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
int sum;
v8i16 in0, in1, sh0, sh1, out0, out1;
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
@ -828,7 +832,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
tmp1 = (tmp3 > maxlevel);
tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
SUB2(0, tmp2, 0, tmp3, tmp0, tmp1);
SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1);
tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3); // zthresh
@ -849,8 +853,8 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return (sum > 0);
}
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
@ -863,26 +867,26 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
extern void VP8EncDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
VP8ITransform = ITransform;
VP8FTransform = FTransform;
VP8FTransformWHT = FTransformWHT;
VP8ITransform = ITransform_MSA;
VP8FTransform = FTransform_MSA;
VP8FTransformWHT = FTransformWHT_MSA;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8CollectHistogram = CollectHistogram;
VP8TDisto4x4 = Disto4x4_MSA;
VP8TDisto16x16 = Disto16x16_MSA;
VP8CollectHistogram = CollectHistogram_MSA;
VP8EncPredLuma4 = Intra4Preds;
VP8EncPredLuma16 = Intra16Preds;
VP8EncPredChroma8 = IntraChromaPreds;
VP8EncPredLuma4 = Intra4Preds_MSA;
VP8EncPredLuma16 = Intra16Preds_MSA;
VP8EncPredChroma8 = IntraChromaPreds_MSA;
VP8SSE16x16 = SSE16x16;
VP8SSE16x8 = SSE16x8;
VP8SSE8x8 = SSE8x8;
VP8SSE4x4 = SSE4x4;
VP8SSE16x16 = SSE16x16_MSA;
VP8SSE16x8 = SSE16x8_MSA;
VP8SSE8x8 = SSE8x8_MSA;
VP8SSE4x4 = SSE4x4_MSA;
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
VP8EncQuantizeBlockWHT = QuantizeBlock;
VP8EncQuantizeBlock = QuantizeBlock_MSA;
VP8EncQuantize2Blocks = Quantize2Blocks_MSA;
VP8EncQuantizeBlockWHT = QuantizeBlock_MSA;
}
#else // !WEBP_USE_MSA

View file

@ -11,14 +11,14 @@
//
// adapted from libvpx (http://www.webmproject.org/code/)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include "./neon.h"
#include "../enc/vp8i_enc.h"
#include "src/dsp/neon.h"
#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
@ -37,15 +37,15 @@ static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
#if defined(WEBP_USE_INTRINSICS)
// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
}
// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
const int16x8_t dst01,
const int16x8_t dst23) {
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
const int16x8_t dst01,
const int16x8_t dst23) {
// Unsigned saturate to 8b.
const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
@ -57,8 +57,10 @@ static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}
static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
const uint8_t* const ref, uint8_t* const dst) {
static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
const int16x8_t row23,
const uint8_t* const ref,
uint8_t* const dst) {
uint32x2_t dst01 = vdup_n_u32(0);
uint32x2_t dst23 = vdup_n_u32(0);
@ -70,19 +72,20 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
{
// Convert to 16b.
const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
// Descale with rounding.
const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
// Add the inverse transform.
SaturateAndStore4x4(dst, out01, out23);
SaturateAndStore4x4_NEON(dst, out01, out23);
}
}
static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
int16x8x2_t* const out) {
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
const int16x8_t in1,
int16x8x2_t* const out) {
// a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
// c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
@ -90,7 +93,7 @@ static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
*out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}
static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
// {rows} = in0 | in4
// in8 | in12
// B1 = in4 | in12
@ -113,22 +116,22 @@ static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
Transpose8x2(E0, E1, rows);
Transpose8x2_NEON(E0, E1, rows);
}
static void ITransformOne(const uint8_t* ref,
const int16_t* in, uint8_t* dst) {
static void ITransformOne_NEON(const uint8_t* ref,
const int16_t* in, uint8_t* dst) {
int16x8x2_t rows;
INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
TransformPass(&rows);
TransformPass(&rows);
Add4x4(rows.val[0], rows.val[1], ref, dst);
TransformPass_NEON(&rows);
TransformPass_NEON(&rows);
Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
}
#else
static void ITransformOne(const uint8_t* ref,
const int16_t* in, uint8_t* dst) {
static void ITransformOne_NEON(const uint8_t* ref,
const int16_t* in, uint8_t* dst) {
const int kBPS = BPS;
const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
@ -243,16 +246,16 @@ static void ITransformOne(const uint8_t* ref,
#endif // WEBP_USE_INTRINSICS
static void ITransform(const uint8_t* ref,
const int16_t* in, uint8_t* dst, int do_two) {
ITransformOne(ref, in, dst);
static void ITransform_NEON(const uint8_t* ref,
const int16_t* in, uint8_t* dst, int do_two) {
ITransformOne_NEON(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
ITransformOne_NEON(ref + 4, in + 16, dst + 4);
}
}
// Load all 4x4 pixels into a single uint8x16_t variable.
static uint8x16_t Load4x4(const uint8_t* src) {
static uint8x16_t Load4x4_NEON(const uint8_t* src) {
uint32x4_t out = vdupq_n_u32(0);
out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
@ -265,10 +268,12 @@ static uint8x16_t Load4x4(const uint8_t* src) {
#if defined(WEBP_USE_INTRINSICS)
static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
const int16x4_t C, const int16x4_t D,
int16x8_t* const out01,
int16x8_t* const out32) {
static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
const int16x4_t B,
const int16x4_t C,
const int16x4_t D,
int16x8_t* const out01,
int16x8_t* const out32) {
const int16x4x2_t AB = vtrn_s16(A, B);
const int16x4x2_t CD = vtrn_s16(C, D);
const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
@ -283,24 +288,24 @@ static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
vreinterpret_s64_s32(tmp02.val[1])));
}
static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
const uint8x8_t b) {
static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
const uint8x8_t b) {
return vreinterpretq_s16_u16(vsubl_u8(a, b));
}
static void FTransform(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
int16x8_t d0d1, d3d2; // working 4x4 int16 variables
{
const uint8x16_t S0 = Load4x4(src);
const uint8x16_t R0 = Load4x4(ref);
const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
const uint8x16_t S0 = Load4x4_NEON(src);
const uint8x16_t R0 = Load4x4_NEON(ref);
const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
const int16x4_t D0 = vget_low_s16(D0D1);
const int16x4_t D1 = vget_high_s16(D0D1);
const int16x4_t D2 = vget_low_s16(D2D3);
const int16x4_t D3 = vget_high_s16(D2D3);
Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
}
{ // 1rst pass
const int32x4_t kCst937 = vdupq_n_s32(937);
@ -318,7 +323,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
}
{ // 2nd pass
// the (1<<16) addition is for the replacement: a3!=0 <-> 1-(a3==0)
@ -358,8 +363,8 @@ static const int32_t kCoeff32[] = {
51000, 51000, 51000, 51000
};
static void FTransform(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
const int kBPS = BPS;
const uint8_t* src_ptr = src;
const uint8_t* ref_ptr = ref;
@ -478,7 +483,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
src += stride; \
} while (0)
static void FTransformWHT(const int16_t* src, int16_t* out) {
static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
const int stride = 16;
const int16x4_t zero = vdup_n_s16(0);
int32x4x4_t tmp0;
@ -516,7 +521,7 @@ static void FTransformWHT(const int16_t* src, int16_t* out) {
tmp0.val[3] = vsubq_s32(a0, a1);
}
{
const int32x4x4_t tmp1 = Transpose4x4(tmp0);
const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
// a0 = tmp[0 + i] + tmp[ 8 + i]
// a1 = tmp[4 + i] + tmp[12 + i]
// a2 = tmp[4 + i] - tmp[12 + i]
@ -560,7 +565,7 @@ static void FTransformWHT(const int16_t* src, int16_t* out) {
// a 26ae, b 26ae
// a 37bf, b 37bf
//
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
@ -574,7 +579,8 @@ static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
return q4_in;
}
static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
const int16x8x4_t q4_in) {
// {a0, a1} = {in[0] + in[2], in[1] + in[3]}
// {a3, a2} = {in[0] - in[2], in[1] - in[3]}
const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
@ -593,7 +599,7 @@ static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
return q4_out;
}
static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
q4_in.val[2]));
const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
@ -610,7 +616,7 @@ static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
return q4_out;
}
static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
const uint16x8_t q_w07 = vld1q_u16(&w[0]);
const uint16x8_t q_w8f = vld1q_u16(&w[8]);
int16x4x4_t d4_w;
@ -622,8 +628,8 @@ static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
return d4_w;
}
static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
const int16x4x4_t d4_w) {
static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
const int16x4x4_t d4_w) {
int32x2_t d_sum;
// sum += w[ 0] * abs(b0);
// sum += w[ 4] * abs(b1);
@ -652,8 +658,8 @@ static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
@ -679,12 +685,12 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
// Vertical pass first to avoid a transpose (vertical and horizontal passes
// are commutative because w/kWeightY is symmetric) and subsequent
// transpose.
const int16x8x4_t q4_v = DistoVerticalPass(d4_in);
const int16x4x4_t d4_w = DistoLoadW(w);
const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
const int16x4x4_t d4_w = DistoLoadW_NEON(w);
// horizontal pass
const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v);
const int16x8x4_t q4_h = DistoHorizontalPass(q4_t);
int32x2_t d_sum = DistoSum(q4_h, d4_w);
const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);
// abs(sum2 - sum1) >> 5
d_sum = vabs_s32(d_sum);
@ -694,13 +700,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
}
#undef LOAD_LANE_32b
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4(a + x + y, b + x + y, w);
D += Disto4x4_NEON(a + x + y, b + x + y, w);
}
}
return D;
@ -708,15 +714,15 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
//------------------------------------------------------------------------------
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
int16_t out[16];
FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
{
int k;
const int16x8_t a0 = vld1q_s16(out + 0);
@ -740,9 +746,9 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
//------------------------------------------------------------------------------
static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
const uint8_t* const b,
uint32x4_t* const sum) {
static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
const uint8_t* const b,
uint32x4_t* const sum) {
const uint8x16_t a0 = vld1q_u8(a);
const uint8x16_t b0 = vld1q_u8(b);
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
@ -757,7 +763,7 @@ static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
}
// Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt(uint32x4_t sum) {
static int SumToInt_NEON(uint32x4_t sum) {
const uint64x2_t sum2 = vpaddlq_u32(sum);
const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
return (int)sum3;
@ -767,18 +773,18 @@ static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 16; ++y) {
AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
}
return SumToInt(sum);
return SumToInt_NEON(sum);
}
static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 8; ++y) {
AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
}
return SumToInt(sum);
return SumToInt_NEON(sum);
}
static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
@ -791,12 +797,12 @@ static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
sum = vpadalq_u16(sum, prod);
}
return SumToInt(sum);
return SumToInt_NEON(sum);
}
static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
const uint8x16_t a0 = Load4x4(a);
const uint8x16_t b0 = Load4x4(b);
const uint8x16_t a0 = Load4x4_NEON(a);
const uint8x16_t b0 = Load4x4_NEON(b);
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
vget_low_u8(abs_diff));
@ -805,7 +811,7 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
/* pair-wise adds and widen */
const uint32x4_t sum1 = vpaddlq_u16(prod1);
const uint32x4_t sum2 = vpaddlq_u16(prod2);
return SumToInt(vaddq_u32(sum1, sum2));
return SumToInt_NEON(vaddq_u32(sum1, sum2));
}
//------------------------------------------------------------------------------
@ -813,8 +819,8 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
// Compilation with gcc-4.6.x is problematic for now.
#if !defined(WORK_AROUND_GCC)
static int16x8_t Quantize(int16_t* const in,
const VP8Matrix* const mtx, int offset) {
static int16x8_t Quantize_NEON(int16_t* const in,
const VP8Matrix* const mtx, int offset) {
const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
@ -847,10 +853,10 @@ static const uint8_t kShuffles[4][8] = {
{ 14, 15, 22, 23, 28, 29, 30, 31 }
};
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
const int16x8_t out0 = Quantize(in, mtx, 0);
const int16x8_t out1 = Quantize(in, mtx, 8);
static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
uint8x8x4_t shuffles;
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
// non-standard versions there.
@ -889,11 +895,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return 0;
}
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
nz = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
@ -905,14 +911,14 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
extern void VP8EncDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
VP8ITransform = ITransform;
VP8FTransform = FTransform;
VP8ITransform = ITransform_NEON;
VP8FTransform = FTransform_NEON;
VP8FTransformWHT = FTransformWHT;
VP8FTransformWHT = FTransformWHT_NEON;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8CollectHistogram = CollectHistogram;
VP8TDisto4x4 = Disto4x4_NEON;
VP8TDisto16x16 = Disto16x16_NEON;
VP8CollectHistogram = CollectHistogram_NEON;
VP8SSE16x16 = SSE16x16_NEON;
VP8SSE16x8 = SSE16x8_NEON;
@ -920,8 +926,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
VP8SSE4x4 = SSE4x4_NEON;
#if !defined(WORK_AROUND_GCC)
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
VP8EncQuantizeBlock = QuantizeBlock_NEON;
VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
#endif
}

View file

@ -11,23 +11,23 @@
//
// Author: Christian Duvivier (cduvivier@google.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <stdlib.h> // for abs()
#include <emmintrin.h>
#include "./common_sse2.h"
#include "../enc/cost_enc.h"
#include "../enc/vp8i_enc.h"
#include "src/dsp/common_sse2.h"
#include "src/enc/cost_enc.h"
#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
// Does one or two inverse transforms.
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@ -193,10 +193,10 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
}
}
static void FTransformPass1(const __m128i* const in01,
const __m128i* const in23,
__m128i* const out01,
__m128i* const out32) {
static void FTransformPass1_SSE2(const __m128i* const in01,
const __m128i* const in23,
__m128i* const out01,
__m128i* const out32) {
const __m128i k937 = _mm_set1_epi32(937);
const __m128i k1812 = _mm_set1_epi32(1812);
@ -239,8 +239,9 @@ static void FTransformPass1(const __m128i* const in01,
*out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2..
}
static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
int16_t* out) {
static void FTransformPass2_SSE2(const __m128i* const v01,
const __m128i* const v32,
int16_t* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i seven = _mm_set1_epi16(7);
const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
@ -291,7 +292,8 @@ static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
_mm_storeu_si128((__m128i*)&out[8], d2_f3);
}
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
const __m128i zero = _mm_setzero_si128();
// Load src.
const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
@ -328,13 +330,14 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
__m128i v01, v32;
// First pass
FTransformPass1(&row01, &row23, &v01, &v32);
FTransformPass1_SSE2(&row01, &row23, &v01, &v32);
// Second pass
FTransformPass2(&v01, &v32, out);
FTransformPass2_SSE2(&v01, &v32, out);
}
static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
const __m128i zero = _mm_setzero_si128();
// Load src and convert to 16b.
@ -374,15 +377,15 @@ static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
__m128i v01h, v32h;
// First pass
FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l);
FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h);
FTransformPass1_SSE2(&shuf01l, &shuf23l, &v01l, &v32l);
FTransformPass1_SSE2(&shuf01h, &shuf23h, &v01h, &v32h);
// Second pass
FTransformPass2(&v01l, &v32l, out + 0);
FTransformPass2(&v01h, &v32h, out + 16);
FTransformPass2_SSE2(&v01l, &v32l, out + 0);
FTransformPass2_SSE2(&v01h, &v32h, out + 16);
}
static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
@ -398,14 +401,14 @@ static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
*out = _mm_madd_epi16(D, kMult);
}
static void FTransformWHT(const int16_t* in, int16_t* out) {
static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
// Input is 12b signed.
__m128i row0, row1, row2, row3;
// Rows are 14b signed.
FTransformWHTRow(in + 0 * 64, &row0);
FTransformWHTRow(in + 1 * 64, &row1);
FTransformWHTRow(in + 2 * 64, &row2);
FTransformWHTRow(in + 3 * 64, &row3);
FTransformWHTRow_SSE2(in + 0 * 64, &row0);
FTransformWHTRow_SSE2(in + 1 * 64, &row1);
FTransformWHTRow_SSE2(in + 2 * 64, &row2);
FTransformWHTRow_SSE2(in + 3 * 64, &row3);
{
// The a* are 15b signed.
@ -431,9 +434,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
const __m128i zero = _mm_setzero_si128();
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
int j;
@ -442,7 +445,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int16_t out[16];
int k;
FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
FTransform_SSE2(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
// Convert coefficients to bin (within out[]).
{
@ -476,7 +479,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
// Intra predictions
// helper for chroma-DC predictions
static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
int j;
const __m128i values = _mm_set1_epi8(v);
for (j = 0; j < 8; ++j) {
@ -484,7 +487,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
}
}
static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
int j;
const __m128i values = _mm_set1_epi8(v);
for (j = 0; j < 16; ++j) {
@ -492,20 +495,20 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
}
}
static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) {
if (size == 4) {
int j;
for (j = 0; j < 4; ++j) {
memset(dst + j * BPS, value, 4);
}
} else if (size == 8) {
Put8x8uv(value, dst);
Put8x8uv_SSE2(value, dst);
} else {
Put16(value, dst);
Put16_SSE2(value, dst);
}
}
static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
int j;
const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
for (j = 0; j < 8; ++j) {
@ -513,7 +516,7 @@ static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
}
}
static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
const __m128i top_values = _mm_load_si128((const __m128i*)top);
int j;
for (j = 0; j < 16; ++j) {
@ -521,20 +524,20 @@ static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
}
}
static WEBP_INLINE void VerticalPred(uint8_t* dst,
const uint8_t* top, int size) {
static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
const uint8_t* top, int size) {
if (top != NULL) {
if (size == 8) {
VE8uv(dst, top);
VE8uv_SSE2(dst, top);
} else {
VE16(dst, top);
VE16_SSE2(dst, top);
}
} else {
Fill(dst, 127, size);
Fill_SSE2(dst, 127, size);
}
}
static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
int j;
for (j = 0; j < 8; ++j) {
const __m128i values = _mm_set1_epi8(left[j]);
@ -543,7 +546,7 @@ static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
}
}
static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
int j;
for (j = 0; j < 16; ++j) {
const __m128i values = _mm_set1_epi8(left[j]);
@ -552,21 +555,21 @@ static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
}
}
static WEBP_INLINE void HorizontalPred(uint8_t* dst,
const uint8_t* left, int size) {
static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
const uint8_t* left, int size) {
if (left != NULL) {
if (size == 8) {
HE8uv(dst, left);
HE8uv_SSE2(dst, left);
} else {
HE16(dst, left);
HE16_SSE2(dst, left);
}
} else {
Fill(dst, 129, size);
Fill_SSE2(dst, 129, size);
}
}
static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
const uint8_t* top, int size) {
static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top, int size) {
const __m128i zero = _mm_setzero_si128();
int y;
if (size == 8) {
@ -593,13 +596,13 @@ static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
}
}
static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
const uint8_t* top, int size) {
static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top, int size) {
if (left != NULL) {
if (top != NULL) {
TM(dst, left, top, size);
TM_SSE2(dst, left, top, size);
} else {
HorizontalPred(dst, left, size);
HorizontalPred_SSE2(dst, left, size);
}
} else {
// true motion without left samples (hence: with default 129 value)
@ -607,90 +610,90 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
// Note that if top samples are not available, the default value is
// then 129, and not 127 as in the VerticalPred case.
if (top != NULL) {
VerticalPred(dst, top, size);
VerticalPred_SSE2(dst, top, size);
} else {
Fill(dst, 129, size);
Fill_SSE2(dst, 129, size);
}
}
}
static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
const int DC = VP8HorizontalAdd8b(&combined) + 8;
Put8x8uv(DC >> 4, dst);
Put8x8uv_SSE2(DC >> 4, dst);
}
static WEBP_INLINE void DC8uvNoLeft(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
const __m128i zero = _mm_setzero_si128();
const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
const __m128i sum = _mm_sad_epu8(top_values, zero);
const int DC = _mm_cvtsi128_si32(sum) + 4;
Put8x8uv(DC >> 3, dst);
Put8x8uv_SSE2(DC >> 3, dst);
}
static WEBP_INLINE void DC8uvNoTop(uint8_t* dst, const uint8_t* left) {
static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) {
// 'left' is contiguous so we can reuse the top summation.
DC8uvNoLeft(dst, left);
DC8uvNoLeft_SSE2(dst, left);
}
static WEBP_INLINE void DC8uvNoTopLeft(uint8_t* dst) {
Put8x8uv(0x80, dst);
static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) {
Put8x8uv_SSE2(0x80, dst);
}
static WEBP_INLINE void DC8uvMode(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
if (top != NULL) {
if (left != NULL) { // top and left present
DC8uv(dst, left, top);
DC8uv_SSE2(dst, left, top);
} else { // top, but no left
DC8uvNoLeft(dst, top);
DC8uvNoLeft_SSE2(dst, top);
}
} else if (left != NULL) { // left but no top
DC8uvNoTop(dst, left);
DC8uvNoTop_SSE2(dst, left);
} else { // no top, no left, nothing.
DC8uvNoTopLeft(dst);
DC8uvNoTopLeft_SSE2(dst);
}
}
static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
const __m128i top_row = _mm_load_si128((const __m128i*)top);
const __m128i left_row = _mm_load_si128((const __m128i*)left);
const int DC =
VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16;
Put16(DC >> 5, dst);
Put16_SSE2(DC >> 5, dst);
}
static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
const __m128i top_row = _mm_load_si128((const __m128i*)top);
const int DC = VP8HorizontalAdd8b(&top_row) + 8;
Put16(DC >> 4, dst);
Put16_SSE2(DC >> 4, dst);
}
static WEBP_INLINE void DC16NoTop(uint8_t* dst, const uint8_t* left) {
static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) {
// 'left' is contiguous so we can reuse the top summation.
DC16NoLeft(dst, left);
DC16NoLeft_SSE2(dst, left);
}
static WEBP_INLINE void DC16NoTopLeft(uint8_t* dst) {
Put16(0x80, dst);
static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) {
Put16_SSE2(0x80, dst);
}
static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
if (top != NULL) {
if (left != NULL) { // top and left present
DC16(dst, left, top);
DC16_SSE2(dst, left, top);
} else { // top, but no left
DC16NoLeft(dst, top);
DC16NoLeft_SSE2(dst, top);
}
} else if (left != NULL) { // left but no top
DC16NoTop(dst, left);
DC16NoTop_SSE2(dst, left);
} else { // no top, no left, nothing.
DC16NoTopLeft(dst);
DC16NoTopLeft_SSE2(dst);
}
}
@ -709,7 +712,8 @@ static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
// where: AC = (a + b + 1) >> 1, BC = (b + c + 1) >> 1
// and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
const uint8_t* top) { // vertical
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@ -725,7 +729,8 @@ static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
}
}
static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
const uint8_t* top) { // horizontal
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
@ -737,14 +742,15 @@ static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}
static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) {
uint32_t dc = 4;
int i;
for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
Fill(dst, dc >> 3, 4);
Fill_SSE2(dst, dc >> 3, 4);
}
static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { // Down-Left
static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
const uint8_t* top) { // Down-Left
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@ -760,8 +766,8 @@ static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { // Down-Left
WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
}
static WEBP_INLINE void VR4(uint8_t* dst,
const uint8_t* top) { // Vertical-Right
static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
const uint8_t* top) { // Vertical-Right
const __m128i one = _mm_set1_epi8(1);
const int I = top[-2];
const int J = top[-3];
@ -786,8 +792,8 @@ static WEBP_INLINE void VR4(uint8_t* dst,
DST(0, 3) = AVG3(K, J, I);
}
static WEBP_INLINE void VL4(uint8_t* dst,
const uint8_t* top) { // Vertical-Left
static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
const uint8_t* top) { // Vertical-Left
const __m128i one = _mm_set1_epi8(1);
const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
@ -812,7 +818,8 @@ static WEBP_INLINE void VL4(uint8_t* dst,
DST(3, 3) = (extra_out >> 8) & 0xff;
}
static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { // Down-right
static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
const uint8_t* top) { // Down-right
const __m128i one = _mm_set1_epi8(1);
const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
@ -828,7 +835,7 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { // Down-right
WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
}
static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
@ -843,7 +850,7 @@ static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
@ -866,7 +873,7 @@ static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
DST(1, 3) = AVG3(L, K, J);
}
static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
const __m128i zero = _mm_setzero_si128();
const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
@ -888,55 +895,56 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
// Left samples are top[-5 .. -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
HE4(I4HE4 + dst, top);
RD4(I4RD4 + dst, top);
VR4(I4VR4 + dst, top);
LD4(I4LD4 + dst, top);
VL4(I4VL4 + dst, top);
HD4(I4HD4 + dst, top);
HU4(I4HU4 + dst, top);
static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
DC4_SSE2(I4DC4 + dst, top);
TM4_SSE2(I4TM4 + dst, top);
VE4_SSE2(I4VE4 + dst, top);
HE4_SSE2(I4HE4 + dst, top);
RD4_SSE2(I4RD4 + dst, top);
VR4_SSE2(I4VR4 + dst, top);
LD4_SSE2(I4LD4 + dst, top);
VL4_SSE2(I4VL4 + dst, top);
HD4_SSE2(I4HD4 + dst, top);
HU4_SSE2(I4HU4 + dst, top);
}
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
// U block
DC8uvMode(C8DC8 + dst, left, top);
VerticalPred(C8VE8 + dst, top, 8);
HorizontalPred(C8HE8 + dst, left, 8);
TrueMotion(C8TM8 + dst, left, top, 8);
DC8uvMode_SSE2(C8DC8 + dst, left, top);
VerticalPred_SSE2(C8VE8 + dst, top, 8);
HorizontalPred_SSE2(C8HE8 + dst, left, 8);
TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
// V block
dst += 8;
if (top != NULL) top += 8;
if (left != NULL) left += 16;
DC8uvMode(C8DC8 + dst, left, top);
VerticalPred(C8VE8 + dst, top, 8);
HorizontalPred(C8HE8 + dst, left, 8);
TrueMotion(C8TM8 + dst, left, top, 8);
DC8uvMode_SSE2(C8DC8 + dst, left, top);
VerticalPred_SSE2(C8VE8 + dst, top, 8);
HorizontalPred_SSE2(C8HE8 + dst, left, 8);
TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
}
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
static void Intra16Preds(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
DC16Mode(I16DC16 + dst, left, top);
VerticalPred(I16VE16 + dst, top, 16);
HorizontalPred(I16HE16 + dst, left, 16);
TrueMotion(I16TM16 + dst, left, top, 16);
static void Intra16Preds_SSE2(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
DC16Mode_SSE2(I16DC16 + dst, left, top);
VerticalPred_SSE2(I16VE16 + dst, top, 16);
HorizontalPred_SSE2(I16HE16 + dst, left, 16);
TrueMotion_SSE2(I16TM16 + dst, left, top, 16);
}
//------------------------------------------------------------------------------
// Metric
static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
__m128i* const sum) {
static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
const __m128i b,
__m128i* const sum) {
// take abs(a-b) in 8b
const __m128i a_b = _mm_subs_epu8(a, b);
const __m128i b_a = _mm_subs_epu8(b, a);
@ -951,8 +959,8 @@ static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
*sum = _mm_add_epi32(sum1, sum2);
}
static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
int num_pairs) {
static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
int num_pairs) {
__m128i sum = _mm_setzero_si128();
int32_t tmp[4];
int i;
@ -963,8 +971,8 @@ static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]);
__m128i sum1, sum2;
SubtractAndAccumulate(a0, b0, &sum1);
SubtractAndAccumulate(a1, b1, &sum2);
SubtractAndAccumulate_SSE2(a0, b0, &sum1);
SubtractAndAccumulate_SSE2(a1, b1, &sum2);
sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2));
a += 2 * BPS;
b += 2 * BPS;
@ -973,18 +981,18 @@ static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
return SSE_16xN(a, b, 8);
static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) {
return SSE_16xN_SSE2(a, b, 8);
}
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
return SSE_16xN(a, b, 4);
static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) {
return SSE_16xN_SSE2(a, b, 4);
}
#define LOAD_8x16b(ptr) \
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
const __m128i zero = _mm_setzero_si128();
int num_pairs = 4;
__m128i sum = zero;
@ -1011,7 +1019,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
}
#undef LOAD_8x16b
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
const __m128i zero = _mm_setzero_si128();
// Load values. Note that we read 8 pixels instead of 4,
@ -1048,7 +1056,7 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
//------------------------------------------------------------------------------
static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
const __m128i mask = _mm_set1_epi16(0x00ff);
const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
@ -1086,8 +1094,8 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform(const uint8_t* inA, const uint8_t* inB,
const uint16_t* const w) {
static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
const uint16_t* const w) {
int32_t sum[4];
__m128i tmp_0, tmp_1, tmp_2, tmp_3;
const __m128i zero = _mm_setzero_si128();
@ -1187,19 +1195,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
return sum[0] + sum[1] + sum[2] + sum[3];
}
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int diff_sum = TTransform(a, b, w);
static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int diff_sum = TTransform_SSE2(a, b, w);
return abs(diff_sum) >> 5;
}
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4(a + x + y, b + x + y, w);
D += Disto4x4_SSE2(a + x + y, b + x + y, w);
}
}
return D;
@ -1209,9 +1217,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
// Quantization
//
static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
const uint16_t* const sharpen,
const VP8Matrix* const mtx) {
static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
const uint16_t* const sharpen,
const VP8Matrix* const mtx) {
const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
const __m128i zero = _mm_setzero_si128();
__m128i coeff0, coeff8;
@ -1321,22 +1329,22 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx);
}
static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock(in, out, NULL, mtx);
static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
}
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
const uint16_t* const sharpen = &mtx->sharpen_[0];
nz = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
nz = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
nz |= DoQuantizeBlock_SSE2(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
return nz;
}
@ -1346,139 +1354,28 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
extern void VP8EncDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
VP8CollectHistogram = CollectHistogram;
VP8EncPredLuma16 = Intra16Preds;
VP8EncPredChroma8 = IntraChromaPreds;
VP8EncPredLuma4 = Intra4Preds;
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
VP8ITransform = ITransform;
VP8FTransform = FTransform;
VP8FTransform2 = FTransform2;
VP8FTransformWHT = FTransformWHT;
VP8SSE16x16 = SSE16x16;
VP8SSE16x8 = SSE16x8;
VP8SSE8x8 = SSE8x8;
VP8SSE4x4 = SSE4x4;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8Mean16x4 = Mean16x4;
}
//------------------------------------------------------------------------------
// SSIM / PSNR entry point (TODO(skal): move to its own file later)
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
const uint8_t* src2, int len) {
int i = 0;
uint32_t sse2 = 0;
if (len >= 16) {
const int limit = len - 32;
int32_t tmp[4];
__m128i sum1;
__m128i sum = _mm_setzero_si128();
__m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
__m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
i += 16;
while (i <= limit) {
const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
__m128i sum2;
i += 16;
SubtractAndAccumulate(a0, b0, &sum1);
sum = _mm_add_epi32(sum, sum1);
a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
i += 16;
SubtractAndAccumulate(a1, b1, &sum2);
sum = _mm_add_epi32(sum, sum2);
}
SubtractAndAccumulate(a0, b0, &sum1);
sum = _mm_add_epi32(sum, sum1);
_mm_storeu_si128((__m128i*)tmp, sum);
sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
for (; i < len; ++i) {
const int32_t diff = src1[i] - src2[i];
sse2 += diff * diff;
}
return sse2;
}
static uint32_t HorizontalAdd16b(const __m128i* const m) {
uint16_t tmp[8];
const __m128i a = _mm_srli_si128(*m, 8);
const __m128i b = _mm_add_epi16(*m, a);
_mm_storeu_si128((__m128i*)tmp, b);
return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
}
static uint32_t HorizontalAdd32b(const __m128i* const m) {
const __m128i a = _mm_srli_si128(*m, 8);
const __m128i b = _mm_add_epi32(*m, a);
const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
return (uint32_t)_mm_cvtsi128_si32(c);
}
static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
#define ACCUMULATE_ROW(WEIGHT) do { \
/* compute row weight (Wx * Wy) */ \
const __m128i Wy = _mm_set1_epi16((WEIGHT)); \
const __m128i W = _mm_mullo_epi16(Wx, Wy); \
/* process 8 bytes at a time (7 bytes, actually) */ \
const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
/* convert to 16b and multiply by weight */ \
const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \
const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \
const __m128i wa1 = _mm_mullo_epi16(a1, W); \
const __m128i wb1 = _mm_mullo_epi16(b1, W); \
/* accumulate */ \
xm = _mm_add_epi16(xm, wa1); \
ym = _mm_add_epi16(ym, wb1); \
xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \
xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \
yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \
src1 += stride1; \
src2 += stride2; \
} while (0)
static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2) {
VP8DistoStats stats;
const __m128i zero = _mm_setzero_si128();
__m128i xm = zero, ym = zero; // 16b accums
__m128i xxm = zero, yym = zero, xym = zero; // 32b accum
const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
assert(2 * VP8_SSIM_KERNEL + 1 == 7);
ACCUMULATE_ROW(1);
ACCUMULATE_ROW(2);
ACCUMULATE_ROW(3);
ACCUMULATE_ROW(4);
ACCUMULATE_ROW(3);
ACCUMULATE_ROW(2);
ACCUMULATE_ROW(1);
stats.xm = HorizontalAdd16b(&xm);
stats.ym = HorizontalAdd16b(&ym);
stats.xxm = HorizontalAdd32b(&xxm);
stats.xym = HorizontalAdd32b(&xym);
stats.yym = HorizontalAdd32b(&yym);
return VP8SSIMFromStats(&stats);
}
extern void VP8SSIMDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
VP8AccumulateSSE = AccumulateSSE_SSE2;
VP8SSIMGet = SSIMGet_SSE2;
VP8CollectHistogram = CollectHistogram_SSE2;
VP8EncPredLuma16 = Intra16Preds_SSE2;
VP8EncPredChroma8 = IntraChromaPreds_SSE2;
VP8EncPredLuma4 = Intra4Preds_SSE2;
VP8EncQuantizeBlock = QuantizeBlock_SSE2;
VP8EncQuantize2Blocks = Quantize2Blocks_SSE2;
VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE2;
VP8ITransform = ITransform_SSE2;
VP8FTransform = FTransform_SSE2;
VP8FTransform2 = FTransform2_SSE2;
VP8FTransformWHT = FTransformWHT_SSE2;
VP8SSE16x16 = SSE16x16_SSE2;
VP8SSE16x8 = SSE16x8_SSE2;
VP8SSE8x8 = SSE8x8_SSE2;
VP8SSE4x4 = SSE4x4_SSE2;
VP8TDisto4x4 = Disto4x4_SSE2;
VP8TDisto16x16 = Disto16x16_SSE2;
VP8Mean16x4 = Mean16x4_SSE2;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
#endif // WEBP_USE_SSE2

View file

@ -11,21 +11,21 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41)
#include <smmintrin.h>
#include <stdlib.h> // for abs()
#include "./common_sse2.h"
#include "../enc/vp8i_enc.h"
#include "src/dsp/common_sse2.h"
#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms.
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
@ -70,8 +70,8 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform(const uint8_t* inA, const uint8_t* inB,
const uint16_t* const w) {
static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
const uint16_t* const w) {
int32_t sum[4];
__m128i tmp_0, tmp_1, tmp_2, tmp_3;
@ -168,19 +168,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
return sum[0] + sum[1] + sum[2] + sum[3];
}
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int diff_sum = TTransform(a, b, w);
static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int diff_sum = TTransform_SSE41(a, b, w);
return abs(diff_sum) >> 5;
}
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4(a + x + y, b + x + y, w);
D += Disto4x4_SSE41(a + x + y, b + x + y, w);
}
}
return D;
@ -197,9 +197,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \
2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)
static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
const uint16_t* const sharpen,
const VP8Matrix* const mtx) {
static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
const uint16_t* const sharpen,
const VP8Matrix* const mtx) {
const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
const __m128i zero = _mm_setzero_si128();
__m128i out0, out8;
@ -300,22 +300,22 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
#undef PSHUFB_CST
static int QuantizeBlock(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
}
static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock(in, out, NULL, mtx);
static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
}
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
const uint16_t* const sharpen = &mtx->sharpen_[0];
nz = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
nz = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
nz |= DoQuantizeBlock_SSE41(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
return nz;
}
@ -324,12 +324,12 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
extern void VP8EncDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) {
VP8CollectHistogram = CollectHistogram;
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8CollectHistogram = CollectHistogram_SSE41;
VP8EncQuantizeBlock = QuantizeBlock_SSE41;
VP8EncQuantize2Blocks = Quantize2Blocks_SSE41;
VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE41;
VP8TDisto4x4 = Disto4x4_SSE41;
VP8TDisto16x16 = Disto16x16_SSE41;
}
#else // !WEBP_USE_SSE41

View file

@ -11,7 +11,7 @@
//
// Author: Urvang (urvang@google.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
@ -20,16 +20,17 @@
// Helpful macro.
# define SANITY_CHECK(in, out) \
assert(in != NULL); \
assert(out != NULL); \
assert((in) != NULL); \
assert((out) != NULL); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; // Silence unused warning.
static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length, int inverse) {
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length, int inverse) {
int i;
if (inverse) {
for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
@ -41,7 +42,44 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
//------------------------------------------------------------------------------
// Horizontal filter.
static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
preds += stride;
in += stride;
out += stride;
}
// Filter line-by-line.
while (row < last_row) {
// Leftmost pixel is predicted from above.
PredictLine_C(in, preds - stride, out, 1, inverse);
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
++row;
preds += stride;
in += stride;
out += stride;
}
}
//------------------------------------------------------------------------------
// Vertical filter.
static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
@ -53,48 +91,11 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
out += start_offset;
preds = inverse ? out : in;
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLine(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
preds += stride;
in += stride;
out += stride;
}
// Filter line-by-line.
while (row < last_row) {
// Leftmost pixel is predicted from above.
PredictLine(in, preds - stride, out, 1, inverse);
PredictLine(in + 1, preds, out + 1, width - 1, inverse);
++row;
preds += stride;
in += stride;
out += stride;
}
}
//------------------------------------------------------------------------------
// Vertical filter.
static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
if (row == 0) {
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLine(in + 1, preds, out + 1, width - 1, inverse);
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
in += stride;
out += stride;
@ -105,26 +106,28 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
// Filter line-by-line.
while (row < last_row) {
PredictLine(in, preds, out, width, inverse);
PredictLine_C(in, preds, out, width, inverse);
++row;
preds += stride;
in += stride;
out += stride;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Gradient filter.
static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
const int g = a + b - c;
return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit
}
static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
@ -136,7 +139,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
PredictLine(in + 1, preds, out + 1, width - 1, inverse);
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
preds += stride;
in += stride;
@ -147,11 +150,11 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
while (row < last_row) {
int w;
// leftmost pixel: predict from above.
PredictLine(in, preds - stride, out, 1, inverse);
PredictLine_C(in, preds - stride, out, 1, inverse);
for (w = 1; w < width; ++w) {
const int pred = GradientPredictor(preds[w - 1],
preds[w - stride],
preds[w - stride - 1]);
const int pred = GradientPredictor_C(preds[w - 1],
preds[w - stride],
preds[w - stride - 1]);
out[w] = in[w] + (inverse ? pred : -pred);
}
++row;
@ -160,32 +163,34 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
out += stride;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
#undef SANITY_CHECK
//------------------------------------------------------------------------------
static void HorizontalFilter(const uint8_t* data, int width, int height,
#if !WEBP_NEON_OMIT_C_CODE
static void HorizontalFilter_C(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter_C(data, width, height, stride, 0, height, 0,
filtered_data);
}
static void VerticalFilter_C(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
DoVerticalFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
}
static void VerticalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
static void GradientFilter_C(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
}
static void GradientFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
uint8_t pred = (prev == NULL) ? 0 : prev[0];
int i;
for (i = 0; i < width; ++i) {
@ -194,26 +199,28 @@ static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
}
}
static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
#if !WEBP_NEON_OMIT_C_CODE
static void VerticalUnfilter_C(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter(NULL, in, out, width);
HorizontalUnfilter_C(NULL, in, out, width);
} else {
int i;
for (i = 0; i < width; ++i) out[i] = prev[i] + in[i];
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
static void GradientUnfilter_C(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter(NULL, in, out, width);
HorizontalUnfilter_C(NULL, in, out, width);
} else {
uint8_t top = prev[0], top_left = top, left = top;
int i;
for (i = 0; i < width; ++i) {
top = prev[i]; // need to read this first, in case prev==out
left = in[i] + GradientPredictor(left, top, top_left);
left = in[i] + GradientPredictor_C(left, top, top_left);
top_left = top;
out[i] = left;
}
@ -238,14 +245,18 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
if (filters_last_cpuinfo_used == VP8GetCPUInfo) return;
WebPUnfilters[WEBP_FILTER_NONE] = NULL;
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
#if !WEBP_NEON_OMIT_C_CODE
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_C;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_C;
#endif
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_C;
WebPFilters[WEBP_FILTER_NONE] = NULL;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
#if !WEBP_NEON_OMIT_C_CODE
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_C;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_C;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_C;
#endif
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
@ -253,11 +264,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
VP8FiltersInitSSE2();
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
VP8FiltersInitNEON();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8FiltersInitMIPSdspR2();
@ -269,5 +275,20 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8FiltersInitNEON();
}
#endif
assert(WebPUnfilters[WEBP_FILTER_HORIZONTAL] != NULL);
assert(WebPUnfilters[WEBP_FILTER_VERTICAL] != NULL);
assert(WebPUnfilters[WEBP_FILTER_GRADIENT] != NULL);
assert(WebPFilters[WEBP_FILTER_HORIZONTAL] != NULL);
assert(WebPFilters[WEBP_FILTER_VERTICAL] != NULL);
assert(WebPFilters[WEBP_FILTER_GRADIENT] != NULL);
filters_last_cpuinfo_used = VP8GetCPUInfo;
}

View file

@ -12,11 +12,11 @@
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "../dsp/dsp.h"
#include "src/dsp/dsp.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
@ -101,8 +101,8 @@
); \
} while (0)
static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
int length) {
static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
int length) {
DO_PREDICT_LINE(src, dst, length, 0);
}
@ -192,10 +192,11 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
} \
} while (0)
static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
uint8_t* out) {
static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
int width, int height,
int stride,
int row, int num_rows,
uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
@ -207,7 +208,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLine(in + 1, out + 1, width - 1);
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
row = 1;
preds += stride;
in += stride;
@ -219,9 +220,11 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
}
#undef FILTER_LINE_BY_LINE
static void HorizontalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data);
static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter_MIPSdspR2(data, width, height, stride, 0, height,
filtered_data);
}
//------------------------------------------------------------------------------
@ -237,9 +240,11 @@ static void HorizontalFilter(const uint8_t* data, int width, int height,
} \
} while (0)
static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows, uint8_t* out) {
static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
int width, int height,
int stride,
int row, int num_rows,
uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
@ -252,7 +257,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLine(in + 1, out + 1, width - 1);
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
@ -266,15 +271,16 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
}
#undef FILTER_LINE_BY_LINE
static void VerticalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter(data, width, height, stride, 0, height, filtered_data);
static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter_MIPSdspR2(data, width, height, stride, 0, height,
filtered_data);
}
//------------------------------------------------------------------------------
// Gradient filter.
static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
int temp0;
__asm__ volatile (
"addu %[temp0], %[a], %[b] \n\t"
@ -293,9 +299,9 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
int w; \
PREDICT_LINE_ONE_PASS(in, PREDS - stride, out); \
for (w = 1; w < width; ++w) { \
const int pred = GradientPredictor(PREDS[w - 1], \
PREDS[w - stride], \
PREDS[w - stride - 1]); \
const int pred = GradientPredictor_MIPSdspR2(PREDS[w - 1], \
PREDS[w - stride], \
PREDS[w - stride - 1]); \
out[w] = in[w] OPERATION pred; \
} \
++row; \
@ -304,9 +310,9 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
} \
} while (0)
static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows, uint8_t* out) {
static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
@ -318,7 +324,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
PredictLine(in + 1, out + 1, width - 1);
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
row = 1;
preds += stride;
in += stride;
@ -330,38 +336,39 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
}
#undef FILTER_LINE_BY_LINE
static void GradientFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter(data, width, height, stride, 0, height, filtered_data);
static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter_MIPSdspR2(data, width, height, stride, 0, height,
filtered_data);
}
//------------------------------------------------------------------------------
static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
static void HorizontalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
DO_PREDICT_LINE(in + 1, out + 1, width - 1, 1);
}
static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
static void VerticalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter(NULL, in, out, width);
HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
} else {
DO_PREDICT_LINE_VERTICAL(in, prev, out, width, 1);
}
}
static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
static void GradientUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter(NULL, in, out, width);
HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
} else {
uint8_t top = prev[0], top_left = top, left = top;
int i;
for (i = 0; i < width; ++i) {
top = prev[i]; // need to read this first, in case prev==dst
left = in[i] + GradientPredictor(left, top, top_left);
left = in[i] + GradientPredictor_MIPSdspR2(left, top, top_left);
top_left = top;
out[i] = left;
}
@ -379,13 +386,13 @@ static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
extern void VP8FiltersInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) {
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_MIPSdspR2;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_MIPSdspR2;
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_MIPSdspR2;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MIPSdspR2;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MIPSdspR2;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2

View file

@ -11,11 +11,11 @@
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
#include "./msa_macro.h"
#include "src/dsp/msa_macro.h"
#include <assert.h>
@ -66,8 +66,8 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
//------------------------------------------------------------------------------
// Horrizontal filter
static void HorizontalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
const uint8_t* preds = data;
const uint8_t* in = data;
uint8_t* out = filtered_data;
@ -129,8 +129,8 @@ static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
}
static void GradientFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
static void GradientFilter_MSA(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
const uint8_t* in = data;
const uint8_t* preds = data;
uint8_t* out = filtered_data;
@ -157,8 +157,8 @@ static void GradientFilter(const uint8_t* data, int width, int height,
//------------------------------------------------------------------------------
// Vertical filter
static void VerticalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
const uint8_t* in = data;
const uint8_t* preds = data;
uint8_t* out = filtered_data;
@ -190,9 +190,9 @@ static void VerticalFilter(const uint8_t* data, int width, int height,
extern void VP8FiltersInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMSA(void) {
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MSA;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MSA;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MSA;
}
#else // !WEBP_USE_MSA

View file

@ -11,12 +11,12 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include "./neon.h"
#include "src/dsp/neon.h"
//------------------------------------------------------------------------------
// Helpful macros.
@ -134,7 +134,7 @@ static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
}
static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
int stride, uint8_t* filtered_data) {
DoVerticalFilter_NEON(data, width, height, stride, 0, height,
filtered_data);
}
@ -196,7 +196,7 @@ static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
}
static void GradientFilter_NEON(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
int stride, uint8_t* filtered_data) {
DoGradientFilter_NEON(data, width, height, stride, 0, height,
filtered_data);
}
@ -251,9 +251,11 @@ static void VerticalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
// GradientUnfilter_NEON is correct but slower than the C-version,
// at least on ARM64. For armv7, it's a wash.
// So best is to disable it for now, but keep the idea around...
// #define USE_GRADIENT_UNFILTER
#if !defined(USE_GRADIENT_UNFILTER)
#define USE_GRADIENT_UNFILTER 0 // ALTERNATE_CODE
#endif
#if defined(USE_GRADIENT_UNFILTER)
#if (USE_GRADIENT_UNFILTER == 1)
#define GRAD_PROCESS_LANE(L) do { \
const uint8x8_t tmp1 = ROTATE_RIGHT_N(pred, 1); /* rotate predictor in */ \
const int16x8_t tmp2 = vaddq_s16(BC, U8_TO_S16(tmp1)); \
@ -292,7 +294,7 @@ static void GradientPredictInverse_NEON(const uint8_t* const in,
#undef GRAD_PROCESS_LANE
static void GradientUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter_NEON(NULL, in, out, width);
} else {
@ -311,7 +313,7 @@ extern void VP8FiltersInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitNEON(void) {
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_NEON;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_NEON;
#if defined(USE_GRADIENT_UNFILTER)
#if (USE_GRADIENT_UNFILTER == 1)
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_NEON;
#endif

View file

@ -11,7 +11,7 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
@ -24,16 +24,16 @@
// Helpful macro.
# define SANITY_CHECK(in, out) \
assert(in != NULL); \
assert(out != NULL); \
assert((in) != NULL); \
assert((out) != NULL); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; // Silence unused warning.
static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length) {
static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length) {
int i;
const int max_pos = length & ~31;
assert(length >= 0);
@ -51,7 +51,7 @@ static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
}
// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) {
static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
int i;
const int max_pos = length & ~31;
assert(length >= 0);
@ -71,10 +71,11 @@ static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) {
//------------------------------------------------------------------------------
// Horizontal filter.
static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
uint8_t* out) {
static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
int width, int height,
int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
@ -84,7 +85,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLineLeft(in + 1, out + 1, width - 1);
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
@ -94,7 +95,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
while (row < last_row) {
// Leftmost pixel is predicted from above.
out[0] = in[0] - in[-stride];
PredictLineLeft(in + 1, out + 1, width - 1);
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
++row;
in += stride;
out += stride;
@ -104,9 +105,10 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
//------------------------------------------------------------------------------
// Vertical filter.
static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows, uint8_t* out) {
static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
@ -117,7 +119,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLineLeft(in + 1, out + 1, width - 1);
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
@ -125,7 +127,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
// Filter line-by-line.
while (row < last_row) {
PredictLineTop(in, in - stride, out, width);
PredictLineTop_SSE2(in, in - stride, out, width);
++row;
in += stride;
out += stride;
@ -135,14 +137,14 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
//------------------------------------------------------------------------------
// Gradient filter.
static WEBP_INLINE int GradientPredictorC(uint8_t a, uint8_t b, uint8_t c) {
static WEBP_INLINE int GradientPredictor_SSE2(uint8_t a, uint8_t b, uint8_t c) {
const int g = a + b - c;
return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit
}
static void GradientPredictDirect(const uint8_t* const row,
const uint8_t* const top,
uint8_t* const out, int length) {
static void GradientPredictDirect_SSE2(const uint8_t* const row,
const uint8_t* const top,
uint8_t* const out, int length) {
const int max_pos = length & ~7;
int i;
const __m128i zero = _mm_setzero_si128();
@ -161,14 +163,14 @@ static void GradientPredictDirect(const uint8_t* const row,
_mm_storel_epi64((__m128i*)(out + i), H);
}
for (; i < length; ++i) {
out[i] = row[i] - GradientPredictorC(row[i - 1], top[i], top[i - 1]);
out[i] = row[i] - GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
}
}
static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
uint8_t* out) {
static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
@ -178,7 +180,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
PredictLineLeft(in + 1, out + 1, width - 1);
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
@ -187,7 +189,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
// Filter line-by-line.
while (row < last_row) {
out[0] = in[0] - in[-stride];
GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1);
GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1);
++row;
in += stride;
out += stride;
@ -198,26 +200,27 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
//------------------------------------------------------------------------------
static void HorizontalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data);
static void HorizontalFilter_SSE2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter_SSE2(data, width, height, stride, 0, height,
filtered_data);
}
static void VerticalFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter(data, width, height, stride, 0, height, filtered_data);
static void VerticalFilter_SSE2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
}
static void GradientFilter(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter(data, width, height, stride, 0, height, filtered_data);
static void GradientFilter_SSE2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
}
//------------------------------------------------------------------------------
// Inverse transforms
static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
int i;
__m128i last;
out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
@ -238,10 +241,10 @@ static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
for (; i < width; ++i) out[i] = in[i] + out[i - 1];
}
static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter(NULL, in, out, width);
HorizontalUnfilter_SSE2(NULL, in, out, width);
} else {
int i;
const int max_pos = width & ~31;
@ -260,9 +263,9 @@ static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
}
}
static void GradientPredictInverse(const uint8_t* const in,
const uint8_t* const top,
uint8_t* const row, int length) {
static void GradientPredictInverse_SSE2(const uint8_t* const in,
const uint8_t* const top,
uint8_t* const row, int length) {
if (length > 0) {
int i;
const int max_pos = length & ~7;
@ -293,18 +296,18 @@ static void GradientPredictInverse(const uint8_t* const in,
_mm_storel_epi64((__m128i*)&row[i], out);
}
for (; i < length; ++i) {
row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]);
row[i] = in[i] + GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
}
}
}
static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
static void GradientUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter(NULL, in, out, width);
HorizontalUnfilter_SSE2(NULL, in, out, width);
} else {
out[0] = in[0] + prev[0]; // predict from above
GradientPredictInverse(in + 1, prev + 1, out + 1, width - 1);
GradientPredictInverse_SSE2(in + 1, prev + 1, out + 1, width - 1);
}
}
@ -314,13 +317,13 @@ static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
extern void VP8FiltersInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2;
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_SSE2;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_SSE2;
}
#else // !WEBP_USE_SSE2

View file

@ -13,14 +13,15 @@
// Jyrki Alakuijala (jyrki@google.com)
// Urvang Joshi (urvang@google.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include "../dec/vp8li_dec.h"
#include "../utils/endian_inl_utils.h"
#include "./lossless.h"
#include "./lossless_common.h"
#include "src/dec/vp8li_dec.h"
#include "src/utils/endian_inl_utils.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#define MAX_DIFF_COST (1e30f)
@ -80,8 +81,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
}
// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
#if defined(__arm__) && LOCAL_GCC_VERSION == 0x409
// gcc <= 4.9 on ARM generates incorrect code in Select() when Sub3() is
// inlined.
#if defined(__arm__) && LOCAL_GCC_VERSION <= 0x409
# define LOCAL_INLINE __attribute__ ((noinline))
#else
# define LOCAL_INLINE WEBP_INLINE
@ -107,69 +109,69 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
//------------------------------------------------------------------------------
// Predictors
static uint32_t Predictor0(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) {
(void)top;
(void)left;
return ARGB_BLACK;
}
static uint32_t Predictor1(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) {
(void)top;
return left;
}
static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) {
(void)left;
return top[0];
}
static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) {
(void)left;
return top[1];
}
static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) {
(void)left;
return top[-1];
}
static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average3(left, top[0], top[1]);
return pred;
}
static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[-1]);
return pred;
}
static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[0]);
return pred;
}
static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[-1], top[0]);
(void)left;
return pred;
}
static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[0], top[1]);
(void)left;
return pred;
}
static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
return pred;
}
static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Select(top[0], left, top[-1]);
return pred;
}
static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
return pred;
}
static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
return pred;
}
GENERATE_PREDICTOR_ADD(Predictor0, PredictorAdd0)
static void PredictorAdd1(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
GENERATE_PREDICTOR_ADD(Predictor0_C, PredictorAdd0_C)
static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
uint32_t left = out[-1];
for (i = 0; i < num_pixels; ++i) {
@ -177,29 +179,29 @@ static void PredictorAdd1(const uint32_t* in, const uint32_t* upper,
}
(void)upper;
}
GENERATE_PREDICTOR_ADD(Predictor2, PredictorAdd2)
GENERATE_PREDICTOR_ADD(Predictor3, PredictorAdd3)
GENERATE_PREDICTOR_ADD(Predictor4, PredictorAdd4)
GENERATE_PREDICTOR_ADD(Predictor5, PredictorAdd5)
GENERATE_PREDICTOR_ADD(Predictor6, PredictorAdd6)
GENERATE_PREDICTOR_ADD(Predictor7, PredictorAdd7)
GENERATE_PREDICTOR_ADD(Predictor8, PredictorAdd8)
GENERATE_PREDICTOR_ADD(Predictor9, PredictorAdd9)
GENERATE_PREDICTOR_ADD(Predictor10, PredictorAdd10)
GENERATE_PREDICTOR_ADD(Predictor11, PredictorAdd11)
GENERATE_PREDICTOR_ADD(Predictor12, PredictorAdd12)
GENERATE_PREDICTOR_ADD(Predictor13, PredictorAdd13)
GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C)
GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C)
GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C)
GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C)
GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C)
GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C)
GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C)
GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C)
GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C)
GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C)
GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C)
GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C)
//------------------------------------------------------------------------------
// Inverse prediction.
static void PredictorInverseTransform(const VP8LTransform* const transform,
int y_start, int y_end,
const uint32_t* in, uint32_t* out) {
static void PredictorInverseTransform_C(const VP8LTransform* const transform,
int y_start, int y_end,
const uint32_t* in, uint32_t* out) {
const int width = transform->xsize_;
if (y_start == 0) { // First Row follows the L (mode=1) mode.
PredictorAdd0(in, NULL, 1, out);
PredictorAdd1(in + 1, NULL, width - 1, out + 1);
PredictorAdd0_C(in, NULL, 1, out);
PredictorAdd1_C(in + 1, NULL, width - 1, out + 1);
in += width;
out += width;
++y_start;
@ -217,7 +219,7 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
const uint32_t* pred_mode_src = pred_mode_base;
int x = 1;
// First pixel follows the T (mode=2) mode.
PredictorAdd2(in, out - width, 1, out);
PredictorAdd2_C(in, out - width, 1, out);
// .. the rest:
while (x < width) {
const VP8LPredictorAddSubFunc pred_func =
@ -272,8 +274,8 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
const uint32_t argb = src[i];
const uint32_t green = argb >> 8;
const uint32_t red = argb >> 16;
int new_red = red;
int new_blue = argb;
int new_red = red & 0xff;
int new_blue = argb & 0xff;
new_red += ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue += ColorTransformDelta(m->green_to_blue_, green);
@ -284,9 +286,9 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
}
// Color space inverse transform.
static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
int y_start, int y_end,
const uint32_t* src, uint32_t* dst) {
static void ColorSpaceInverseTransform_C(const VP8LTransform* const transform,
int y_start, int y_end,
const uint32_t* src, uint32_t* dst) {
const int width = transform->xsize_;
const int tile_width = 1 << transform->bits_;
const int mask = tile_width - 1;
@ -362,10 +364,10 @@ STATIC_DECL void FUNC_NAME(const VP8LTransform* const transform, \
} \
}
COLOR_INDEX_INVERSE(ColorIndexInverseTransform, MapARGB, static, uint32_t, 32b,
VP8GetARGBIndex, VP8GetARGBValue)
COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha, , uint8_t,
8b, VP8GetAlphaIndex, VP8GetAlphaValue)
COLOR_INDEX_INVERSE(ColorIndexInverseTransform_C, MapARGB_C, static,
uint32_t, 32b, VP8GetARGBIndex, VP8GetARGBValue)
COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha_C, ,
uint8_t, 8b, VP8GetAlphaIndex, VP8GetAlphaValue)
#undef COLOR_INDEX_INVERSE
@ -380,7 +382,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out);
break;
case PREDICTOR_TRANSFORM:
PredictorInverseTransform(transform, row_start, row_end, in, out);
PredictorInverseTransform_C(transform, row_start, row_end, in, out);
if (row_end != transform->ysize_) {
// The last predicted row in this iteration will be the top-pred row
// for the first row in next iteration.
@ -389,7 +391,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
}
break;
case CROSS_COLOR_TRANSFORM:
ColorSpaceInverseTransform(transform, row_start, row_end, in, out);
ColorSpaceInverseTransform_C(transform, row_start, row_end, in, out);
break;
case COLOR_INDEXING_TRANSFORM:
if (in == out && transform->bits_ > 0) {
@ -403,9 +405,9 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
VP8LSubSampleSize(transform->xsize_, transform->bits_);
uint32_t* const src = out + out_stride - in_stride;
memmove(src, out, in_stride * sizeof(*src));
ColorIndexInverseTransform(transform, row_start, row_end, src, out);
ColorIndexInverseTransform_C(transform, row_start, row_end, src, out);
} else {
ColorIndexInverseTransform(transform, row_start, row_end, in, out);
ColorIndexInverseTransform_C(transform, row_start, row_end, in, out);
}
break;
}
@ -452,7 +454,7 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
const uint32_t argb = *src++;
const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
const uint8_t ba = ((argb >> 0) & 0xf0) | ((argb >> 28) & 0xf);
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
*dst++ = ba;
*dst++ = rg;
#else
@ -469,7 +471,7 @@ void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
const uint32_t argb = *src++;
const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
const uint8_t gb = ((argb >> 5) & 0xe0) | ((argb >> 3) & 0x1f);
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
*dst++ = gb;
*dst++ = rg;
#else
@ -496,22 +498,7 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
#if !defined(WORDS_BIGENDIAN)
#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
WebPUint32ToMem(dst, BSwap32(argb));
#else // WEBP_REFERENCE_IMPLEMENTATION
dst[0] = (argb >> 24) & 0xff;
dst[1] = (argb >> 16) & 0xff;
dst[2] = (argb >> 8) & 0xff;
dst[3] = (argb >> 0) & 0xff;
#endif
#else // WORDS_BIGENDIAN
dst[0] = (argb >> 0) & 0xff;
dst[1] = (argb >> 8) & 0xff;
dst[2] = (argb >> 16) & 0xff;
dst[3] = (argb >> 24) & 0xff;
#endif
dst += sizeof(argb);
}
} else {
@ -593,23 +580,23 @@ extern void VP8LDspInitMSA(void);
static volatile VP8CPUInfo lossless_last_cpuinfo_used =
(VP8CPUInfo)&lossless_last_cpuinfo_used;
#define COPY_PREDICTOR_ARRAY(IN, OUT) do { \
(OUT)[0] = IN##0; \
(OUT)[1] = IN##1; \
(OUT)[2] = IN##2; \
(OUT)[3] = IN##3; \
(OUT)[4] = IN##4; \
(OUT)[5] = IN##5; \
(OUT)[6] = IN##6; \
(OUT)[7] = IN##7; \
(OUT)[8] = IN##8; \
(OUT)[9] = IN##9; \
(OUT)[10] = IN##10; \
(OUT)[11] = IN##11; \
(OUT)[12] = IN##12; \
(OUT)[13] = IN##13; \
(OUT)[14] = IN##0; /* <- padding security sentinels*/ \
(OUT)[15] = IN##0; \
#define COPY_PREDICTOR_ARRAY(IN, OUT) do { \
(OUT)[0] = IN##0_C; \
(OUT)[1] = IN##1_C; \
(OUT)[2] = IN##2_C; \
(OUT)[3] = IN##3_C; \
(OUT)[4] = IN##4_C; \
(OUT)[5] = IN##5_C; \
(OUT)[6] = IN##6_C; \
(OUT)[7] = IN##7_C; \
(OUT)[8] = IN##8_C; \
(OUT)[9] = IN##9_C; \
(OUT)[10] = IN##10_C; \
(OUT)[11] = IN##11_C; \
(OUT)[12] = IN##12_C; \
(OUT)[13] = IN##13_C; \
(OUT)[14] = IN##0_C; /* <- padding security sentinels*/ \
(OUT)[15] = IN##0_C; \
} while (0);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
@ -620,18 +607,21 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)
#if !WEBP_NEON_OMIT_C_CODE
VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
VP8LTransformColorInverse = VP8LTransformColorInverse_C;
VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C;
VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
#endif
VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C;
VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C;
VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
VP8LMapColor32b = MapARGB;
VP8LMapColor8b = MapAlpha;
VP8LMapColor32b = MapARGB_C;
VP8LMapColor8b = MapAlpha_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@ -640,11 +630,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
VP8LDspInitSSE2();
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
VP8LDspInitNEON();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8LDspInitMIPSdspR2();
@ -656,6 +641,24 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8LDspInitNEON();
}
#endif
assert(VP8LAddGreenToBlueAndRed != NULL);
assert(VP8LTransformColorInverse != NULL);
assert(VP8LConvertBGRAToRGBA != NULL);
assert(VP8LConvertBGRAToRGB != NULL);
assert(VP8LConvertBGRAToBGR != NULL);
assert(VP8LConvertBGRAToRGBA4444 != NULL);
assert(VP8LConvertBGRAToRGB565 != NULL);
assert(VP8LMapColor32b != NULL);
assert(VP8LMapColor8b != NULL);
lossless_last_cpuinfo_used = VP8GetCPUInfo;
}
#undef COPY_PREDICTOR_ARRAY

View file

@ -15,18 +15,18 @@
#ifndef WEBP_DSP_LOSSLESS_H_
#define WEBP_DSP_LOSSLESS_H_
#include "../webp/types.h"
#include "../webp/decode.h"
#include "src/webp/types.h"
#include "src/webp/decode.h"
#include "../enc/histogram_enc.h"
#include "../utils/utils.h"
#include "src/enc/histogram_enc.h"
#include "src/utils/utils.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifdef WEBP_EXPERIMENTAL_FEATURES
#include "../enc/delta_palettization_enc.h"
#include "src/enc/delta_palettization_enc.h"
#endif // WEBP_EXPERIMENTAL_FEATURES
//------------------------------------------------------------------------------
@ -124,7 +124,7 @@ void VP8LDspInit(void);
typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
uint32_t* const dst, int num_pixels);
uint32_t* dst, int num_pixels);
extern VP8LTransformColorFunc VP8LTransformColor;
typedef void (*VP8LCollectColorBlueTransformsFunc)(
const uint32_t* argb, int stride,

View file

@ -16,9 +16,9 @@
#ifndef WEBP_DSP_LOSSLESS_COMMON_H_
#define WEBP_DSP_LOSSLESS_COMMON_H_
#include "../webp/types.h"
#include "src/webp/types.h"
#include "../utils/utils.h"
#include "src/utils/utils.h"
#ifdef __cplusplus
extern "C" {
@ -93,14 +93,6 @@ static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
// -----------------------------------------------------------------------------
// PrefixEncode()
static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
const int log_floor = BitsLog2Floor(n);
if (n == (n & ~(n - 1))) { // zero or a power of two.
return log_floor;
}
return log_floor + 1;
}
// Splitting of distance and length codes into prefixes and
// extra bits. The prefixes are encoded with an entropy code
// while the extra bits are stored just as normal bits.

View file

@ -13,15 +13,16 @@
// Jyrki Alakuijala (jyrki@google.com)
// Urvang Joshi (urvang@google.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include "../dec/vp8li_dec.h"
#include "../utils/endian_inl_utils.h"
#include "./lossless.h"
#include "./lossless_common.h"
#include "./yuv.h"
#include "src/dec/vp8li_dec.h"
#include "src/utils/endian_inl_utils.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#include "src/dsp/yuv.h"
// lookup table for small values of log2(int)
const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
@ -325,7 +326,7 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
};
static float FastSLog2Slow(uint32_t v) {
static float FastSLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
int log_cnt = 0;
@ -351,7 +352,7 @@ static float FastSLog2Slow(uint32_t v) {
}
}
static float FastLog2Slow(uint32_t v) {
static float FastLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
int log_cnt = 0;
@ -380,7 +381,7 @@ static float FastLog2Slow(uint32_t v) {
// Methods to calculate Entropy (Shannon).
// Compute the combined Shanon's entropy for distribution {X} and {X+Y}
static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
int i;
double retval = 0.;
int sumX = 0, sumXY = 0;
@ -453,9 +454,9 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
*i_prev = i;
}
static void GetEntropyUnrefined(const uint32_t X[], int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
static void GetEntropyUnrefined_C(const uint32_t X[], int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
int i;
int i_prev = 0;
uint32_t x_prev = X[0];
@ -474,10 +475,11 @@ static void GetEntropyUnrefined(const uint32_t X[], int length,
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}
static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
const uint32_t Y[],
int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
int i = 1;
int i_prev = 0;
uint32_t xy_prev = X[0] + Y[0];
@ -520,8 +522,8 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
const uint32_t argb = data[i];
const uint32_t green = argb >> 8;
const uint32_t red = argb >> 16;
int new_red = red;
int new_blue = argb;
int new_red = red & 0xff;
int new_blue = argb & 0xff;
new_red -= ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue -= ColorTransformDelta(m->green_to_blue_, green);
@ -577,8 +579,8 @@ void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
//------------------------------------------------------------------------------
static int VectorMismatch(const uint32_t* const array1,
const uint32_t* const array2, int length) {
static int VectorMismatch_C(const uint32_t* const array1,
const uint32_t* const array2, int length) {
int match_len = 0;
while (match_len < length && array1[match_len] == array2[match_len]) {
@ -610,15 +612,15 @@ void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
//------------------------------------------------------------------------------
static double ExtraCost(const uint32_t* population, int length) {
static double ExtraCost_C(const uint32_t* population, int length) {
int i;
double cost = 0.;
for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
return cost;
}
static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y,
int length) {
static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
int length) {
int i;
double cost = 0.;
for (i = 2; i < length - 2; ++i) {
@ -630,9 +632,9 @@ static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y,
//------------------------------------------------------------------------------
static void HistogramAdd(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out) {
static void HistogramAdd_C(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out) {
int i;
const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
assert(a->palette_code_bits_ == b->palette_code_bits_);
@ -869,26 +871,28 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
VP8LDspInit();
#if !WEBP_NEON_OMIT_C_CODE
VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
VP8LTransformColor = VP8LTransformColor_C;
#endif
VP8LCollectColorBlueTransforms = VP8LCollectColorBlueTransforms_C;
VP8LCollectColorRedTransforms = VP8LCollectColorRedTransforms_C;
VP8LFastLog2Slow = FastLog2Slow;
VP8LFastSLog2Slow = FastSLog2Slow;
VP8LFastLog2Slow = FastLog2Slow_C;
VP8LFastSLog2Slow = FastSLog2Slow_C;
VP8LExtraCost = ExtraCost;
VP8LExtraCostCombined = ExtraCostCombined;
VP8LCombinedShannonEntropy = CombinedShannonEntropy;
VP8LExtraCost = ExtraCost_C;
VP8LExtraCostCombined = ExtraCostCombined_C;
VP8LCombinedShannonEntropy = CombinedShannonEntropy_C;
VP8LGetEntropyUnrefined = GetEntropyUnrefined;
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
VP8LGetEntropyUnrefined = GetEntropyUnrefined_C;
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C;
VP8LHistogramAdd = HistogramAdd;
VP8LHistogramAdd = HistogramAdd_C;
VP8LVectorMismatch = VectorMismatch;
VP8LVectorMismatch = VectorMismatch_C;
VP8LBundleColorMap = VP8LBundleColorMap_C;
VP8LPredictorsSub[0] = PredictorSub0_C;
@ -937,11 +941,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
#endif
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
VP8LEncDspInitNEON();
}
#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8LEncDspInitMIPS32();
@ -958,6 +957,61 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8LEncDspInitNEON();
}
#endif
assert(VP8LSubtractGreenFromBlueAndRed != NULL);
assert(VP8LTransformColor != NULL);
assert(VP8LCollectColorBlueTransforms != NULL);
assert(VP8LCollectColorRedTransforms != NULL);
assert(VP8LFastLog2Slow != NULL);
assert(VP8LFastSLog2Slow != NULL);
assert(VP8LExtraCost != NULL);
assert(VP8LExtraCostCombined != NULL);
assert(VP8LCombinedShannonEntropy != NULL);
assert(VP8LGetEntropyUnrefined != NULL);
assert(VP8LGetCombinedEntropyUnrefined != NULL);
assert(VP8LHistogramAdd != NULL);
assert(VP8LVectorMismatch != NULL);
assert(VP8LBundleColorMap != NULL);
assert(VP8LPredictorsSub[0] != NULL);
assert(VP8LPredictorsSub[1] != NULL);
assert(VP8LPredictorsSub[2] != NULL);
assert(VP8LPredictorsSub[3] != NULL);
assert(VP8LPredictorsSub[4] != NULL);
assert(VP8LPredictorsSub[5] != NULL);
assert(VP8LPredictorsSub[6] != NULL);
assert(VP8LPredictorsSub[7] != NULL);
assert(VP8LPredictorsSub[8] != NULL);
assert(VP8LPredictorsSub[9] != NULL);
assert(VP8LPredictorsSub[10] != NULL);
assert(VP8LPredictorsSub[11] != NULL);
assert(VP8LPredictorsSub[12] != NULL);
assert(VP8LPredictorsSub[13] != NULL);
assert(VP8LPredictorsSub[14] != NULL);
assert(VP8LPredictorsSub[15] != NULL);
assert(VP8LPredictorsSub_C[0] != NULL);
assert(VP8LPredictorsSub_C[1] != NULL);
assert(VP8LPredictorsSub_C[2] != NULL);
assert(VP8LPredictorsSub_C[3] != NULL);
assert(VP8LPredictorsSub_C[4] != NULL);
assert(VP8LPredictorsSub_C[5] != NULL);
assert(VP8LPredictorsSub_C[6] != NULL);
assert(VP8LPredictorsSub_C[7] != NULL);
assert(VP8LPredictorsSub_C[8] != NULL);
assert(VP8LPredictorsSub_C[9] != NULL);
assert(VP8LPredictorsSub_C[10] != NULL);
assert(VP8LPredictorsSub_C[11] != NULL);
assert(VP8LPredictorsSub_C[12] != NULL);
assert(VP8LPredictorsSub_C[13] != NULL);
assert(VP8LPredictorsSub_C[14] != NULL);
assert(VP8LPredictorsSub_C[15] != NULL);
lossless_enc_last_cpuinfo_used = VP8GetCPUInfo;
}

View file

@ -12,9 +12,9 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#include "./lossless.h"
#include "./lossless_common.h"
#include "src/dsp/dsp.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#if defined(WEBP_USE_MIPS32)
@ -23,7 +23,7 @@
#include <stdlib.h>
#include <string.h>
static float FastSLog2Slow(uint32_t v) {
static float FastSLog2Slow_MIPS32(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
uint32_t log_cnt, y, correction;
@ -59,7 +59,7 @@ static float FastSLog2Slow(uint32_t v) {
}
}
static float FastLog2Slow(uint32_t v) {
static float FastLog2Slow_MIPS32(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
uint32_t log_cnt, y;
@ -104,7 +104,7 @@ static float FastLog2Slow(uint32_t v) {
// pop += 2;
// }
// return (double)cost;
static double ExtraCost(const uint32_t* const population, int length) {
static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
int i, temp0, temp1;
const uint32_t* pop = &population[4];
const uint32_t* const LoopEnd = &population[length];
@ -149,8 +149,8 @@ static double ExtraCost(const uint32_t* const population, int length) {
// pY += 2;
// }
// return (double)cost;
static double ExtraCostCombined(const uint32_t* const X,
const uint32_t* const Y, int length) {
static double ExtraCostCombined_MIPS32(const uint32_t* const X,
const uint32_t* const Y, int length) {
int i, temp0, temp1, temp2, temp3;
const uint32_t* pX = &X[4];
const uint32_t* pY = &Y[4];
@ -241,9 +241,9 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
*i_prev = i;
}
static void GetEntropyUnrefined(const uint32_t X[], int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
int i;
int i_prev = 0;
uint32_t x_prev = X[0];
@ -262,26 +262,27 @@ static void GetEntropyUnrefined(const uint32_t X[], int length,
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}
static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
const uint32_t Y[],
int length,
VP8LBitEntropy* const entropy,
VP8LStreaks* const stats) {
int i = 1;
int i_prev = 0;
uint32_t xy_prev = X[0] + Y[0];
memset(stats, 0, sizeof(*stats));
VP8LBitEntropyInit(bit_entropy);
VP8LBitEntropyInit(entropy);
for (i = 1; i < length; ++i) {
const uint32_t xy = X[i] + Y[i];
if (xy != xy_prev) {
GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats);
}
}
GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
entropy->entropy += VP8LFastSLog2(entropy->sum);
}
#define ASM_START \
@ -374,9 +375,9 @@ static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
} \
} while (0)
static void HistogramAdd(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out) {
static void HistogramAdd_MIPS32(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)
- (NUM_LITERAL_CODES + NUM_LENGTH_CODES);
@ -415,13 +416,13 @@ static void HistogramAdd(const VP8LHistogram* const a,
extern void VP8LEncDspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
VP8LFastSLog2Slow = FastSLog2Slow;
VP8LFastLog2Slow = FastLog2Slow;
VP8LExtraCost = ExtraCost;
VP8LExtraCostCombined = ExtraCostCombined;
VP8LGetEntropyUnrefined = GetEntropyUnrefined;
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
VP8LHistogramAdd = HistogramAdd;
VP8LFastSLog2Slow = FastSLog2Slow_MIPS32;
VP8LFastLog2Slow = FastLog2Slow_MIPS32;
VP8LExtraCost = ExtraCost_MIPS32;
VP8LExtraCostCombined = ExtraCostCombined_MIPS32;
VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32;
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32;
VP8LHistogramAdd = HistogramAdd_MIPS32;
}
#else // !WEBP_USE_MIPS32

View file

@ -12,14 +12,14 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "./lossless.h"
#include "src/dsp/lossless.h"
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data,
int num_pixels) {
static void SubtractGreenFromBlueAndRed_MIPSdspR2(uint32_t* argb_data,
int num_pixels) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
uint32_t* const p_loop1_end = argb_data + (num_pixels & ~3);
uint32_t* const p_loop2_end = p_loop1_end + (num_pixels & 3);
@ -78,8 +78,8 @@ static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
return (uint32_t)((int)(color_pred) * color) >> 5;
}
static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
int num_pixels) {
static void TransformColor_MIPSdspR2(const VP8LMultipliers* const m,
uint32_t* data, int num_pixels) {
int temp0, temp1, temp2, temp3, temp4, temp5;
uint32_t argb, argb1, new_red, new_red1;
const uint32_t G_to_R = m->green_to_red_;
@ -171,10 +171,13 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
return (new_blue & 0xff);
}
static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
static void CollectColorBlueTransforms_MIPSdspR2(const uint32_t* argb,
int stride,
int tile_width,
int tile_height,
int green_to_blue,
int red_to_blue,
int histo[]) {
const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
const uint32_t mask = 0xff00ffu;
@ -222,9 +225,12 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
return (new_red & 0xff);
}
static void CollectColorRedTransforms(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
static void CollectColorRedTransforms_MIPSdspR2(const uint32_t* argb,
int stride,
int tile_width,
int tile_height,
int green_to_red,
int histo[]) {
const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
while (tile_height-- > 0) {
int x;
@ -262,10 +268,10 @@ static void CollectColorRedTransforms(const uint32_t* argb, int stride,
extern void VP8LEncDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPSdspR2(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
VP8LTransformColor = TransformColor;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
VP8LCollectColorRedTransforms = CollectColorRedTransforms;
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MIPSdspR2;
VP8LTransformColor = TransformColor_MIPSdspR2;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_MIPSdspR2;
VP8LCollectColorRedTransforms = CollectColorRedTransforms_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2

View file

@ -11,12 +11,12 @@
//
// Authors: Prashant Patil (Prashant.Patil@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
#include "./lossless.h"
#include "./msa_macro.h"
#include "src/dsp/lossless.h"
#include "src/dsp/msa_macro.h"
#define TRANSFORM_COLOR_8(src0, src1, dst0, dst1, c0, c1, mask0, mask1) do { \
v8i16 g0, g1, t0, t1, t2, t3; \
@ -48,8 +48,8 @@
dst = VSHF_UB(src, t0, mask1); \
} while (0)
static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
int num_pixels) {
static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data,
int num_pixels) {
v16u8 src0, dst0;
const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
(m->green_to_red_ << 16));
@ -94,7 +94,8 @@ static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
}
}
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
static void SubtractGreenFromBlueAndRed_MSA(uint32_t* argb_data,
int num_pixels) {
int i;
uint8_t* ptemp_data = (uint8_t*)argb_data;
v16u8 src0, dst0, tmp0;
@ -136,8 +137,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
extern void VP8LEncDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMSA(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
VP8LTransformColor = TransformColor;
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MSA;
VP8LTransformColor = TransformColor_MSA;
}
#else // !WEBP_USE_MSA

View file

@ -11,14 +11,14 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include <arm_neon.h>
#include "./lossless.h"
#include "./neon.h"
#include "src/dsp/lossless.h"
#include "src/dsp/neon.h"
//------------------------------------------------------------------------------
// Subtract-Green Transform
@ -36,8 +36,8 @@ static const uint8_t kGreenShuffle[16] = {
1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
};
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
const uint8x16_t shuffle) {
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
const uint8x16_t shuffle) {
return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
vtbl1q_u8(argb, vget_high_u8(shuffle)));
}
@ -45,14 +45,15 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
const uint8x8_t shuffle) {
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
const uint8x8_t shuffle) {
return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
vtbl1_u8(vget_high_u8(argb), shuffle));
}
#endif // USE_VTBLQ
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
static void SubtractGreenFromBlueAndRed_NEON(uint32_t* argb_data,
int num_pixels) {
const uint32_t* const end = argb_data + (num_pixels & ~3);
#ifdef USE_VTBLQ
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
@ -61,7 +62,7 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
#endif
for (; argb_data < end; argb_data += 4) {
const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
}
// fallthrough and finish off with plain-C
@ -71,8 +72,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
//------------------------------------------------------------------------------
// Color Transform
static void TransformColor(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
static void TransformColor_NEON(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
// sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
const int16_t rb[8] = {
@ -102,7 +103,7 @@ static void TransformColor(const VP8LMultipliers* const m,
for (i = 0; i + 4 <= num_pixels; i += 4) {
const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
// 0 g 0 g
const uint8x16_t greens = DoGreenShuffle(in, shuffle);
const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
// x dr x db1
const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
// r 0 b 0
@ -132,8 +133,8 @@ static void TransformColor(const VP8LMultipliers* const m,
extern void VP8LEncDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
VP8LTransformColor = TransformColor;
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_NEON;
VP8LTransformColor = TransformColor_NEON;
}
#else // !WEBP_USE_NEON

View file

@ -11,22 +11,23 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include "./lossless.h"
#include "./common_sse2.h"
#include "./lossless_common.h"
#include "src/dsp/lossless.h"
#include "src/dsp/common_sse2.h"
#include "src/dsp/lossless_common.h"
// For sign-extended multiplying constants, pre-shifted by 5:
#define CST_5b(X) (((int16_t)((uint16_t)X << 8)) >> 5)
#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
//------------------------------------------------------------------------------
// Subtract-Green Transform
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
int num_pixels) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
@ -45,8 +46,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
//------------------------------------------------------------------------------
// Color Transform
static void TransformColor(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
static void TransformColor_SSE2(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
const __m128i mults_rb = _mm_set_epi16(
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
@ -80,10 +81,10 @@ static void TransformColor(const VP8LMultipliers* const m,
//------------------------------------------------------------------------------
#define SPAN 8
static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
const __m128i mults_r = _mm_set_epi16(
CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0);
@ -131,9 +132,9 @@ static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
}
}
static void CollectColorRedTransforms(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
const __m128i mults_g = _mm_set_epi16(
0, CST_5b(green_to_red), 0, CST_5b(green_to_red),
0, CST_5b(green_to_red), 0, CST_5b(green_to_red));
@ -177,8 +178,8 @@ static void CollectColorRedTransforms(const uint32_t* argb, int stride,
//------------------------------------------------------------------------------
#define LINE_SIZE 16 // 8 or 16
static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
int size) {
static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out,
int size) {
int i;
assert(size % LINE_SIZE == 0);
for (i = 0; i < size; i += LINE_SIZE) {
@ -203,7 +204,7 @@ static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
}
}
static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
int i;
assert(size % LINE_SIZE == 0);
for (i = 0; i < size; i += LINE_SIZE) {
@ -231,22 +232,22 @@ static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
// that's ok since the histogram values are less than 1<<28 (max picture size).
static void HistogramAdd(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out) {
static void HistogramAdd_SSE2(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out) {
int i;
const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
assert(a->palette_code_bits_ == b->palette_code_bits_);
if (b != out) {
AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
AddVector_SSE2(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
AddVector_SSE2(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
AddVector_SSE2(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
AddVector_SSE2(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
} else {
AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES);
AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
AddVectorEq_SSE2(a->literal_, out->literal_, NUM_LITERAL_CODES);
AddVectorEq_SSE2(a->red_, out->red_, NUM_LITERAL_CODES);
AddVectorEq_SSE2(a->blue_, out->blue_, NUM_LITERAL_CODES);
AddVectorEq_SSE2(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
}
for (i = NUM_LITERAL_CODES; i < literal_size; ++i) {
out->literal_[i] = a->literal_[i] + b->literal_[i];
@ -261,9 +262,9 @@ static void HistogramAdd(const VP8LHistogram* const a,
// Checks whether the X or Y contribution is worth computing and adding.
// Used in loop unrolling.
#define ANALYZE_X_OR_Y(x_or_y, j) \
do { \
if (x_or_y[i + j] != 0) retval -= VP8LFastSLog2(x_or_y[i + j]); \
#define ANALYZE_X_OR_Y(x_or_y, j) \
do { \
if ((x_or_y)[i + (j)] != 0) retval -= VP8LFastSLog2((x_or_y)[i + (j)]); \
} while (0)
// Checks whether the X + Y contribution is worth computing and adding.
@ -276,7 +277,7 @@ static void HistogramAdd(const VP8LHistogram* const a,
} \
} while (0)
static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
int i;
double retval = 0.;
int sumX, sumXY;
@ -332,8 +333,8 @@ static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
//------------------------------------------------------------------------------
static int VectorMismatch(const uint32_t* const array1,
const uint32_t* const array2, int length) {
static int VectorMismatch_SSE2(const uint32_t* const array1,
const uint32_t* const array2, int length) {
int match_len;
if (length >= 12) {
@ -574,8 +575,8 @@ static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
}
// Predictor11: select.
static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
__m128i* const out) {
static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
__m128i* const out) {
// We can unpack with any value on the upper 32 bits, provided it's the same
// on both operands (to that their sum of abs diff is zero). Here we use *A.
const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
@ -596,8 +597,8 @@ static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
__m128i pa, pb;
GetSumAbsDiff32(&T, &TL, &pa); // pa = sum |T-TL|
GetSumAbsDiff32(&L, &TL, &pb); // pb = sum |L-TL|
GetSumAbsDiff32_SSE2(&T, &TL, &pa); // pa = sum |T-TL|
GetSumAbsDiff32_SSE2(&L, &TL, &pb); // pb = sum |L-TL|
{
const __m128i mask = _mm_cmpgt_epi32(pb, pa);
const __m128i A = _mm_and_si128(mask, L);
@ -677,13 +678,13 @@ static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
extern void VP8LEncDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
VP8LTransformColor = TransformColor;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
VP8LCollectColorRedTransforms = CollectColorRedTransforms;
VP8LHistogramAdd = HistogramAdd;
VP8LCombinedShannonEntropy = CombinedShannonEntropy;
VP8LVectorMismatch = VectorMismatch;
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE2;
VP8LTransformColor = TransformColor_SSE2;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE2;
VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
VP8LHistogramAdd = HistogramAdd_SSE2;
VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
VP8LVectorMismatch = VectorMismatch_SSE2;
VP8LBundleColorMap = BundleColorMap_SSE2;
VP8LPredictorsSub[0] = PredictorSub0_SSE2;

View file

@ -11,17 +11,18 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE41)
#include <assert.h>
#include <smmintrin.h>
#include "./lossless.h"
#include "src/dsp/lossless.h"
//------------------------------------------------------------------------------
// Subtract-Green Transform
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
int num_pixels) {
int i;
const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9,
-1, 5, -1, 5, -1, 1, -1, 1);
@ -43,7 +44,7 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
extern void VP8LEncDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
}
#else // !WEBP_USE_SSE41

View file

@ -12,12 +12,12 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "./lossless.h"
#include "./lossless_common.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#define MAP_COLOR_FUNCS(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE) \
static void FUNC_NAME(const TYPE* src, \
@ -86,8 +86,8 @@ static void FUNC_NAME(const TYPE* src, \
} \
}
MAP_COLOR_FUNCS(MapARGB, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
MAP_COLOR_FUNCS(MapAlpha, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
MAP_COLOR_FUNCS(MapARGB_MIPSdspR2, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
MAP_COLOR_FUNCS(MapAlpha_MIPSdspR2, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
#undef MAP_COLOR_FUNCS
@ -188,48 +188,52 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
return Average2(Average2(a0, a1), Average2(a2, a3));
}
static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor5_MIPSdspR2(uint32_t left, const uint32_t* const top) {
return Average3(left, top[0], top[1]);
}
static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor6_MIPSdspR2(uint32_t left, const uint32_t* const top) {
return Average2(left, top[-1]);
}
static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor7_MIPSdspR2(uint32_t left, const uint32_t* const top) {
return Average2(left, top[0]);
}
static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor8_MIPSdspR2(uint32_t left, const uint32_t* const top) {
(void)left;
return Average2(top[-1], top[0]);
}
static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor9_MIPSdspR2(uint32_t left, const uint32_t* const top) {
(void)left;
return Average2(top[0], top[1]);
}
static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor10_MIPSdspR2(uint32_t left,
const uint32_t* const top) {
return Average4(left, top[-1], top[0], top[1]);
}
static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor11_MIPSdspR2(uint32_t left,
const uint32_t* const top) {
return Select(top[0], left, top[-1]);
}
static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor12_MIPSdspR2(uint32_t left,
const uint32_t* const top) {
return ClampedAddSubtractFull(left, top[0], top[-1]);
}
static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
static uint32_t Predictor13_MIPSdspR2(uint32_t left,
const uint32_t* const top) {
return ClampedAddSubtractHalf(left, top[0], top[-1]);
}
// Add green to blue and red channels (i.e. perform the inverse transform of
// 'subtract green').
static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
uint32_t* dst) {
static void AddGreenToBlueAndRed_MIPSdspR2(const uint32_t* src, int num_pixels,
uint32_t* dst) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@ -285,9 +289,9 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
);
}
static void TransformColorInverse(const VP8LMultipliers* const m,
const uint32_t* src, int num_pixels,
uint32_t* dst) {
static void TransformColorInverse_MIPSdspR2(const VP8LMultipliers* const m,
const uint32_t* src, int num_pixels,
uint32_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5;
uint32_t argb, argb1, new_red;
const uint32_t G_to_R = m->green_to_red_;
@ -356,8 +360,8 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
if (num_pixels & 1) VP8LTransformColorInverse_C(m, src, 1, dst);
}
static void ConvertBGRAToRGB(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGB_MIPSdspR2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@ -408,8 +412,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
);
}
static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA_MIPSdspR2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@ -458,8 +462,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
);
}
static void ConvertBGRAToRGBA4444(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA4444_MIPSdspR2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@ -492,7 +496,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
"ins %[temp3], %[temp5], 16, 4 \n\t"
"addiu %[src], %[src], 16 \n\t"
"precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t"
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
"usw %[temp1], 0(%[dst]) \n\t"
"usw %[temp3], 4(%[dst]) \n\t"
#else
@ -514,7 +518,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
"ins %[temp0], %[temp5], 16, 4 \n\t"
"addiu %[src], %[src], 4 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp0] \n\t"
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
"ush %[temp0], 0(%[dst]) \n\t"
#else
"wsbh %[temp0], %[temp0] \n\t"
@ -532,8 +536,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
);
}
static void ConvertBGRAToRGB565(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGB565_MIPSdspR2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@ -570,7 +574,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
"ins %[temp2], %[temp3], 0, 5 \n\t"
"addiu %[src], %[src], 16 \n\t"
"append %[temp2], %[temp1], 16 \n\t"
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
"usw %[temp0], 0(%[dst]) \n\t"
"usw %[temp2], 4(%[dst]) \n\t"
#else
@ -592,7 +596,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
"ins %[temp4], %[temp5], 0, 11 \n\t"
"addiu %[src], %[src], 4 \n\t"
"ins %[temp4], %[temp0], 0, 5 \n\t"
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
"ush %[temp4], 0(%[dst]) \n\t"
#else
"wsbh %[temp4], %[temp4] \n\t"
@ -610,8 +614,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
);
}
static void ConvertBGRAToBGR(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToBGR_MIPSdspR2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
@ -662,24 +666,27 @@ static void ConvertBGRAToBGR(const uint32_t* src,
extern void VP8LDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPSdspR2(void) {
VP8LMapColor32b = MapARGB;
VP8LMapColor8b = MapAlpha;
VP8LPredictors[5] = Predictor5;
VP8LPredictors[6] = Predictor6;
VP8LPredictors[7] = Predictor7;
VP8LPredictors[8] = Predictor8;
VP8LPredictors[9] = Predictor9;
VP8LPredictors[10] = Predictor10;
VP8LPredictors[11] = Predictor11;
VP8LPredictors[12] = Predictor12;
VP8LPredictors[13] = Predictor13;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
VP8LTransformColorInverse = TransformColorInverse;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
VP8LMapColor32b = MapARGB_MIPSdspR2;
VP8LMapColor8b = MapAlpha_MIPSdspR2;
VP8LPredictors[5] = Predictor5_MIPSdspR2;
VP8LPredictors[6] = Predictor6_MIPSdspR2;
VP8LPredictors[7] = Predictor7_MIPSdspR2;
VP8LPredictors[8] = Predictor8_MIPSdspR2;
VP8LPredictors[9] = Predictor9_MIPSdspR2;
VP8LPredictors[10] = Predictor10_MIPSdspR2;
VP8LPredictors[11] = Predictor11_MIPSdspR2;
VP8LPredictors[12] = Predictor12_MIPSdspR2;
VP8LPredictors[13] = Predictor13_MIPSdspR2;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MIPSdspR2;
VP8LTransformColorInverse = TransformColorInverse_MIPSdspR2;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MIPSdspR2;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MIPSdspR2;
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_MIPSdspR2;
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_MIPSdspR2;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2

View file

@ -11,12 +11,12 @@
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
#include "./lossless.h"
#include "./msa_macro.h"
#include "src/dsp/lossless.h"
#include "src/dsp/msa_macro.h"
//------------------------------------------------------------------------------
// Colorspace conversion functions
@ -43,7 +43,7 @@
#define CONVERT8_BGRA_XXX(psrc, pdst, m0, m1) do { \
uint64_t pix_d; \
v16u8 src0, src1, src2, dst0, dst1; \
v16u8 src0, src1, src2 = { 0 }, dst0, dst1; \
LD_UB2(psrc, 16, src0, src1); \
VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \
ST_UB(dst0, pdst); \
@ -109,8 +109,8 @@
dst = VSHF_UB(src, t0, mask1); \
} while (0)
static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA_MSA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int i;
const uint8_t* ptemp_src = (const uint8_t*)src;
uint8_t* ptemp_dst = (uint8_t*)dst;
@ -150,8 +150,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
}
}
static void ConvertBGRAToBGR(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToBGR_MSA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint8_t* ptemp_src = (const uint8_t*)src;
uint8_t* ptemp_dst = (uint8_t*)dst;
const v16u8 mask0 = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
@ -197,8 +197,8 @@ static void ConvertBGRAToBGR(const uint32_t* src,
}
}
static void ConvertBGRAToRGB(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGB_MSA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint8_t* ptemp_src = (const uint8_t*)src;
uint8_t* ptemp_dst = (uint8_t*)dst;
const v16u8 mask0 = { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12,
@ -244,8 +244,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
}
}
static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
uint32_t* dst) {
static void AddGreenToBlueAndRed_MSA(const uint32_t* const src, int num_pixels,
uint32_t* dst) {
int i;
const uint8_t* in = (const uint8_t*)src;
uint8_t* out = (uint8_t*)dst;
@ -286,9 +286,9 @@ static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
}
}
static void TransformColorInverse(const VP8LMultipliers* const m,
const uint32_t* src, int num_pixels,
uint32_t* dst) {
static void TransformColorInverse_MSA(const VP8LMultipliers* const m,
const uint32_t* src, int num_pixels,
uint32_t* dst) {
v16u8 src0, dst0;
const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
(m->green_to_red_ << 16));
@ -341,11 +341,12 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
extern void VP8LDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMSA(void) {
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
VP8LTransformColorInverse = TransformColorInverse;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MSA;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MSA;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MSA;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MSA;
VP8LTransformColorInverse = TransformColorInverse_MSA;
}
#else // !WEBP_USE_MSA

View file

@ -11,14 +11,14 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include <arm_neon.h>
#include "./lossless.h"
#include "./neon.h"
#include "src/dsp/lossless.h"
#include "src/dsp/neon.h"
//------------------------------------------------------------------------------
// Colorspace conversion functions
@ -26,8 +26,8 @@
#if !defined(WORK_AROUND_GCC)
// gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
// gcc-4.8.x at least.
static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@ -41,8 +41,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst); // left-overs
}
static void ConvertBGRAToBGR(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToBGR_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@ -53,8 +53,8 @@ static void ConvertBGRAToBGR(const uint32_t* src,
VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst); // left-overs
}
static void ConvertBGRAToRGB(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGB_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@ -71,8 +71,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~1);
const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
for (; src < end; src += 2) {
@ -89,8 +89,8 @@ static const uint8_t kBGRShuffle[3][8] = {
{ 21, 22, 24, 25, 26, 28, 29, 30 }
};
static void ConvertBGRAToBGR(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToBGR_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~7);
const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
@ -116,8 +116,8 @@ static const uint8_t kRGBShuffle[3][8] = {
{ 21, 20, 26, 25, 24, 30, 29, 28 }
};
static void ConvertBGRAToRGB(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGB_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~7);
const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
@ -139,7 +139,6 @@ static void ConvertBGRAToRGB(const uint32_t* src,
#endif // !WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Predictor Transform
@ -506,8 +505,8 @@ static const uint8_t kGreenShuffle[16] = {
1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
};
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
const uint8x16_t shuffle) {
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
const uint8x16_t shuffle) {
return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
vtbl1q_u8(argb, vget_high_u8(shuffle)));
}
@ -515,15 +514,15 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
const uint8x8_t shuffle) {
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
const uint8x8_t shuffle) {
return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
vtbl1_u8(vget_high_u8(argb), shuffle));
}
#endif // USE_VTBLQ
static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
uint32_t* dst) {
static void AddGreenToBlueAndRed_NEON(const uint32_t* src, int num_pixels,
uint32_t* dst) {
const uint32_t* const end = src + (num_pixels & ~3);
#ifdef USE_VTBLQ
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
@ -532,7 +531,7 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
#endif
for (; src < end; src += 4, dst += 4) {
const uint8x16_t argb = vld1q_u8((const uint8_t*)src);
const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens));
}
// fallthrough and finish off with plain-C
@ -542,9 +541,9 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
//------------------------------------------------------------------------------
// Color Transform
static void TransformColorInverse(const VP8LMultipliers* const m,
const uint32_t* const src, int num_pixels,
uint32_t* dst) {
static void TransformColorInverse_NEON(const VP8LMultipliers* const m,
const uint32_t* const src,
int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
const int16_t rb[8] = {
@ -575,7 +574,7 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i));
const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag);
// 0 g 0 g
const uint8x16_t greens = DoGreenShuffle(in, shuffle);
const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
// x dr x db1
const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
// x r' x b'
@ -627,12 +626,12 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) {
VP8LPredictorsAdd[12] = PredictorAdd12_NEON;
VP8LPredictorsAdd[13] = PredictorAdd13_NEON;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_NEON;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_NEON;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_NEON;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
VP8LTransformColorInverse = TransformColorInverse;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_NEON;
VP8LTransformColorInverse = TransformColorInverse_NEON;
}
#else // !WEBP_USE_NEON

View file

@ -11,21 +11,22 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include "./common_sse2.h"
#include "./lossless.h"
#include "./lossless_common.h"
#include "src/dsp/common_sse2.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#include <assert.h>
#include <emmintrin.h>
//------------------------------------------------------------------------------
// Predictor Transform
static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
uint32_t c2) {
static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,
uint32_t c1,
uint32_t c2) {
const __m128i zero = _mm_setzero_si128();
const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
@ -37,8 +38,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
return output;
}
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
uint32_t c2) {
static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
uint32_t c1,
uint32_t c2) {
const __m128i zero = _mm_setzero_si128();
const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
@ -55,7 +57,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
return output;
}
static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
int pa_minus_pb;
const __m128i zero = _mm_setzero_si128();
const __m128i A0 = _mm_cvtsi32_si128(a);
@ -88,8 +90,9 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
*avg = _mm_sub_epi8(avg1, one);
}
static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
__m128i* const avg) {
static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
const uint32_t a1,
__m128i* const avg) {
// (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
const __m128i ones = _mm_set1_epi8(1);
const __m128i A0 = _mm_cvtsi32_si128(a0);
@ -99,7 +102,7 @@ static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
*avg = _mm_sub_epi8(avg1, one);
}
static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) {
static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
const __m128i zero = _mm_setzero_si128();
const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
@ -107,15 +110,16 @@ static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) {
return _mm_srli_epi16(sum, 1);
}
static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {
__m128i output;
Average2_uint32(a0, a1, &output);
Average2_uint32_SSE2(a0, a1, &output);
return _mm_cvtsi128_si32(output);
}
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,
uint32_t a2) {
const __m128i zero = _mm_setzero_si128();
const __m128i avg1 = Average2_uint32_16(a0, a2);
const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);
const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
const __m128i sum = _mm_add_epi16(avg1, A1);
const __m128i avg2 = _mm_srli_epi16(sum, 1);
@ -124,10 +128,10 @@ static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
return output;
}
static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
uint32_t a2, uint32_t a3) {
const __m128i avg1 = Average2_uint32_16(a0, a1);
const __m128i avg2 = Average2_uint32_16(a2, a3);
static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
uint32_t a2, uint32_t a3) {
const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
const __m128i sum = _mm_add_epi16(avg2, avg1);
const __m128i avg3 = _mm_srli_epi16(sum, 1);
const __m128i A0 = _mm_packus_epi16(avg3, avg3);
@ -136,41 +140,41 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
}
static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average3(left, top[0], top[1]);
const uint32_t pred = Average3_SSE2(left, top[0], top[1]);
return pred;
}
static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[-1]);
const uint32_t pred = Average2_SSE2(left, top[-1]);
return pred;
}
static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[0]);
const uint32_t pred = Average2_SSE2(left, top[0]);
return pred;
}
static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[-1], top[0]);
const uint32_t pred = Average2_SSE2(top[-1], top[0]);
(void)left;
return pred;
}
static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[0], top[1]);
const uint32_t pred = Average2_SSE2(top[0], top[1]);
(void)left;
return pred;
}
static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]);
return pred;
}
static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Select(top[0], left, top[-1]);
const uint32_t pred = Select_SSE2(top[0], left, top[-1]);
return pred;
}
static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]);
return pred;
}
static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]);
return pred;
}
@ -272,9 +276,24 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
#undef GENERATE_PREDICTOR_2
// Predictor10: average of (average of (L,TL), average of (T, TR)).
#define DO_PRED10(OUT) do { \
__m128i avgLTL, avg; \
Average2_m128i(&L, &TL, &avgLTL); \
Average2_m128i(&avgTTR, &avgLTL, &avg); \
L = _mm_add_epi8(avg, src); \
out[i + (OUT)] = _mm_cvtsi128_si32(L); \
} while (0)
#define DO_PRED10_SHIFT do { \
/* Rotate the pre-computed values for the next iteration.*/ \
avgTTR = _mm_srli_si128(avgTTR, 4); \
TL = _mm_srli_si128(TL, 4); \
src = _mm_srli_si128(src, 4); \
} while (0)
static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i, j;
int i;
__m128i L = _mm_cvtsi32_si128(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
@ -283,79 +302,90 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
__m128i avgTTR;
Average2_m128i(&T, &TR, &avgTTR);
for (j = 0; j < 4; ++j) {
__m128i avgLTL, avg;
Average2_m128i(&L, &TL, &avgLTL);
Average2_m128i(&avgTTR, &avgLTL, &avg);
L = _mm_add_epi8(avg, src);
out[i + j] = _mm_cvtsi128_si32(L);
// Rotate the pre-computed values for the next iteration.
avgTTR = _mm_srli_si128(avgTTR, 4);
TL = _mm_srli_si128(TL, 4);
src = _mm_srli_si128(src, 4);
}
DO_PRED10(0);
DO_PRED10_SHIFT;
DO_PRED10(1);
DO_PRED10_SHIFT;
DO_PRED10(2);
DO_PRED10_SHIFT;
DO_PRED10(3);
}
if (i != num_pixels) {
VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
}
}
#undef DO_PRED10
#undef DO_PRED10_SHIFT
// Predictor11: select.
static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
__m128i* const out) {
// We can unpack with any value on the upper 32 bits, provided it's the same
// on both operands (to that their sum of abs diff is zero). Here we use *A.
const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
*out = _mm_packs_epi32(s_lo, s_hi);
}
#define DO_PRED11(OUT) do { \
const __m128i L_lo = _mm_unpacklo_epi32(L, T); \
const __m128i TL_lo = _mm_unpacklo_epi32(TL, T); \
const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/ \
const __m128i mask = _mm_cmpgt_epi32(pb, pa); \
const __m128i A = _mm_and_si128(mask, L); \
const __m128i B = _mm_andnot_si128(mask, T); \
const __m128i pred = _mm_or_si128(A, B); /* pred = (pa > b)? L : T*/ \
L = _mm_add_epi8(src, pred); \
out[i + (OUT)] = _mm_cvtsi128_si32(L); \
} while (0)
#define DO_PRED11_SHIFT do { \
/* Shift the pre-computed value for the next iteration.*/ \
T = _mm_srli_si128(T, 4); \
TL = _mm_srli_si128(TL, 4); \
src = _mm_srli_si128(src, 4); \
pa = _mm_srli_si128(pa, 4); \
} while (0)
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i, j;
int i;
__m128i pa;
__m128i L = _mm_cvtsi32_si128(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
__m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
__m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
__m128i pa;
GetSumAbsDiff32(&T, &TL, &pa); // pa = sum |T-TL|
for (j = 0; j < 4; ++j) {
const __m128i L_lo = _mm_unpacklo_epi32(L, L);
const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); // pb = sum |L-TL|
const __m128i mask = _mm_cmpgt_epi32(pb, pa);
const __m128i A = _mm_and_si128(mask, L);
const __m128i B = _mm_andnot_si128(mask, T);
const __m128i pred = _mm_or_si128(A, B); // pred = (L > T)? L : T
L = _mm_add_epi8(src, pred);
out[i + j] = _mm_cvtsi128_si32(L);
// Shift the pre-computed value for the next iteration.
T = _mm_srli_si128(T, 4);
TL = _mm_srli_si128(TL, 4);
src = _mm_srli_si128(src, 4);
pa = _mm_srli_si128(pa, 4);
{
// We can unpack with any value on the upper 32 bits, provided it's the
// same on both operands (so that their sum of abs diff is zero). Here we
// use T.
const __m128i T_lo = _mm_unpacklo_epi32(T, T);
const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
const __m128i T_hi = _mm_unpackhi_epi32(T, T);
const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
pa = _mm_packs_epi32(s_lo, s_hi); // pa = sum |T-TL|
}
DO_PRED11(0);
DO_PRED11_SHIFT;
DO_PRED11(1);
DO_PRED11_SHIFT;
DO_PRED11(2);
DO_PRED11_SHIFT;
DO_PRED11(3);
}
if (i != num_pixels) {
VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
}
}
#undef DO_PRED11
#undef DO_PRED11_SHIFT
// Predictor12: ClampedAddSubtractFull.
#define DO_PRED12(DIFF, LANE, OUT) \
do { \
const __m128i all = _mm_add_epi16(L, (DIFF)); \
const __m128i alls = _mm_packus_epi16(all, all); \
const __m128i res = _mm_add_epi8(src, alls); \
out[i + (OUT)] = _mm_cvtsi128_si32(res); \
L = _mm_unpacklo_epi8(res, zero); \
#define DO_PRED12(DIFF, LANE, OUT) do { \
const __m128i all = _mm_add_epi16(L, (DIFF)); \
const __m128i alls = _mm_packus_epi16(all, all); \
const __m128i res = _mm_add_epi8(src, alls); \
out[i + (OUT)] = _mm_cvtsi128_si32(res); \
L = _mm_unpacklo_epi8(res, zero); \
} while (0)
#define DO_PRED12_SHIFT(DIFF, LANE) do { \
/* Shift the pre-computed value for the next iteration.*/ \
if (LANE == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \
if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \
src = _mm_srli_si128(src, 4); \
} while (0)
@ -377,8 +407,11 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
__m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
__m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
DO_PRED12(diff_lo, 0, 0);
DO_PRED12_SHIFT(diff_lo, 0);
DO_PRED12(diff_lo, 1, 1);
DO_PRED12_SHIFT(diff_lo, 1);
DO_PRED12(diff_hi, 0, 2);
DO_PRED12_SHIFT(diff_hi, 0);
DO_PRED12(diff_hi, 1, 3);
}
if (i != num_pixels) {
@ -386,6 +419,7 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
}
}
#undef DO_PRED12
#undef DO_PRED12_SHIFT
// Due to averages with integers, values cannot be accumulated in parallel for
// predictors 13.
@ -394,8 +428,8 @@ GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)
//------------------------------------------------------------------------------
// Subtract-Green Transform
static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
uint32_t* dst) {
static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
uint32_t* dst) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
@ -414,9 +448,9 @@ static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
//------------------------------------------------------------------------------
// Color Transform
static void TransformColorInverse(const VP8LMultipliers* const m,
const uint32_t* const src, int num_pixels,
uint32_t* dst) {
static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
const uint32_t* const src,
int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
const __m128i mults_rb = _mm_set_epi16(
@ -454,8 +488,8 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
//------------------------------------------------------------------------------
// Color-space conversion functions
static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,
uint8_t* dst) {
static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
uint8_t* dst) {
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
@ -490,27 +524,26 @@ static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,
}
}
static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
while (num_pixels >= 8) {
const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...
const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...
const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...
const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...
const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7
const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7
const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7
const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7
const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0); // r0g0r1g1 ... r6g6r7g7
const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0); // b0a0b1a1 ... b6a6b7a7
const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0); // rgba0|rgba1...
const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0); // rgba4|rgba5...
_mm_storeu_si128(out++, rgba0);
_mm_storeu_si128(out++, rgba4);
const __m128i A1 = _mm_loadu_si128(in++);
const __m128i A2 = _mm_loadu_si128(in++);
const __m128i B1 = _mm_and_si128(A1, red_blue_mask); // R 0 B 0
const __m128i B2 = _mm_and_si128(A2, red_blue_mask); // R 0 B 0
const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1); // 0 G 0 A
const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2); // 0 G 0 A
const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i F1 = _mm_or_si128(E1, C1);
const __m128i F2 = _mm_or_si128(E2, C2);
_mm_storeu_si128(out++, F1);
_mm_storeu_si128(out++, F2);
num_pixels -= 8;
}
// left-overs
@ -519,8 +552,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
}
}
static void ConvertBGRAToRGBA4444(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
const __m128i* in = (const __m128i*)src;
@ -541,7 +574,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f); // g0-|g1-|...|a6-|a7-
const __m128i rgba0 = _mm_or_si128(ga2, rb1); // rg0..rg7 | ba0..ba7
const __m128i rgba1 = _mm_srli_si128(rgba0, 8); // ba0..ba7 | 0
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0); // barg0...barg7
#else
const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1); // rgba0...rgba7
@ -555,8 +588,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
}
}
static void ConvertBGRAToRGB565(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
const __m128i mask_0x07 = _mm_set1_epi8(0x07);
@ -582,7 +615,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
const __m128i rg1 = _mm_or_si128(rb1, g_lo2); // gr0...gr7|xx
const __m128i b1 = _mm_srli_epi16(b0, 3);
const __m128i gb1 = _mm_or_si128(b1, g_hi2); // bg0...bg7|xx
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1); // rggb0...rggb7
#else
const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1); // bgrb0...bgrb7
@ -596,8 +629,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
}
}
static void ConvertBGRAToBGR(const uint32_t* src,
int num_pixels, uint8_t* dst) {
static void ConvertBGRAToBGR_SSE2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
const __m128i* in = (const __m128i*)src;
@ -660,14 +693,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
VP8LTransformColorInverse = TransformColorInverse;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2;
VP8LTransformColorInverse = TransformColorInverse_SSE2;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2;
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2;
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2;
}
#else // !WEBP_USE_SSE2

View file

@ -22,6 +22,7 @@
#endif
#ifdef CLANG_BUILD
#define ALPHAVAL (-1)
#define ADDVI_H(a, b) __msa_addvi_h((v8i16)a, b)
#define ADDVI_W(a, b) __msa_addvi_w((v4i32)a, b)
#define SRAI_B(a, b) __msa_srai_b((v16i8)a, b)
@ -32,6 +33,7 @@
#define ANDI_B(a, b) __msa_andi_b((v16u8)a, b)
#define ORI_B(a, b) __msa_ori_b((v16u8)a, b)
#else
#define ALPHAVAL (0xff)
#define ADDVI_H(a, b) (a + b)
#define ADDVI_W(a, b) (a + b)
#define SRAI_B(a, b) (a >> b)

View file

@ -14,11 +14,12 @@
#include <arm_neon.h>
#include "./dsp.h"
#include "src/dsp/dsp.h"
// Right now, some intrinsics functions seem slower, so we disable them
// everywhere except aarch64 where the inline assembly is incompatible.
#if defined(__aarch64__)
// everywhere except newer clang/gcc or aarch64 where the inline assembly is
// incompatible.
#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__)
#define WEBP_USE_INTRINSICS // use intrinsics when possible
#endif
@ -43,11 +44,11 @@
// if using intrinsics, this flag avoids some functions that make gcc-4.6.3
// crash ("internal compiler error: in immed_double_const, at emit-rtl.").
// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
#define WORK_AROUND_GCC
#endif
static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
uint64x2x2_t row01, row23;
row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);

View file

@ -13,8 +13,8 @@
#include <assert.h>
#include "./dsp.h"
#include "../utils/rescaler_utils.h"
#include "src/dsp/dsp.h"
#include "src/utils/rescaler_utils.h"
//------------------------------------------------------------------------------
// Implementations of critical functions ImportRow / ExportRow
@ -25,7 +25,8 @@
//------------------------------------------------------------------------------
// Row import
void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
int channel;
@ -56,7 +57,8 @@ void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
}
}
void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
void WebPRescalerImportRowShrink_C(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
int channel;
@ -92,7 +94,7 @@ void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
//------------------------------------------------------------------------------
// Row export
void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) {
void WebPRescalerExportRowExpand_C(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@ -123,7 +125,7 @@ void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) {
}
}
void WebPRescalerExportRowShrinkC(WebPRescaler* const wrk) {
void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@ -207,11 +209,14 @@ static volatile VP8CPUInfo rescaler_last_cpuinfo_used =
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
if (rescaler_last_cpuinfo_used == VP8GetCPUInfo) return;
#if !defined(WEBP_REDUCE_SIZE)
#if !WEBP_NEON_OMIT_C_CODE
WebPRescalerExportRowExpand = WebPRescalerExportRowExpand_C;
WebPRescalerExportRowShrink = WebPRescalerExportRowShrink_C;
#endif
WebPRescalerImportRowExpand = WebPRescalerImportRowExpandC;
WebPRescalerImportRowShrink = WebPRescalerImportRowShrinkC;
WebPRescalerExportRowExpand = WebPRescalerExportRowExpandC;
WebPRescalerExportRowShrink = WebPRescalerExportRowShrinkC;
WebPRescalerImportRowExpand = WebPRescalerImportRowExpand_C;
WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C;
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
@ -219,11 +224,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
WebPRescalerDspInitSSE2();
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
WebPRescalerDspInitNEON();
}
#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
WebPRescalerDspInitMIPS32();
@ -240,5 +240,18 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPRescalerDspInitNEON();
}
#endif
assert(WebPRescalerExportRowExpand != NULL);
assert(WebPRescalerExportRowShrink != NULL);
assert(WebPRescalerImportRowExpand != NULL);
assert(WebPRescalerImportRowShrink != NULL);
#endif // WEBP_REDUCE_SIZE
rescaler_last_cpuinfo_used = VP8GetCPUInfo;
}

View file

@ -11,17 +11,18 @@
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS32)
#if defined(WEBP_USE_MIPS32) && !defined(WEBP_REDUCE_SIZE)
#include <assert.h>
#include "../utils/rescaler_utils.h"
#include "src/utils/rescaler_utils.h"
//------------------------------------------------------------------------------
// Row import
static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) {
static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const int fx_scale = wrk->fx_scale;
@ -80,7 +81,8 @@ static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) {
}
}
static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) {
static void ImportRowExpand_MIPS32(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const int x_add = wrk->x_add;
@ -144,7 +146,7 @@ static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) {
//------------------------------------------------------------------------------
// Row export
static void ExportRowExpand(WebPRescaler* const wrk) {
static void ExportRowExpand_MIPS32(WebPRescaler* const wrk) {
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
@ -207,7 +209,7 @@ static void ExportRowExpand(WebPRescaler* const wrk) {
}
}
static void ExportRowShrink(WebPRescaler* const wrk) {
static void ExportRowShrink_MIPS32(WebPRescaler* const wrk) {
const int x_out_max = wrk->dst_width * wrk->num_channels;
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
@ -278,10 +280,10 @@ static void ExportRowShrink(WebPRescaler* const wrk) {
extern void WebPRescalerDspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPS32(void) {
WebPRescalerImportRowExpand = ImportRowExpand;
WebPRescalerImportRowShrink = ImportRowShrink;
WebPRescalerExportRowExpand = ExportRowExpand;
WebPRescalerExportRowShrink = ExportRowShrink;
WebPRescalerImportRowExpand = ImportRowExpand_MIPS32;
WebPRescalerImportRowShrink = ImportRowShrink_MIPS32;
WebPRescalerExportRowExpand = ExportRowExpand_MIPS32;
WebPRescalerExportRowShrink = ExportRowShrink_MIPS32;
}
#else // !WEBP_USE_MIPS32

View file

@ -11,12 +11,12 @@
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#if defined(WEBP_USE_MIPS_DSP_R2) && !defined(WEBP_REDUCE_SIZE)
#include <assert.h>
#include "../utils/rescaler_utils.h"
#include "src/utils/rescaler_utils.h"
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
@ -24,7 +24,7 @@
//------------------------------------------------------------------------------
// Row export
static void ExportRowShrink(WebPRescaler* const wrk) {
static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
int i;
const int x_out_max = wrk->dst_width * wrk->num_channels;
uint8_t* dst = wrk->dst;
@ -162,7 +162,7 @@ static void ExportRowShrink(WebPRescaler* const wrk) {
}
}
static void ExportRowExpand(WebPRescaler* const wrk) {
static void ExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
int i;
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
@ -303,8 +303,8 @@ static void ExportRowExpand(WebPRescaler* const wrk) {
extern void WebPRescalerDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) {
WebPRescalerExportRowExpand = ExportRowExpand;
WebPRescalerExportRowShrink = ExportRowShrink;
WebPRescalerExportRowExpand = ExportRowExpand_MIPSdspR2;
WebPRescalerExportRowShrink = ExportRowShrink_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2

View file

@ -11,14 +11,14 @@
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
#if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
#include <assert.h>
#include "../utils/rescaler_utils.h"
#include "./msa_macro.h"
#include "src/utils/rescaler_utils.h"
#include "src/dsp/msa_macro.h"
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
@ -246,7 +246,7 @@ static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
}
}
static void RescalerExportRowExpand(WebPRescaler* const wrk) {
static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
@ -411,7 +411,7 @@ static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
}
}
static void RescalerExportRowShrink(WebPRescaler* const wrk) {
static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
@ -433,8 +433,8 @@ static void RescalerExportRowShrink(WebPRescaler* const wrk) {
extern void WebPRescalerDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
WebPRescalerExportRowExpand = RescalerExportRowExpand;
WebPRescalerExportRowShrink = RescalerExportRowShrink;
WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2;
WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2;
}
#else // !WEBP_USE_MSA

View file

@ -11,14 +11,14 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#if defined(WEBP_USE_NEON) && !defined(WEBP_REDUCE_SIZE)
#include <arm_neon.h>
#include <assert.h>
#include "./neon.h"
#include "../utils/rescaler_utils.h"
#include "src/dsp/neon.h"
#include "src/utils/rescaler_utils.h"
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX_C(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
@ -41,9 +41,9 @@
#error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
#endif
static uint32x4_t Interpolate(const rescaler_t* const frow,
const rescaler_t* const irow,
uint32_t A, uint32_t B) {
static uint32x4_t Interpolate_NEON(const rescaler_t* const frow,
const rescaler_t* const irow,
uint32_t A, uint32_t B) {
LOAD_32x4(frow, A0);
LOAD_32x4(irow, B0);
const uint64x2_t C0 = vmull_n_u32(vget_low_u32(A0), A);
@ -56,7 +56,7 @@ static uint32x4_t Interpolate(const rescaler_t* const frow,
return E;
}
static void RescalerExportRowExpand(WebPRescaler* const wrk) {
static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@ -91,9 +91,9 @@ static void RescalerExportRowExpand(WebPRescaler* const wrk) {
const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
for (x_out = 0; x_out < max_span; x_out += 8) {
const uint32x4_t C0 =
Interpolate(frow + x_out + 0, irow + x_out + 0, A, B);
Interpolate_NEON(frow + x_out + 0, irow + x_out + 0, A, B);
const uint32x4_t C1 =
Interpolate(frow + x_out + 4, irow + x_out + 4, A, B);
Interpolate_NEON(frow + x_out + 4, irow + x_out + 4, A, B);
const uint32x4_t D0 = MULT_FIX(C0, fy_scale_half);
const uint32x4_t D1 = MULT_FIX(C1, fy_scale_half);
const uint16x4_t E0 = vmovn_u32(D0);
@ -112,7 +112,7 @@ static void RescalerExportRowExpand(WebPRescaler* const wrk) {
}
}
static void RescalerExportRowShrink(WebPRescaler* const wrk) {
static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@ -175,8 +175,8 @@ static void RescalerExportRowShrink(WebPRescaler* const wrk) {
extern void WebPRescalerDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitNEON(void) {
WebPRescalerExportRowExpand = RescalerExportRowExpand;
WebPRescalerExportRowShrink = RescalerExportRowShrink;
WebPRescalerExportRowExpand = RescalerExportRowExpand_NEON;
WebPRescalerExportRowShrink = RescalerExportRowShrink_NEON;
}
#else // !WEBP_USE_NEON

View file

@ -11,14 +11,14 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#if defined(WEBP_USE_SSE2) && !defined(WEBP_REDUCE_SIZE)
#include <emmintrin.h>
#include <assert.h>
#include "../utils/rescaler_utils.h"
#include "../utils/utils.h"
#include "src/utils/rescaler_utils.h"
#include "src/utils/utils.h"
//------------------------------------------------------------------------------
// Implementations of critical functions ImportRow / ExportRow
@ -27,7 +27,7 @@
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
// input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0
static void LoadTwoPixels(const uint8_t* const src, __m128i* out) {
static void LoadTwoPixels_SSE2(const uint8_t* const src, __m128i* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
const __m128i B = _mm_unpacklo_epi8(A, zero); // A0B0C0D0E0F0G0H0
@ -36,14 +36,14 @@ static void LoadTwoPixels(const uint8_t* const src, __m128i* out) {
}
// input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0
static void LoadHeightPixels(const uint8_t* const src, __m128i* out) {
static void LoadHeightPixels_SSE2(const uint8_t* const src, __m128i* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
*out = _mm_unpacklo_epi8(A, zero);
}
static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
const uint8_t* src) {
static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
const uint8_t* src) {
rescaler_t* frow = wrk->frow;
const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
const int x_add = wrk->x_add;
@ -54,10 +54,10 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
assert(wrk->x_expand);
if (wrk->num_channels == 4) {
if (wrk->src_width < 2) {
WebPRescalerImportRowExpandC(wrk, src);
WebPRescalerImportRowExpand_C(wrk, src);
return;
}
LoadTwoPixels(src, &cur_pixels);
LoadTwoPixels_SSE2(src, &cur_pixels);
src += 4;
while (1) {
const __m128i mult = _mm_set1_epi32(((x_add - accum) << 16) | accum);
@ -67,7 +67,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
if (frow >= frow_end) break;
accum -= wrk->x_sub;
if (accum < 0) {
LoadTwoPixels(src, &cur_pixels);
LoadTwoPixels_SSE2(src, &cur_pixels);
src += 4;
accum += x_add;
}
@ -76,10 +76,10 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
int left;
const uint8_t* const src_limit = src + wrk->src_width - 8;
if (wrk->src_width < 8) {
WebPRescalerImportRowExpandC(wrk, src);
WebPRescalerImportRowExpand_C(wrk, src);
return;
}
LoadHeightPixels(src, &cur_pixels);
LoadHeightPixels_SSE2(src, &cur_pixels);
src += 7;
left = 7;
while (1) {
@ -94,7 +94,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
if (--left) {
cur_pixels = _mm_srli_si128(cur_pixels, 2);
} else if (src <= src_limit) {
LoadHeightPixels(src, &cur_pixels);
LoadHeightPixels_SSE2(src, &cur_pixels);
src += 7;
left = 7;
} else { // tail
@ -110,8 +110,8 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
assert(accum == 0);
}
static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
const uint8_t* src) {
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_sub = wrk->x_sub;
int accum = 0;
const __m128i zero = _mm_setzero_si128();
@ -123,7 +123,7 @@ static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;
if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
WebPRescalerImportRowShrinkC(wrk, src);
WebPRescalerImportRowShrink_C(wrk, src);
return;
}
assert(!WebPRescalerInputDone(wrk));
@ -169,12 +169,12 @@ static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
// Row export
// load *src as epi64, multiply by mult and store result in [out0 ... out3]
static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src,
const __m128i* const mult,
__m128i* const out0,
__m128i* const out1,
__m128i* const out2,
__m128i* const out3) {
static WEBP_INLINE void LoadDispatchAndMult_SSE2(const rescaler_t* const src,
const __m128i* const mult,
__m128i* const out0,
__m128i* const out1,
__m128i* const out2,
__m128i* const out3) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0));
const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4));
const __m128i A2 = _mm_srli_epi64(A0, 32);
@ -192,12 +192,12 @@ static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src,
}
}
static WEBP_INLINE void ProcessRow(const __m128i* const A0,
const __m128i* const A1,
const __m128i* const A2,
const __m128i* const A3,
const __m128i* const mult,
uint8_t* const dst) {
static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0,
const __m128i* const A1,
const __m128i* const A2,
const __m128i* const A3,
const __m128i* const mult,
uint8_t* const dst) {
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
const __m128i B0 = _mm_mul_epu32(*A0, *mult);
@ -210,7 +210,7 @@ static WEBP_INLINE void ProcessRow(const __m128i* const A0,
const __m128i C3 = _mm_add_epi64(B3, rounder);
const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX);
const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
#if (WEBP_RESCALER_FIX < 32)
#if (WEBP_RESCALER_RFIX < 32)
const __m128i D2 =
_mm_and_si128(_mm_slli_epi64(C2, 32 - WEBP_RESCALER_RFIX), mask);
const __m128i D3 =
@ -226,7 +226,7 @@ static WEBP_INLINE void ProcessRow(const __m128i* const A0,
_mm_storel_epi64((__m128i*)dst, G);
}
static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@ -240,8 +240,8 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
if (wrk->y_accum == 0) {
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3;
LoadDispatchAndMult(frow + x_out, NULL, &A0, &A1, &A2, &A3);
ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
LoadDispatchAndMult_SSE2(frow + x_out, NULL, &A0, &A1, &A2, &A3);
ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out);
}
for (; x_out < x_out_max; ++x_out) {
const uint32_t J = frow[x_out];
@ -257,8 +257,8 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3, B0, B1, B2, B3;
LoadDispatchAndMult(frow + x_out, &mA, &A0, &A1, &A2, &A3);
LoadDispatchAndMult(irow + x_out, &mB, &B0, &B1, &B2, &B3);
LoadDispatchAndMult_SSE2(frow + x_out, &mA, &A0, &A1, &A2, &A3);
LoadDispatchAndMult_SSE2(irow + x_out, &mB, &B0, &B1, &B2, &B3);
{
const __m128i C0 = _mm_add_epi64(A0, B0);
const __m128i C1 = _mm_add_epi64(A1, B1);
@ -272,7 +272,7 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
const __m128i E1 = _mm_srli_epi64(D1, WEBP_RESCALER_RFIX);
const __m128i E2 = _mm_srli_epi64(D2, WEBP_RESCALER_RFIX);
const __m128i E3 = _mm_srli_epi64(D3, WEBP_RESCALER_RFIX);
ProcessRow(&E0, &E1, &E2, &E3, &mult, dst + x_out);
ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult, dst + x_out);
}
}
for (; x_out < x_out_max; ++x_out) {
@ -286,7 +286,7 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
}
}
static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
@ -303,8 +303,8 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3, B0, B1, B2, B3;
LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
LoadDispatchAndMult(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3);
LoadDispatchAndMult_SSE2(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
{
const __m128i C0 = _mm_add_epi64(B0, rounder);
const __m128i C1 = _mm_add_epi64(B1, rounder);
@ -324,7 +324,7 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
const __m128i G1 = _mm_or_si128(D1, F3);
_mm_storeu_si128((__m128i*)(irow + x_out + 0), G0);
_mm_storeu_si128((__m128i*)(irow + x_out + 4), G1);
ProcessRow(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
}
}
for (; x_out < x_out_max; ++x_out) {
@ -340,10 +340,10 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
const __m128i zero = _mm_setzero_si128();
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3;
LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3);
_mm_storeu_si128((__m128i*)(irow + x_out + 0), zero);
_mm_storeu_si128((__m128i*)(irow + x_out + 4), zero);
ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out);
}
for (; x_out < x_out_max; ++x_out) {
const int v = (int)MULT_FIX(irow[x_out], scale);
@ -362,10 +362,10 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
extern void WebPRescalerDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) {
WebPRescalerImportRowExpand = RescalerImportRowExpandSSE2;
WebPRescalerImportRowShrink = RescalerImportRowShrinkSSE2;
WebPRescalerExportRowExpand = RescalerExportRowExpandSSE2;
WebPRescalerExportRowShrink = RescalerExportRowShrinkSSE2;
WebPRescalerImportRowExpand = RescalerImportRowExpand_SSE2;
WebPRescalerImportRowShrink = RescalerImportRowShrink_SSE2;
WebPRescalerExportRowExpand = RescalerExportRowExpand_SSE2;
WebPRescalerExportRowShrink = RescalerExportRowShrink_SSE2;
}
#else // !WEBP_USE_SSE2

166
thirdparty/libwebp/src/dsp/ssim.c vendored Normal file
View file

@ -0,0 +1,166 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// distortion calculation
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h> // for abs()
#include "src/dsp/dsp.h"
#if !defined(WEBP_REDUCE_SIZE)
//------------------------------------------------------------------------------
// SSIM / PSNR
// hat-shaped filter. Sum of coefficients is equal to 16.
static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
1, 2, 3, 4, 3, 2, 1
};
static const uint32_t kWeightSum = 16 * 16; // sum{kWeight}^2
static WEBP_INLINE double SSIMCalculation(
const VP8DistoStats* const stats, uint32_t N /*num samples*/) {
const uint32_t w2 = N * N;
const uint32_t C1 = 20 * w2;
const uint32_t C2 = 60 * w2;
const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6
const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
if (xmxm + ymym >= C3) {
const int64_t xmym = (int64_t)stats->xm * stats->ym;
const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative
const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
const uint64_t syy = (uint64_t)stats->yym * N - ymym;
// we descale by 8 to prevent overflow during the fnum/fden multiply.
const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
const uint64_t den_S = (sxx + syy + C2) >> 8;
const uint64_t fnum = (2 * xmym + C1) * num_S;
const uint64_t fden = (xmxm + ymym + C1) * den_S;
const double r = (double)fnum / fden;
assert(r >= 0. && r <= 1.0);
return r;
}
return 1.; // area is too dark to contribute meaningfully
}
double VP8SSIMFromStats(const VP8DistoStats* const stats) {
return SSIMCalculation(stats, kWeightSum);
}
double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
return SSIMCalculation(stats, stats->w);
}
static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2,
int xo, int yo, int W, int H) {
VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
: yo + VP8_SSIM_KERNEL;
const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
: xo + VP8_SSIM_KERNEL;
int x, y;
src1 += ymin * stride1;
src2 += ymin * stride2;
for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
for (x = xmin; x <= xmax; ++x) {
const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
* kWeight[VP8_SSIM_KERNEL + y - yo];
const uint32_t s1 = src1[x];
const uint32_t s2 = src2[x];
stats.w += w;
stats.xm += w * s1;
stats.ym += w * s2;
stats.xxm += w * s1 * s1;
stats.xym += w * s1 * s2;
stats.yym += w * s2 * s2;
}
}
return VP8SSIMFromStatsClipped(&stats);
}
static double SSIMGet_C(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2) {
VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
int x, y;
for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
const uint32_t w = kWeight[x] * kWeight[y];
const uint32_t s1 = src1[x];
const uint32_t s2 = src2[x];
stats.xm += w * s1;
stats.ym += w * s2;
stats.xxm += w * s1 * s1;
stats.xym += w * s1 * s2;
stats.yym += w * s2 * s2;
}
}
return VP8SSIMFromStats(&stats);
}
#endif // !defined(WEBP_REDUCE_SIZE)
//------------------------------------------------------------------------------
#if !defined(WEBP_DISABLE_STATS)
static uint32_t AccumulateSSE_C(const uint8_t* src1,
const uint8_t* src2, int len) {
int i;
uint32_t sse2 = 0;
assert(len <= 65535); // to ensure that accumulation fits within uint32_t
for (i = 0; i < len; ++i) {
const int32_t diff = src1[i] - src2[i];
sse2 += diff * diff;
}
return sse2;
}
#endif
//------------------------------------------------------------------------------
#if !defined(WEBP_REDUCE_SIZE)
VP8SSIMGetFunc VP8SSIMGet;
VP8SSIMGetClippedFunc VP8SSIMGetClipped;
#endif
#if !defined(WEBP_DISABLE_STATS)
VP8AccumulateSSEFunc VP8AccumulateSSE;
#endif
extern void VP8SSIMDspInitSSE2(void);
static volatile VP8CPUInfo ssim_last_cpuinfo_used =
(VP8CPUInfo)&ssim_last_cpuinfo_used;
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
#if !defined(WEBP_REDUCE_SIZE)
VP8SSIMGetClipped = SSIMGetClipped_C;
VP8SSIMGet = SSIMGet_C;
#endif
#if !defined(WEBP_DISABLE_STATS)
VP8AccumulateSSE = AccumulateSSE_C;
#endif
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8SSIMDspInitSSE2();
}
#endif
}
ssim_last_cpuinfo_used = VP8GetCPUInfo;
}

165
thirdparty/libwebp/src/dsp/ssim_sse2.c vendored Normal file
View file

@ -0,0 +1,165 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of distortion calculation
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include "src/dsp/common_sse2.h"
#if !defined(WEBP_DISABLE_STATS)
// Helper function
static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b,
__m128i* const sum) {
// take abs(a-b) in 8b
const __m128i a_b = _mm_subs_epu8(a, b);
const __m128i b_a = _mm_subs_epu8(b, a);
const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
// zero-extend to 16b
const __m128i zero = _mm_setzero_si128();
const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
// multiply with self
const __m128i sum1 = _mm_madd_epi16(C0, C0);
const __m128i sum2 = _mm_madd_epi16(C1, C1);
*sum = _mm_add_epi32(sum1, sum2);
}
//------------------------------------------------------------------------------
// SSIM / PSNR entry point
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
const uint8_t* src2, int len) {
int i = 0;
uint32_t sse2 = 0;
if (len >= 16) {
const int limit = len - 32;
int32_t tmp[4];
__m128i sum1;
__m128i sum = _mm_setzero_si128();
__m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
__m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
i += 16;
while (i <= limit) {
const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
__m128i sum2;
i += 16;
SubtractAndSquare_SSE2(a0, b0, &sum1);
sum = _mm_add_epi32(sum, sum1);
a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
i += 16;
SubtractAndSquare_SSE2(a1, b1, &sum2);
sum = _mm_add_epi32(sum, sum2);
}
SubtractAndSquare_SSE2(a0, b0, &sum1);
sum = _mm_add_epi32(sum, sum1);
_mm_storeu_si128((__m128i*)tmp, sum);
sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
for (; i < len; ++i) {
const int32_t diff = src1[i] - src2[i];
sse2 += diff * diff;
}
return sse2;
}
#endif // !defined(WEBP_DISABLE_STATS)
#if !defined(WEBP_REDUCE_SIZE)
static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
uint16_t tmp[8];
const __m128i a = _mm_srli_si128(*m, 8);
const __m128i b = _mm_add_epi16(*m, a);
_mm_storeu_si128((__m128i*)tmp, b);
return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
}
static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
const __m128i a = _mm_srli_si128(*m, 8);
const __m128i b = _mm_add_epi32(*m, a);
const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
return (uint32_t)_mm_cvtsi128_si32(c);
}
static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
#define ACCUMULATE_ROW(WEIGHT) do { \
/* compute row weight (Wx * Wy) */ \
const __m128i Wy = _mm_set1_epi16((WEIGHT)); \
const __m128i W = _mm_mullo_epi16(Wx, Wy); \
/* process 8 bytes at a time (7 bytes, actually) */ \
const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
/* convert to 16b and multiply by weight */ \
const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \
const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \
const __m128i wa1 = _mm_mullo_epi16(a1, W); \
const __m128i wb1 = _mm_mullo_epi16(b1, W); \
/* accumulate */ \
xm = _mm_add_epi16(xm, wa1); \
ym = _mm_add_epi16(ym, wb1); \
xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \
xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \
yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \
src1 += stride1; \
src2 += stride2; \
} while (0)
static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2) {
VP8DistoStats stats;
const __m128i zero = _mm_setzero_si128();
__m128i xm = zero, ym = zero; // 16b accums
__m128i xxm = zero, yym = zero, xym = zero; // 32b accum
const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
assert(2 * VP8_SSIM_KERNEL + 1 == 7);
ACCUMULATE_ROW(1);
ACCUMULATE_ROW(2);
ACCUMULATE_ROW(3);
ACCUMULATE_ROW(4);
ACCUMULATE_ROW(3);
ACCUMULATE_ROW(2);
ACCUMULATE_ROW(1);
stats.xm = HorizontalAdd16b_SSE2(&xm);
stats.ym = HorizontalAdd16b_SSE2(&ym);
stats.xxm = HorizontalAdd32b_SSE2(&xxm);
stats.xym = HorizontalAdd32b_SSE2(&xym);
stats.yym = HorizontalAdd32b_SSE2(&yym);
return VP8SSIMFromStats(&stats);
}
#endif // !defined(WEBP_REDUCE_SIZE)
extern void VP8SSIMDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
#if !defined(WEBP_DISABLE_STATS)
VP8AccumulateSSE = AccumulateSSE_SSE2;
#endif
#if !defined(WEBP_REDUCE_SIZE)
VP8SSIMGet = SSIMGet_SSE2;
#endif
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
#endif // WEBP_USE_SSE2

View file

@ -11,8 +11,8 @@
//
// Author: somnath@google.com (Somnath Banerjee)
#include "./dsp.h"
#include "./yuv.h"
#include "src/dsp/dsp.h"
#include "src/dsp/yuv.h"
#include <assert.h>
@ -63,17 +63,17 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \
const uint32_t uv1 = (diag_03 + t_uv) >> 1; \
FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
top_dst + (2 * x - 1) * XSTEP); \
top_dst + (2 * x - 1) * (XSTEP)); \
FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \
top_dst + (2 * x - 0) * XSTEP); \
top_dst + (2 * x - 0) * (XSTEP)); \
} \
if (bottom_y != NULL) { \
const uint32_t uv0 = (diag_03 + l_uv) >> 1; \
const uint32_t uv1 = (diag_12 + uv) >> 1; \
FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
bottom_dst + (2 * x - 1) * XSTEP); \
bottom_dst + (2 * x - 1) * (XSTEP)); \
FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \
bottom_dst + (2 * x + 0) * XSTEP); \
bottom_dst + (2 * x + 0) * (XSTEP)); \
} \
tl_uv = t_uv; \
l_uv = uv; \
@ -82,24 +82,50 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
{ \
const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
top_dst + (len - 1) * XSTEP); \
top_dst + (len - 1) * (XSTEP)); \
} \
if (bottom_y != NULL) { \
const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
bottom_dst + (len - 1) * XSTEP); \
bottom_dst + (len - 1) * (XSTEP)); \
} \
} \
}
// All variants implemented.
UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2)
#if !WEBP_NEON_OMIT_C_CODE
UPSAMPLE_FUNC(UpsampleRgbaLinePair_C, VP8YuvToRgba, 4)
UPSAMPLE_FUNC(UpsampleBgraLinePair_C, VP8YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
UPSAMPLE_FUNC(UpsampleArgbLinePair_C, VP8YuvToArgb, 4)
UPSAMPLE_FUNC(UpsampleRgbLinePair_C, VP8YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair_C, VP8YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair_C, VP8YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair_C, VP8YuvToRgb565, 2)
#else
static void EmptyUpsampleFunc(const uint8_t* top_y, const uint8_t* bottom_y,
const uint8_t* top_u, const uint8_t* top_v,
const uint8_t* cur_u, const uint8_t* cur_v,
uint8_t* top_dst, uint8_t* bottom_dst, int len) {
(void)top_y;
(void)bottom_y;
(void)top_u;
(void)top_v;
(void)cur_u;
(void)cur_v;
(void)top_dst;
(void)bottom_dst;
(void)len;
assert(0); // COLORSPACE SUPPORT NOT COMPILED
}
#define UpsampleArgbLinePair_C EmptyUpsampleFunc
#define UpsampleRgbLinePair_C EmptyUpsampleFunc
#define UpsampleBgrLinePair_C EmptyUpsampleFunc
#define UpsampleRgba4444LinePair_C EmptyUpsampleFunc
#define UpsampleRgb565LinePair_C EmptyUpsampleFunc
#endif // WEBP_REDUCE_CSP
#endif
#undef LOAD_UV
#undef UPSAMPLE_FUNC
@ -141,7 +167,6 @@ DUAL_SAMPLE_FUNC(DualLineSamplerARGB, VP8YuvToArgb)
WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
WebPInitUpsamplers();
VP8YUVInit();
#ifdef FANCY_UPSAMPLING
return WebPUpsamplers[alpha_is_last ? MODE_BGRA : MODE_ARGB];
#else
@ -158,16 +183,33 @@ extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
int i; \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]); \
}
YUV444_FUNC(WebPYuv444ToRgbC, VP8YuvToRgb, 3)
YUV444_FUNC(WebPYuv444ToBgrC, VP8YuvToBgr, 3)
YUV444_FUNC(WebPYuv444ToRgbaC, VP8YuvToRgba, 4)
YUV444_FUNC(WebPYuv444ToBgraC, VP8YuvToBgra, 4)
YUV444_FUNC(WebPYuv444ToArgbC, VP8YuvToArgb, 4)
YUV444_FUNC(WebPYuv444ToRgba4444C, VP8YuvToRgba4444, 2)
YUV444_FUNC(WebPYuv444ToRgb565C, VP8YuvToRgb565, 2)
YUV444_FUNC(WebPYuv444ToRgba_C, VP8YuvToRgba, 4)
YUV444_FUNC(WebPYuv444ToBgra_C, VP8YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
YUV444_FUNC(WebPYuv444ToRgb_C, VP8YuvToRgb, 3)
YUV444_FUNC(WebPYuv444ToBgr_C, VP8YuvToBgr, 3)
YUV444_FUNC(WebPYuv444ToArgb_C, VP8YuvToArgb, 4)
YUV444_FUNC(WebPYuv444ToRgba4444_C, VP8YuvToRgba4444, 2)
YUV444_FUNC(WebPYuv444ToRgb565_C, VP8YuvToRgb565, 2)
#else
static void EmptyYuv444Func(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
(void)y;
(void)u;
(void)v;
(void)dst;
(void)len;
}
#define WebPYuv444ToRgb_C EmptyYuv444Func
#define WebPYuv444ToBgr_C EmptyYuv444Func
#define WebPYuv444ToArgb_C EmptyYuv444Func
#define WebPYuv444ToRgba4444_C EmptyYuv444Func
#define WebPYuv444ToRgb565_C EmptyYuv444Func
#endif // WEBP_REDUCE_CSP
#undef YUV444_FUNC
@ -182,17 +224,17 @@ static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 =
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
if (upsampling_last_cpuinfo_used1 == VP8GetCPUInfo) return;
WebPYUV444Converters[MODE_RGB] = WebPYuv444ToRgbC;
WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgbaC;
WebPYUV444Converters[MODE_BGR] = WebPYuv444ToBgrC;
WebPYUV444Converters[MODE_BGRA] = WebPYuv444ToBgraC;
WebPYUV444Converters[MODE_ARGB] = WebPYuv444ToArgbC;
WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444C;
WebPYUV444Converters[MODE_RGB_565] = WebPYuv444ToRgb565C;
WebPYUV444Converters[MODE_rgbA] = WebPYuv444ToRgbaC;
WebPYUV444Converters[MODE_bgrA] = WebPYuv444ToBgraC;
WebPYUV444Converters[MODE_Argb] = WebPYuv444ToArgbC;
WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444C;
WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgba_C;
WebPYUV444Converters[MODE_BGRA] = WebPYuv444ToBgra_C;
WebPYUV444Converters[MODE_RGB] = WebPYuv444ToRgb_C;
WebPYUV444Converters[MODE_BGR] = WebPYuv444ToBgr_C;
WebPYUV444Converters[MODE_ARGB] = WebPYuv444ToArgb_C;
WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444_C;
WebPYUV444Converters[MODE_RGB_565] = WebPYuv444ToRgb565_C;
WebPYUV444Converters[MODE_rgbA] = WebPYuv444ToRgba_C;
WebPYUV444Converters[MODE_bgrA] = WebPYuv444ToBgra_C;
WebPYUV444Converters[MODE_Argb] = WebPYuv444ToArgb_C;
WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C;
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
@ -224,17 +266,19 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;
#ifdef FANCY_UPSAMPLING
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
#if !WEBP_NEON_OMIT_C_CODE
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_C;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_C;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_C;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_C;
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_C;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_C;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_C;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_C;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_C;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_C;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_C;
#endif
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@ -243,11 +287,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
WebPInitUpsamplersSSE2();
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
WebPInitUpsamplersNEON();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPInitUpsamplersMIPSdspR2();
@ -259,6 +298,26 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitUpsamplersNEON();
}
#endif
assert(WebPUpsamplers[MODE_RGBA] != NULL);
assert(WebPUpsamplers[MODE_BGRA] != NULL);
assert(WebPUpsamplers[MODE_rgbA] != NULL);
assert(WebPUpsamplers[MODE_bgrA] != NULL);
assert(WebPUpsamplers[MODE_RGB] != NULL);
assert(WebPUpsamplers[MODE_BGR] != NULL);
assert(WebPUpsamplers[MODE_ARGB] != NULL);
assert(WebPUpsamplers[MODE_RGBA_4444] != NULL);
assert(WebPUpsamplers[MODE_RGB_565] != NULL);
assert(WebPUpsamplers[MODE_Argb] != NULL);
assert(WebPUpsamplers[MODE_rgbA_4444] != NULL);
#endif // FANCY_UPSAMPLING
upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
}

View file

@ -12,14 +12,12 @@
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include <assert.h>
#include "./yuv.h"
#if !defined(WEBP_YUV_USE_TABLE)
#include "src/dsp/yuv.h"
#define YUV_TO_RGB(Y, U, V, R, G, B) do { \
const int t1 = MultHi(Y, 19077); \
@ -48,6 +46,7 @@
); \
} while (0)
#if !defined(WEBP_REDUCE_CSP)
static WEBP_INLINE void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
int r, g, b;
YUV_TO_RGB(y, u, v, r, g, b);
@ -68,7 +67,7 @@ static WEBP_INLINE void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
{
const int rg = (r & 0xf8) | (g >> 5);
const int gb = ((g << 3) & 0xe0) | (b >> 3);
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
rgb[0] = gb;
rgb[1] = rg;
#else
@ -84,7 +83,7 @@ static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
{
const int rg = (r & 0xf0) | (g >> 4);
const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
argb[0] = ba;
argb[1] = rg;
#else
@ -93,11 +92,12 @@ static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
#endif
}
}
#endif // WEBP_YUV_USE_TABLE
#endif // WEBP_REDUCE_CSP
//-----------------------------------------------------------------------------
// Alpha handling variants
#if !defined(WEBP_REDUCE_CSP)
static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const argb) {
int r, g, b;
@ -107,6 +107,7 @@ static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
argb[2] = g;
argb[3] = b;
}
#endif // WEBP_REDUCE_CSP
static WEBP_INLINE void YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const bgra) {
int r, g, b;
@ -200,13 +201,15 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
}
// All variants implemented.
UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
#endif // WEBP_REDUCE_CSP
#undef LOAD_UV
#undef UPSAMPLE_FUNC
@ -217,17 +220,19 @@ UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
extern void WebPInitUpsamplersMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
#if !defined(WEBP_REDUCE_CSP)
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING
@ -242,13 +247,15 @@ static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
}
YUV444_FUNC(Yuv444ToRgb, YuvToRgb, 3)
YUV444_FUNC(Yuv444ToBgr, YuvToBgr, 3)
YUV444_FUNC(Yuv444ToRgba, YuvToRgba, 4)
YUV444_FUNC(Yuv444ToBgra, YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
YUV444_FUNC(Yuv444ToRgb, YuvToRgb, 3)
YUV444_FUNC(Yuv444ToBgr, YuvToBgr, 3)
YUV444_FUNC(Yuv444ToArgb, YuvToArgb, 4)
YUV444_FUNC(Yuv444ToRgba4444, YuvToRgba4444, 2)
YUV444_FUNC(Yuv444ToRgb565, YuvToRgb565, 2)
#endif // WEBP_REDUCE_CSP
#undef YUV444_FUNC
@ -258,17 +265,19 @@ YUV444_FUNC(Yuv444ToRgb565, YuvToRgb565, 2)
extern void WebPInitYUV444ConvertersMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersMIPSdspR2(void) {
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba;
WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra;
#if !defined(WEBP_REDUCE_CSP)
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
WebPYUV444Converters[MODE_ARGB] = Yuv444ToArgb;
WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444;
WebPYUV444Converters[MODE_RGB_565] = Yuv444ToRgb565;
WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba;
WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra;
WebPYUV444Converters[MODE_Argb] = Yuv444ToArgb;
WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444;
#endif // WEBP_REDUCE_CSP
}
#else // !WEBP_USE_MIPS_DSP_R2

View file

@ -12,12 +12,12 @@
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include <string.h>
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MSA)
#include "./msa_macro.h"
#include "./yuv.h"
#include "src/dsp/msa_macro.h"
#include "src/dsp/yuv.h"
#ifdef FANCY_UPSAMPLING
@ -274,7 +274,7 @@ static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
const int b = Clip8(b1 >> 6);
const int rg = (r & 0xf8) | (g >> 5);
const int gb = ((g << 3) & 0xe0) | (b >> 3);
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
rgb[0] = gb;
rgb[1] = rg;
#else
@ -293,7 +293,7 @@ static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) {
const int b = Clip8(b1 >> 6);
const int rg = (r & 0xf0) | (g >> 4);
const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
argb[0] = ba;
argb[1] = rg;
#else
@ -374,7 +374,7 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(0xff);
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
STORE16_4(R, G, B, A, dst);
@ -402,7 +402,7 @@ static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(0xff);
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
STORE16_4(B, G, R, A, dst);
@ -430,7 +430,7 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(0xff);
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
STORE16_4(A, R, G, B, dst);
@ -459,11 +459,11 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B, RG, BA, tmp0, tmp1;
while (length >= 16) {
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGBA4444(y, u, v, BA, RG, 16, dst);
#else
#else
CALC_RGBA4444(y, u, v, RG, BA, 16, dst);
#endif
#endif
y += 16;
u += 16;
v += 16;
@ -473,7 +473,7 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
if (length > 8) {
uint8_t temp[2 * 16] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGBA4444(temp, u, v, BA, RG, 16, temp);
#else
CALC_RGBA4444(temp, u, v, RG, BA, 16, temp);
@ -482,7 +482,7 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
} else if (length > 0) {
uint8_t temp[2 * 8] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGBA4444(temp, u, v, BA, RG, 8, temp);
#else
CALC_RGBA4444(temp, u, v, RG, BA, 8, temp);
@ -495,11 +495,11 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B, RG, GB, tmp0, tmp1;
while (length >= 16) {
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGB565(y, u, v, GB, RG, 16, dst);
#else
#else
CALC_RGB565(y, u, v, RG, GB, 16, dst);
#endif
#endif
y += 16;
u += 16;
v += 16;
@ -509,7 +509,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
if (length > 8) {
uint8_t temp[2 * 16] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGB565(temp, u, v, GB, RG, 16, temp);
#else
CALC_RGB565(temp, u, v, RG, GB, 16, temp);
@ -518,7 +518,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
} else if (length > 0) {
uint8_t temp[2 * 8] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGB565(temp, u, v, GB, RG, 8, temp);
#else
CALC_RGB565(temp, u, v, RG, GB, 8, temp);
@ -640,13 +640,15 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
} \
}
UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
#endif // WEBP_REDUCE_CSP
//------------------------------------------------------------------------------
// Entry point
@ -656,17 +658,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern void WebPInitUpsamplersMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) {
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
#if !defined(WEBP_REDUCE_CSP)
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING

View file

@ -12,15 +12,15 @@
// Author: mans@mansr.com (Mans Rullgard)
// Based on SSE code by: somnath@google.com (Somnath Banerjee)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include <arm_neon.h>
#include <string.h>
#include "./neon.h"
#include "./yuv.h"
#include "src/dsp/neon.h"
#include "src/dsp/yuv.h"
#ifdef FANCY_UPSAMPLING
@ -58,8 +58,8 @@
} while (0)
// Turn the macro into a function for reducing code-size when non-critical
static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
uint8_t *out) {
static void Upsample16Pixels_NEON(const uint8_t *r1, const uint8_t *r2,
uint8_t *out) {
UPSAMPLE_16PIXELS(r1, r2, out);
}
@ -70,7 +70,7 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
/* replicate last byte */ \
memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels)); \
memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels)); \
Upsample16Pixels(r1, r2, out); \
Upsample16Pixels_NEON(r1, r2, out); \
}
//-----------------------------------------------------------------------------
@ -243,13 +243,15 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \
}
// NEON variants of the fancy upsampler.
NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair, Rgb, 3)
NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair, Bgr, 3)
NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair, Rgba, 4)
NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair, Bgra, 4)
NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair, Argb, 4)
NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, Rgba4444, 2)
NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair, Rgb565, 2)
NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair_NEON, Rgba, 4)
NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair_NEON, Bgra, 4)
#if !defined(WEBP_REDUCE_CSP)
NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair_NEON, Rgb, 3)
NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair_NEON, Bgr, 3)
NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair_NEON, Argb, 4)
NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_NEON, Rgba4444, 2)
NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair_NEON, Rgb565, 2)
#endif // WEBP_REDUCE_CSP
//------------------------------------------------------------------------------
// Entry point
@ -259,17 +261,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern void WebPInitUpsamplersNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersNEON(void) {
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_NEON;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_NEON;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_NEON;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_NEON;
#if !defined(WEBP_REDUCE_CSP)
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_NEON;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_NEON;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_NEON;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_NEON;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_NEON;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_NEON;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_NEON;
#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING

View file

@ -11,14 +11,14 @@
//
// Author: somnath@google.com (Somnath Banerjee)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include <string.h>
#include "./yuv.h"
#include "src/dsp/yuv.h"
#ifdef FANCY_UPSAMPLING
@ -83,13 +83,13 @@
GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \
\
/* pack the alternate pixels */ \
PACK_AND_STORE(a, b, diag1, diag2, out + 0); /* store top */ \
PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32); /* store bottom */ \
PACK_AND_STORE(a, b, diag1, diag2, (out) + 0); /* store top */ \
PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32); /* store bottom */ \
}
// Turn the macro into a function for reducing code-size when non-critical
static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
uint8_t* const out) {
static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
uint8_t* const out) {
UPSAMPLE_32PIXELS(r1, r2, out);
}
@ -101,30 +101,30 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels)); \
memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels)); \
/* using the shared function instead of the macro saves ~3k code size */ \
Upsample32Pixels(r1, r2, out); \
Upsample32Pixels_SSE2(r1, r2, out); \
}
#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, \
top_dst, bottom_dst, cur_x, num_pixels) { \
int n; \
for (n = 0; n < (num_pixels); ++n) { \
FUNC(top_y[(cur_x) + n], r_u[n], r_v[n], \
top_dst + ((cur_x) + n) * XSTEP); \
FUNC((top_y)[(cur_x) + n], r_u[n], r_v[n], \
(top_dst) + ((cur_x) + n) * (XSTEP)); \
} \
if (bottom_y != NULL) { \
if ((bottom_y) != NULL) { \
for (n = 0; n < (num_pixels); ++n) { \
FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n], \
bottom_dst + ((cur_x) + n) * XSTEP); \
FUNC((bottom_y)[(cur_x) + n], r_u[64 + n], r_v[64 + n], \
(bottom_dst) + ((cur_x) + n) * (XSTEP)); \
} \
} \
}
#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \
top_dst, bottom_dst, cur_x) do { \
FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP); \
if (bottom_y != NULL) { \
FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64, \
bottom_dst + (cur_x) * XSTEP); \
FUNC##32_SSE2((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP)); \
if ((bottom_y) != NULL) { \
FUNC##32_SSE2((bottom_y) + (cur_x), r_u + 64, r_v + 64, \
(bottom_dst) + (cur_x) * (XSTEP)); \
} \
} while (0)
@ -169,13 +169,16 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
}
// SSE2 variants of the fancy upsampler.
SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3)
SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3)
SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2)
SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair_SSE2, VP8YuvToRgba, 4)
SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair_SSE2, VP8YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair_SSE2, VP8YuvToRgb, 3)
SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair_SSE2, VP8YuvToBgr, 3)
SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair_SSE2, VP8YuvToArgb, 4)
SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_SSE2, VP8YuvToRgba4444, 2)
SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair_SSE2, VP8YuvToRgb565, 2)
#endif // WEBP_REDUCE_CSP
#undef GET_M
#undef PACK_AND_STORE
@ -193,17 +196,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern void WebPInitUpsamplersSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) {
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_SSE2;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_SSE2;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_SSE2;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_SSE2;
#if !defined(WEBP_REDUCE_CSP)
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_SSE2;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_SSE2;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_SSE2;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_SSE2;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_SSE2;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_SSE2;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_SSE2;
#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING
@ -213,29 +218,46 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) {
extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
extern void WebPInitYUV444ConvertersSSE2(void);
#define YUV444_FUNC(FUNC_NAME, CALL, XSTEP) \
extern void WebP##FUNC_NAME##C(const uint8_t* y, const uint8_t* u, \
const uint8_t* v, uint8_t* dst, int len); \
#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \
extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len); \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
int i; \
const int max_len = len & ~31; \
for (i = 0; i < max_len; i += 32) CALL(y + i, u + i, v + i, dst + i * XSTEP);\
for (i = 0; i < max_len; i += 32) { \
CALL(y + i, u + i, v + i, dst + i * (XSTEP)); \
} \
if (i < len) { /* C-fallback */ \
WebP##FUNC_NAME##C(y + i, u + i, v + i, dst + i * XSTEP, len - i); \
CALL_C(y + i, u + i, v + i, dst + i * (XSTEP), len - i); \
} \
}
YUV444_FUNC(Yuv444ToRgba, VP8YuvToRgba32, 4);
YUV444_FUNC(Yuv444ToBgra, VP8YuvToBgra32, 4);
YUV444_FUNC(Yuv444ToRgb, VP8YuvToRgb32, 3);
YUV444_FUNC(Yuv444ToBgr, VP8YuvToBgr32, 3);
YUV444_FUNC(Yuv444ToRgba_SSE2, VP8YuvToRgba32_SSE2, WebPYuv444ToRgba_C, 4);
YUV444_FUNC(Yuv444ToBgra_SSE2, VP8YuvToBgra32_SSE2, WebPYuv444ToBgra_C, 4);
#if !defined(WEBP_REDUCE_CSP)
YUV444_FUNC(Yuv444ToRgb_SSE2, VP8YuvToRgb32_SSE2, WebPYuv444ToRgb_C, 3);
YUV444_FUNC(Yuv444ToBgr_SSE2, VP8YuvToBgr32_SSE2, WebPYuv444ToBgr_C, 3);
YUV444_FUNC(Yuv444ToArgb_SSE2, VP8YuvToArgb32_SSE2, WebPYuv444ToArgb_C, 4)
YUV444_FUNC(Yuv444ToRgba4444_SSE2, VP8YuvToRgba444432_SSE2, \
WebPYuv444ToRgba4444_C, 2)
YUV444_FUNC(Yuv444ToRgb565_SSE2, VP8YuvToRgb56532_SSE2, WebPYuv444ToRgb565_C, 2)
#endif // WEBP_REDUCE_CSP
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE2(void) {
WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba_SSE2;
WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra_SSE2;
WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba_SSE2;
WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra_SSE2;
#if !defined(WEBP_REDUCE_CSP)
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb_SSE2;
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr_SSE2;
WebPYUV444Converters[MODE_ARGB] = Yuv444ToArgb_SSE2;
WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444_SSE2;
WebPYUV444Converters[MODE_RGB_565] = Yuv444ToRgb565_SSE2;
WebPYUV444Converters[MODE_Argb] = Yuv444ToArgb_SSE2;
WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444_SSE2;
#endif // WEBP_REDUCE_CSP
}
#else

View file

@ -11,63 +11,11 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./yuv.h"
#include "src/dsp/yuv.h"
#include <assert.h>
#include <stdlib.h>
#if defined(WEBP_YUV_USE_TABLE)
static int done = 0;
static WEBP_INLINE uint8_t clip(int v, int max_value) {
return v < 0 ? 0 : v > max_value ? max_value : v;
}
int16_t VP8kVToR[256], VP8kUToB[256];
int32_t VP8kVToG[256], VP8kUToG[256];
uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {
int i;
if (done) {
return;
}
#ifndef USE_YUVj
for (i = 0; i < 256; ++i) {
VP8kVToR[i] = (89858 * (i - 128) + YUV_HALF) >> YUV_FIX;
VP8kUToG[i] = -22014 * (i - 128) + YUV_HALF;
VP8kVToG[i] = -45773 * (i - 128);
VP8kUToB[i] = (113618 * (i - 128) + YUV_HALF) >> YUV_FIX;
}
for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
const int k = ((i - 16) * 76283 + YUV_HALF) >> YUV_FIX;
VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
}
#else
for (i = 0; i < 256; ++i) {
VP8kVToR[i] = (91881 * (i - 128) + YUV_HALF) >> YUV_FIX;
VP8kUToG[i] = -22554 * (i - 128) + YUV_HALF;
VP8kVToG[i] = -46802 * (i - 128);
VP8kUToB[i] = (116130 * (i - 128) + YUV_HALF) >> YUV_FIX;
}
for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
const int k = i;
VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
}
#endif
done = 1;
}
#else
WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {}
#endif // WEBP_YUV_USE_TABLE
//-----------------------------------------------------------------------------
// Plain-C version
@ -75,14 +23,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {}
static void FUNC_NAME(const uint8_t* y, \
const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
const uint8_t* const end = dst + (len & ~1) * XSTEP; \
const uint8_t* const end = dst + (len & ~1) * (XSTEP); \
while (dst != end) { \
FUNC(y[0], u[0], v[0], dst); \
FUNC(y[1], u[0], v[0], dst + XSTEP); \
FUNC(y[1], u[0], v[0], dst + (XSTEP)); \
y += 2; \
++u; \
++v; \
dst += 2 * XSTEP; \
dst += 2 * (XSTEP); \
} \
if (len & 1) { \
FUNC(y[0], u[0], v[0], dst); \
@ -168,7 +116,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
//-----------------------------------------------------------------------------
// ARGB -> YUV converters
static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
int i;
for (i = 0; i < width; ++i) {
const uint32_t p = argb[i];
@ -220,14 +168,14 @@ void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
//-----------------------------------------------------------------------------
static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
static void ConvertRGB24ToY_C(const uint8_t* rgb, uint8_t* y, int width) {
int i;
for (i = 0; i < width; ++i, rgb += 3) {
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
}
}
static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
static void ConvertBGR24ToY_C(const uint8_t* bgr, uint8_t* y, int width) {
int i;
for (i = 0; i < width; ++i, bgr += 3) {
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
@ -246,6 +194,7 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
//-----------------------------------------------------------------------------
#if !WEBP_NEON_OMIT_C_CODE
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
static uint16_t clip_y(int v) {
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
@ -283,6 +232,7 @@ static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len,
out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
#undef MAX_Y
@ -308,22 +258,26 @@ static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used =
(VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used;
extern void WebPInitConvertARGBToYUVSSE2(void);
extern void WebPInitConvertARGBToYUVNEON(void);
extern void WebPInitSharpYUVSSE2(void);
extern void WebPInitSharpYUVNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
WebPConvertARGBToY = ConvertARGBToY;
WebPConvertARGBToY = ConvertARGBToY_C;
WebPConvertARGBToUV = WebPConvertARGBToUV_C;
WebPConvertRGB24ToY = ConvertRGB24ToY;
WebPConvertBGR24ToY = ConvertBGR24ToY;
WebPConvertRGB24ToY = ConvertRGB24ToY_C;
WebPConvertBGR24ToY = ConvertBGR24ToY_C;
WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;
#if !WEBP_NEON_OMIT_C_CODE
WebPSharpYUVUpdateY = SharpYUVUpdateY_C;
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C;
WebPSharpYUVFilterRow = SharpYUVFilterRow_C;
#endif
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
@ -333,5 +287,23 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
}
#endif // WEBP_USE_SSE2
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitConvertARGBToYUVNEON();
WebPInitSharpYUVNEON();
}
#endif // WEBP_USE_NEON
assert(WebPConvertARGBToY != NULL);
assert(WebPConvertARGBToUV != NULL);
assert(WebPConvertRGB24ToY != NULL);
assert(WebPConvertBGR24ToY != NULL);
assert(WebPConvertRGBA32ToUV != NULL);
assert(WebPSharpYUVUpdateY != NULL);
assert(WebPSharpYUVUpdateRGB != NULL);
assert(WebPSharpYUVFilterRow != NULL);
rgba_to_yuv_last_cpuinfo_used = VP8GetCPUInfo;
}

View file

@ -35,18 +35,8 @@
#ifndef WEBP_DSP_YUV_H_
#define WEBP_DSP_YUV_H_
#include "./dsp.h"
#include "../dec/vp8_dec.h"
#if defined(WEBP_EXPERIMENTAL_FEATURES)
// Do NOT activate this feature for real compression. This is only experimental!
// This flag is for comparison purpose against JPEG's "YUVj" natural colorspace.
// This colorspace is close to Rec.601's Y'CbCr model with the notable
// difference of allowing larger range for luma/chroma.
// See http://en.wikipedia.org/wiki/YCbCr#JPEG_conversion paragraph, and its
// difference with http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
// #define USE_YUVj
#endif
#include "src/dsp/dsp.h"
#include "src/dec/vp8_dec.h"
//------------------------------------------------------------------------------
// YUV -> RGB conversion
@ -58,12 +48,8 @@ extern "C" {
enum {
YUV_FIX = 16, // fixed-point precision for RGB->YUV
YUV_HALF = 1 << (YUV_FIX - 1),
YUV_MASK = (256 << YUV_FIX) - 1,
YUV_RANGE_MIN = -227, // min value of r/g/b output
YUV_RANGE_MAX = 256 + 226, // max value of r/g/b output
YUV_FIX2 = 6, // fixed-point precision for YUV->RGB
YUV_HALF2 = 1 << YUV_FIX2 >> 1,
YUV_MASK2 = (256 << YUV_FIX2) - 1
};
@ -111,7 +97,7 @@ static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
const int b = VP8YUVToB(y, u); // 5 usable bits
const int rg = (r & 0xf8) | (g >> 5);
const int gb = ((g << 3) & 0xe0) | (b >> 3);
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
rgb[0] = gb;
rgb[1] = rg;
#else
@ -127,7 +113,7 @@ static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
const int b = VP8YUVToB(y, u); // 4 usable bits
const int rg = (r & 0xf0) | (g >> 4);
const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
#ifdef WEBP_SWAP_16BIT_CSP
#if (WEBP_SWAP_16BIT_CSP == 1)
argb[0] = ba;
argb[1] = rg;
#else
@ -157,29 +143,26 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
rgba[3] = 0xff;
}
// Must be called before everything, to initialize the tables.
void VP8YUVInit(void);
//-----------------------------------------------------------------------------
// SSE2 extra functions (mostly for upsampling_sse2.c)
#if defined(WEBP_USE_SSE2)
// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst);
void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
#endif // WEBP_USE_SSE2
@ -192,8 +175,6 @@ static WEBP_INLINE int VP8ClipUV(int uv, int rounding) {
return ((uv & ~0xff) == 0) ? uv : (uv < 0) ? 0 : 255;
}
#ifndef USE_YUVj
static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
const int luma = 16839 * r + 33059 * g + 6420 * b;
return (luma + rounding + (16 << YUV_FIX)) >> YUV_FIX; // no need to clip
@ -209,28 +190,6 @@ static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
return VP8ClipUV(v, rounding);
}
#else
// This JPEG-YUV colorspace, only for comparison!
// These are also 16bit precision coefficients from Rec.601, but with full
// [0..255] output range.
static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
const int luma = 19595 * r + 38470 * g + 7471 * b;
return (luma + rounding) >> YUV_FIX; // no need to clip
}
static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
const int u = -11058 * r - 21710 * g + 32768 * b;
return VP8ClipUV(u, rounding);
}
static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
const int v = 32768 * r - 27439 * g - 5329 * b;
return VP8ClipUV(v, rounding);
}
#endif // USE_YUVj
#ifdef __cplusplus
} // extern "C"
#endif

View file

@ -12,11 +12,11 @@
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "./yuv.h"
#include "src/dsp/yuv.h"
//------------------------------------------------------------------------------
// simple point-sampling
@ -77,10 +77,10 @@ static void FUNC_NAME(const uint8_t* y, \
} \
}
ROW_FUNC(YuvToRgbRow, 3, 0, 1, 2, 0)
ROW_FUNC(YuvToRgbaRow, 4, 0, 1, 2, 3)
ROW_FUNC(YuvToBgrRow, 3, 2, 1, 0, 0)
ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
ROW_FUNC(YuvToRgbRow_MIPS32, 3, 0, 1, 2, 0)
ROW_FUNC(YuvToRgbaRow_MIPS32, 4, 0, 1, 2, 3)
ROW_FUNC(YuvToBgrRow_MIPS32, 3, 2, 1, 0, 0)
ROW_FUNC(YuvToBgraRow_MIPS32, 4, 2, 1, 0, 3)
#undef ROW_FUNC
@ -90,10 +90,10 @@ ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
extern void WebPInitSamplersMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPS32(void) {
WebPSamplers[MODE_RGB] = YuvToRgbRow;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
WebPSamplers[MODE_BGR] = YuvToBgrRow;
WebPSamplers[MODE_BGRA] = YuvToBgraRow;
WebPSamplers[MODE_RGB] = YuvToRgbRow_MIPS32;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPS32;
WebPSamplers[MODE_BGR] = YuvToBgrRow_MIPS32;
WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPS32;
}
#else // !WEBP_USE_MIPS32

View file

@ -12,11 +12,11 @@
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
#include "./dsp.h"
#include "src/dsp/dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "./yuv.h"
#include "src/dsp/yuv.h"
//------------------------------------------------------------------------------
// simple point-sampling
@ -105,10 +105,10 @@ static void FUNC_NAME(const uint8_t* y, \
} \
}
ROW_FUNC(YuvToRgbRow, 3, 0, 1, 2, 0)
ROW_FUNC(YuvToRgbaRow, 4, 0, 1, 2, 3)
ROW_FUNC(YuvToBgrRow, 3, 2, 1, 0, 0)
ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
ROW_FUNC(YuvToRgbRow_MIPSdspR2, 3, 0, 1, 2, 0)
ROW_FUNC(YuvToRgbaRow_MIPSdspR2, 4, 0, 1, 2, 3)
ROW_FUNC(YuvToBgrRow_MIPSdspR2, 3, 2, 1, 0, 0)
ROW_FUNC(YuvToBgraRow_MIPSdspR2, 4, 2, 1, 0, 3)
#undef ROW_FUNC
#undef ASM_CLOBBER_LIST
@ -121,10 +121,10 @@ ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
extern void WebPInitSamplersMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPSdspR2(void) {
WebPSamplers[MODE_RGB] = YuvToRgbRow;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
WebPSamplers[MODE_BGR] = YuvToBgrRow;
WebPSamplers[MODE_BGRA] = YuvToBgraRow;
WebPSamplers[MODE_RGB] = YuvToRgbRow_MIPSdspR2;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPSdspR2;
WebPSamplers[MODE_BGR] = YuvToBgrRow_MIPSdspR2;
WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2

288
thirdparty/libwebp/src/dsp/yuv_neon.c vendored Normal file
View file

@ -0,0 +1,288 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// YUV->RGB conversion functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "src/dsp/yuv.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include <stdlib.h>
#include "src/dsp/neon.h"
//-----------------------------------------------------------------------------
static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
const uint8x8_t G,
const uint8x8_t B) {
const uint16x8_t r = vmovl_u8(R);
const uint16x8_t g = vmovl_u8(G);
const uint16x8_t b = vmovl_u8(B);
const uint16x4_t r_lo = vget_low_u16(r);
const uint16x4_t r_hi = vget_high_u16(r);
const uint16x4_t g_lo = vget_low_u16(g);
const uint16x4_t g_hi = vget_high_u16(g);
const uint16x4_t b_lo = vget_low_u16(b);
const uint16x4_t b_hi = vget_high_u16(b);
const uint32x4_t tmp0_lo = vmull_n_u16( r_lo, 16839u);
const uint32x4_t tmp0_hi = vmull_n_u16( r_hi, 16839u);
const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, 33059u);
const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, 33059u);
const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, 6420u);
const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, 6420u);
const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, 16),
vrshrn_n_u32(tmp2_hi, 16));
const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(16));
return vqmovn_u16(Y2);
}
static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {
const uint8x8x3_t RGB = vld3_u8(rgb);
const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[0], RGB.val[1], RGB.val[2]);
vst1_u8(y + i, Y);
}
for (; i < width; ++i, rgb += 3) { // left-over
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
}
}
static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {
const uint8x8x3_t BGR = vld3_u8(bgr);
const uint8x8_t Y = ConvertRGBToY_NEON(BGR.val[2], BGR.val[1], BGR.val[0]);
vst1_u8(y + i, Y);
}
for (; i < width; ++i, bgr += 3) { // left-over
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
}
}
static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8) {
const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[2], RGB.val[1], RGB.val[0]);
vst1_u8(y + i, Y);
}
for (; i < width; ++i) { // left-over
const uint32_t p = argb[i];
y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
YUV_HALF);
}
}
//-----------------------------------------------------------------------------
// computes: DST_s16 = [(C0 * r + C1 * g + C2 * b) >> 16] + CST
#define MULTIPLY_16b_PREAMBLE(r, g, b) \
const int16x4_t r_lo = vreinterpret_s16_u16(vget_low_u16(r)); \
const int16x4_t r_hi = vreinterpret_s16_u16(vget_high_u16(r)); \
const int16x4_t g_lo = vreinterpret_s16_u16(vget_low_u16(g)); \
const int16x4_t g_hi = vreinterpret_s16_u16(vget_high_u16(g)); \
const int16x4_t b_lo = vreinterpret_s16_u16(vget_low_u16(b)); \
const int16x4_t b_hi = vreinterpret_s16_u16(vget_high_u16(b))
#define MULTIPLY_16b(C0, C1, C2, CST, DST_s16) do { \
const int32x4_t tmp0_lo = vmull_n_s16( r_lo, C0); \
const int32x4_t tmp0_hi = vmull_n_s16( r_hi, C0); \
const int32x4_t tmp1_lo = vmlal_n_s16(tmp0_lo, g_lo, C1); \
const int32x4_t tmp1_hi = vmlal_n_s16(tmp0_hi, g_hi, C1); \
const int32x4_t tmp2_lo = vmlal_n_s16(tmp1_lo, b_lo, C2); \
const int32x4_t tmp2_hi = vmlal_n_s16(tmp1_hi, b_hi, C2); \
const int16x8_t tmp3 = vcombine_s16(vshrn_n_s32(tmp2_lo, 16), \
vshrn_n_s32(tmp2_hi, 16)); \
DST_s16 = vaddq_s16(tmp3, vdupq_n_s16(CST)); \
} while (0)
// This needs to be a macro, since (128 << SHIFT) needs to be an immediate.
#define CONVERT_RGB_TO_UV(r, g, b, SHIFT, U_DST, V_DST) do { \
MULTIPLY_16b_PREAMBLE(r, g, b); \
MULTIPLY_16b(-9719, -19081, 28800, 128 << SHIFT, U_DST); \
MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST); \
} while (0)
static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {
const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
int16x8_t U, V;
CONVERT_RGB_TO_UV(RGB.val[0], RGB.val[1], RGB.val[2], 2, U, V);
vst1_u8(u + i, vqrshrun_n_s16(U, 2));
vst1_u8(v + i, vqrshrun_n_s16(V, 2));
}
for (; i < width; i += 1, rgb += 4) {
const int r = rgb[0], g = rgb[1], b = rgb[2];
u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
}
}
static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
int src_width, int do_store) {
int i;
for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {
const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);
const uint16x8_t R = vpaddlq_u8(RGB.val[2]); // pair-wise adds
const uint16x8_t G = vpaddlq_u8(RGB.val[1]);
const uint16x8_t B = vpaddlq_u8(RGB.val[0]);
int16x8_t U_tmp, V_tmp;
CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp);
{
const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1);
const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1);
if (do_store) {
vst1_u8(u, U);
vst1_u8(v, V);
} else {
const uint8x8_t prev_u = vld1_u8(u);
const uint8x8_t prev_v = vld1_u8(v);
vst1_u8(u, vrhadd_u8(U, prev_u));
vst1_u8(v, vrhadd_u8(V, prev_v));
}
}
}
if (i < src_width) { // left-over
WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
}
}
//------------------------------------------------------------------------------
extern void WebPInitConvertARGBToYUVNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
WebPConvertRGB24ToY = ConvertRGB24ToY_NEON;
WebPConvertBGR24ToY = ConvertBGR24ToY_NEON;
WebPConvertARGBToY = ConvertARGBToY_NEON;
WebPConvertARGBToUV = ConvertARGBToUV_NEON;
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
}
//------------------------------------------------------------------------------
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
static uint16_t clip_y_NEON(int v) {
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
}
static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len) {
int i;
const int16x8_t zero = vdupq_n_s16(0);
const int16x8_t max = vdupq_n_s16(MAX_Y);
uint64x2_t sum = vdupq_n_u64(0);
uint64_t diff;
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
const int16x8_t D = vsubq_s16(A, B); // diff_y
const int16x8_t F = vaddq_s16(C, D); // new_y
const uint16x8_t H =
vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
const int16x8_t I = vabsq_s16(D); // abs(diff_y)
vst1q_u16(dst + i, H);
sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
}
diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
for (; i < len; ++i) {
const int diff_y = ref[i] - src[i];
const int new_y = (int)(dst[i]) + diff_y;
dst[i] = clip_y_NEON(new_y);
diff += (uint64_t)(abs(diff_y));
}
return diff;
}
static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
int16_t* dst, int len) {
int i;
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t A = vld1q_s16(ref + i);
const int16x8_t B = vld1q_s16(src + i);
const int16x8_t C = vld1q_s16(dst + i);
const int16x8_t D = vsubq_s16(A, B); // diff_uv
const int16x8_t E = vaddq_s16(C, D); // new_uv
vst1q_s16(dst + i, E);
}
for (; i < len; ++i) {
const int diff_uv = ref[i] - src[i];
dst[i] += diff_uv;
}
}
static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out) {
int i;
const int16x8_t max = vdupq_n_s16(MAX_Y);
const int16x8_t zero = vdupq_n_s16(0);
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t a0 = vld1q_s16(A + i + 0);
const int16x8_t a1 = vld1q_s16(A + i + 1);
const int16x8_t b0 = vld1q_s16(B + i + 0);
const int16x8_t b1 = vld1q_s16(B + i + 1);
const int16x8_t a0b1 = vaddq_s16(a0, b1);
const int16x8_t a1b0 = vaddq_s16(a1, b0);
const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1
const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1)
const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0)
const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
const int16x8_t d0 = vaddq_s16(c1, a0);
const int16x8_t d1 = vaddq_s16(c0, a1);
const int16x8_t e0 = vrshrq_n_s16(d0, 1);
const int16x8_t e1 = vrshrq_n_s16(d1, 1);
const int16x8x2_t f = vzipq_s16(e0, e1);
const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
}
for (; i < len; ++i) {
const int a0b1 = A[i + 0] + B[i + 1];
const int a1b0 = A[i + 1] + B[i + 0];
const int a0a1b0b1 = a0b1 + a1b0 + 8;
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0);
out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1);
}
}
#undef MAX_Y
//------------------------------------------------------------------------------
extern void WebPInitSharpYUVNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) {
WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON;
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON;
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)
WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON)
#endif // WEBP_USE_NEON

View file

@ -11,11 +11,11 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./yuv.h"
#include "src/dsp/yuv.h"
#if defined(WEBP_USE_SSE2)
#include "./common_sse2.h"
#include "src/dsp/common_sse2.h"
#include <stdlib.h>
#include <emmintrin.h>
@ -26,12 +26,12 @@
// R = (19077 * y + 26149 * v - 14234) >> 6
// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
// B = (19077 * y + 33050 * u - 17685) >> 6
static void ConvertYUV444ToRGB(const __m128i* const Y0,
const __m128i* const U0,
const __m128i* const V0,
__m128i* const R,
__m128i* const G,
__m128i* const B) {
static void ConvertYUV444ToRGB_SSE2(const __m128i* const Y0,
const __m128i* const U0,
const __m128i* const V0,
__m128i* const R,
__m128i* const G,
__m128i* const B) {
const __m128i k19077 = _mm_set1_epi16(19077);
const __m128i k26149 = _mm_set1_epi16(26149);
const __m128i k14234 = _mm_set1_epi16(14234);
@ -66,13 +66,13 @@ static void ConvertYUV444ToRGB(const __m128i* const Y0,
}
// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
static WEBP_INLINE __m128i Load_HI_16(const uint8_t* src) {
static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) {
const __m128i zero = _mm_setzero_si128();
return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
}
// Load and replicate the U/V samples
static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) {
static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
const __m128i zero = _mm_setzero_si128();
const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
@ -80,29 +80,33 @@ static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) {
}
// Convert 32 samples of YUV444 to R/G/B
static void YUV444ToRGB(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
__m128i* const R, __m128i* const G, __m128i* const B) {
const __m128i Y0 = Load_HI_16(y), U0 = Load_HI_16(u), V0 = Load_HI_16(v);
ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B);
static void YUV444ToRGB_SSE2(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
V0 = Load_HI_16_SSE2(v);
ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
}
// Convert 32 samples of YUV420 to R/G/B
static void YUV420ToRGB(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
__m128i* const R, __m128i* const G, __m128i* const B) {
const __m128i Y0 = Load_HI_16(y), U0 = Load_UV_HI_8(u), V0 = Load_UV_HI_8(v);
ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B);
static void YUV420ToRGB_SSE2(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
V0 = Load_UV_HI_8_SSE2(v);
ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
}
// Pack R/G/B/A results into 32b output.
static WEBP_INLINE void PackAndStore4(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
const __m128i* const A,
uint8_t* const dst) {
static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
const __m128i* const A,
uint8_t* const dst) {
const __m128i rb = _mm_packus_epi16(*R, *B);
const __m128i ga = _mm_packus_epi16(*G, *A);
const __m128i rg = _mm_unpacklo_epi8(rb, ga);
@ -114,12 +118,12 @@ static WEBP_INLINE void PackAndStore4(const __m128i* const R,
}
// Pack R/G/B/A results into 16b output.
static WEBP_INLINE void PackAndStore4444(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
const __m128i* const A,
uint8_t* const dst) {
#if !defined(WEBP_SWAP_16BIT_CSP)
static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
const __m128i* const A,
uint8_t* const dst) {
#if (WEBP_SWAP_16BIT_CSP == 0)
const __m128i rg0 = _mm_packus_epi16(*R, *G);
const __m128i ba0 = _mm_packus_epi16(*B, *A);
#else
@ -136,10 +140,10 @@ static WEBP_INLINE void PackAndStore4444(const __m128i* const R,
}
// Pack R/G/B results into 16b output.
static WEBP_INLINE void PackAndStore565(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
uint8_t* const dst) {
static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
uint8_t* const dst) {
const __m128i r0 = _mm_packus_epi16(*R, *R);
const __m128i g0 = _mm_packus_epi16(*G, *G);
const __m128i b0 = _mm_packus_epi16(*B, *B);
@ -149,7 +153,7 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R,
const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3);
const __m128i rg = _mm_or_si128(r1, g1);
const __m128i gb = _mm_or_si128(g2, b1);
#if !defined(WEBP_SWAP_16BIT_CSP)
#if (WEBP_SWAP_16BIT_CSP == 0)
const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);
#else
const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);
@ -160,10 +164,10 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R,
// Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
__m128i* const in2, __m128i* const in3,
__m128i* const in4, __m128i* const in5,
uint8_t* const rgb) {
static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
__m128i* const in2, __m128i* const in3,
__m128i* const in4, __m128i* const in5,
uint8_t* const rgb) {
// The input is 6 registers of sixteen 8b but for the sake of explanation,
// let's take 6 registers of four 8b values.
// To pack, we will keep taking one every two 8b integer and move it
@ -186,69 +190,69 @@ static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
_mm_storeu_si128((__m128i*)(rgb + 80), *in5);
}
void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 32) {
__m128i R, G, B;
YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
PackAndStore4(&R, &G, &B, &kAlpha, dst);
YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
}
}
void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 32) {
__m128i R, G, B;
YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
PackAndStore4(&B, &G, &R, &kAlpha, dst);
YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
}
}
void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 32) {
__m128i R, G, B;
YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
PackAndStore4(&kAlpha, &R, &G, &B, dst);
YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
}
}
void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 16) {
__m128i R, G, B;
YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
PackAndStore4444_SSE2(&R, &G, &B, &kAlpha, dst);
}
}
void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
int n;
for (n = 0; n < 32; n += 8, dst += 16) {
__m128i R, G, B;
YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
PackAndStore565_SSE2(&R, &G, &B, dst);
}
}
void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 16) {
__m128i R, G, B;
YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
PackAndStore4444(&R, &G, &B, &kAlpha, dst);
}
}
void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
int n;
for (n = 0; n < 32; n += 8, dst += 16) {
__m128i R, G, B;
YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
PackAndStore565(&R, &G, &B, dst);
}
}
void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1);
YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
// Cast to 8b and store as RRRRGGGGBBBB.
rgb0 = _mm_packus_epi16(R0, R1);
@ -259,18 +263,18 @@ void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
rgb5 = _mm_packus_epi16(B2, B3);
// Pack as RGBRGBRGBRGB.
PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
}
void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1);
YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
// Cast to 8b and store as BBBBGGGGRRRR.
bgr0 = _mm_packus_epi16(B0, B1);
@ -281,20 +285,21 @@ void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
bgr5= _mm_packus_epi16(R2, R3);
// Pack as BGRBGRBGRBGR.
PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
}
//-----------------------------------------------------------------------------
// Arbitrary-length row conversion functions
static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToRgbaRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
__m128i R, G, B;
YUV420ToRGB(y, u, v, &R, &G, &B);
PackAndStore4(&R, &G, &B, &kAlpha, dst);
YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
y += 8;
u += 4;
v += 4;
@ -308,14 +313,15 @@ static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
}
}
static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToBgraRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
__m128i R, G, B;
YUV420ToRGB(y, u, v, &R, &G, &B);
PackAndStore4(&B, &G, &R, &kAlpha, dst);
YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
y += 8;
u += 4;
v += 4;
@ -329,14 +335,15 @@ static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
}
}
static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToArgbRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
__m128i R, G, B;
YUV420ToRGB(y, u, v, &R, &G, &B);
PackAndStore4(&kAlpha, &R, &G, &B, dst);
YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
y += 8;
u += 4;
v += 4;
@ -350,17 +357,18 @@ static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
}
}
static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToRgbRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);
YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2);
YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1);
YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2);
YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
// Cast to 8b and store as RRRRGGGGBBBB.
rgb0 = _mm_packus_epi16(R0, R1);
@ -371,7 +379,7 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
rgb5 = _mm_packus_epi16(B2, B3);
// Pack as RGBRGBRGBRGB.
PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
y += 32;
u += 16;
@ -386,17 +394,18 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
}
}
static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
static void YuvToBgrRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);
YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2);
YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1);
YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2);
YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
// Cast to 8b and store as BBBBGGGGRRRR.
bgr0 = _mm_packus_epi16(B0, B1);
@ -407,7 +416,7 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
bgr5 = _mm_packus_epi16(R2, R3);
// Pack as BGRBGRBGRBGR.
PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
y += 32;
u += 16;
@ -428,11 +437,11 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
extern void WebPInitSamplersSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
WebPSamplers[MODE_RGB] = YuvToRgbRow;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
WebPSamplers[MODE_BGR] = YuvToBgrRow;
WebPSamplers[MODE_BGRA] = YuvToBgraRow;
WebPSamplers[MODE_ARGB] = YuvToArgbRow;
WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE2;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow_SSE2;
WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE2;
WebPSamplers[MODE_BGRA] = YuvToBgraRow_SSE2;
WebPSamplers[MODE_ARGB] = YuvToArgbRow_SSE2;
}
//------------------------------------------------------------------------------
@ -445,7 +454,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
// Function that inserts a value of the second half of the in buffer in between
// every two char of the first half.
static WEBP_INLINE void RGB24PackedToPlanarHelper(
static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) {
out[0] = _mm_unpacklo_epi8(in[0], in[3]);
out[1] = _mm_unpackhi_epi8(in[0], in[3]);
@ -458,8 +467,8 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper(
// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// Similar to PlanarTo24bHelper(), but in reverse order.
static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
__m128i* const out /*out[6]*/) {
static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
__m128i tmp[6];
tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0));
tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
@ -468,16 +477,16 @@ static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64));
tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80));
RGB24PackedToPlanarHelper(tmp, out);
RGB24PackedToPlanarHelper(out, tmp);
RGB24PackedToPlanarHelper(tmp, out);
RGB24PackedToPlanarHelper(out, tmp);
RGB24PackedToPlanarHelper(tmp, out);
RGB24PackedToPlanarHelper_SSE2(tmp, out);
RGB24PackedToPlanarHelper_SSE2(out, tmp);
RGB24PackedToPlanarHelper_SSE2(tmp, out);
RGB24PackedToPlanarHelper_SSE2(out, tmp);
RGB24PackedToPlanarHelper_SSE2(tmp, out);
}
// Convert 8 packed ARGB to r[], g[], b[]
static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
__m128i* const rgb /*in[6]*/) {
static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
__m128i* const rgb /*in[6]*/) {
const __m128i zero = _mm_setzero_si128();
__m128i a0 = LOAD_16(argb + 0);
__m128i a1 = LOAD_16(argb + 4);
@ -511,10 +520,10 @@ static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
} while (0)
#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
static WEBP_INLINE void ConvertRGBToY(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
__m128i* const Y) {
static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
__m128i* const Y) {
const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
const __m128i kGB_y = MK_CST_16(16384, 6420);
const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
@ -526,10 +535,11 @@ static WEBP_INLINE void ConvertRGBToY(const __m128i* const R,
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
}
static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
__m128i* const U, __m128i* const V) {
static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
__m128i* const U,
__m128i* const V) {
const __m128i kRG_u = MK_CST_16(-9719, -19081);
const __m128i kGB_u = MK_CST_16(0, 28800);
const __m128i kRG_v = MK_CST_16(28800, 0);
@ -549,14 +559,14 @@ static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R,
#undef MK_CST_16
#undef TRANSFORM
static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
__m128i rgb_plane[6];
int j;
RGB24PackedToPlanar(rgb, rgb_plane);
RGB24PackedToPlanar_SSE2(rgb, rgb_plane);
for (j = 0; j < 2; ++j, i += 16) {
const __m128i zero = _mm_setzero_si128();
@ -566,13 +576,13 @@ static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
ConvertRGBToY(&r, &g, &b, &Y0);
ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
ConvertRGBToY(&r, &g, &b, &Y1);
ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
@ -583,14 +593,14 @@ static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
}
}
static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
__m128i bgr_plane[6];
int j;
RGB24PackedToPlanar(bgr, bgr_plane);
RGB24PackedToPlanar_SSE2(bgr, bgr_plane);
for (j = 0; j < 2; ++j, i += 16) {
const __m128i zero = _mm_setzero_si128();
@ -600,13 +610,13 @@ static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
ConvertRGBToY(&r, &g, &b, &Y0);
ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
ConvertRGBToY(&r, &g, &b, &Y1);
ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
@ -617,14 +627,14 @@ static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
}
}
static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
const int max_width = width & ~15;
int i;
for (i = 0; i < max_width; i += 16) {
__m128i Y0, Y1, rgb[6];
RGB32PackedToPlanar(&argb[i], rgb);
ConvertRGBToY(&rgb[0], &rgb[2], &rgb[4], &Y0);
ConvertRGBToY(&rgb[1], &rgb[3], &rgb[5], &Y1);
RGB32PackedToPlanar_SSE2(&argb[i], rgb);
ConvertRGBToY_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0);
ConvertRGBToY_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1);
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
for (; i < width; ++i) { // left-over
@ -636,31 +646,33 @@ static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
// Horizontal add (doubled) of two 16b values, result is 16b.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
static void HorizontalAddPack(const __m128i* const A, const __m128i* const B,
__m128i* const out) {
static void HorizontalAddPack_SSE2(const __m128i* const A,
const __m128i* const B,
__m128i* const out) {
const __m128i k2 = _mm_set1_epi16(2);
const __m128i C = _mm_madd_epi16(*A, k2);
const __m128i D = _mm_madd_epi16(*B, k2);
*out = _mm_packs_epi32(C, D);
}
static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
int src_width, int do_store) {
static void ConvertARGBToUV_SSE2(const uint32_t* argb,
uint8_t* u, uint8_t* v,
int src_width, int do_store) {
const int max_width = src_width & ~31;
int i;
for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
__m128i rgb[6], U0, V0, U1, V1;
RGB32PackedToPlanar(&argb[i], rgb);
HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
RGB32PackedToPlanar_SSE2(&argb[i], rgb);
HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
RGB32PackedToPlanar(&argb[i + 16], rgb);
HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
RGB32PackedToPlanar_SSE2(&argb[i + 16], rgb);
HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
U0 = _mm_packus_epi16(U0, U1);
V0 = _mm_packus_epi16(V0, V1);
@ -679,10 +691,9 @@ static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
}
// Convert 16 packed ARGB 16b-values to r[], g[], b[]
static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx,
__m128i* const r,
__m128i* const g,
__m128i* const b) {
static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
const uint16_t* const rgbx,
__m128i* const r, __m128i* const g, __m128i* const b) {
const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ...
@ -701,16 +712,16 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx,
*b = _mm_unpacklo_epi64(B1, B3);
}
static void ConvertRGBA32ToUV(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
const int max_width = width & ~15;
const uint16_t* const last_rgb = rgb + 4 * max_width;
while (rgb < last_rgb) {
__m128i r, g, b, U0, V0, U1, V1;
RGBA32PackedToPlanar_16b(rgb + 0, &r, &g, &b);
ConvertRGBToUV(&r, &g, &b, &U0, &V0);
RGBA32PackedToPlanar_16b(rgb + 32, &r, &g, &b);
ConvertRGBToUV(&r, &g, &b, &U1, &V1);
RGBA32PackedToPlanar_16b_SSE2(rgb + 0, &r, &g, &b);
ConvertRGBToUV_SSE2(&r, &g, &b, &U0, &V0);
RGBA32PackedToPlanar_16b_SSE2(rgb + 32, &r, &g, &b);
ConvertRGBToUV_SSE2(&r, &g, &b, &U1, &V1);
STORE_16(_mm_packus_epi16(U0, U1), u);
STORE_16(_mm_packus_epi16(V0, V1), v);
u += 16;
@ -727,13 +738,13 @@ static void ConvertRGBA32ToUV(const uint16_t* rgb,
extern void WebPInitConvertARGBToYUVSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
WebPConvertARGBToY = ConvertARGBToY;
WebPConvertARGBToUV = ConvertARGBToUV;
WebPConvertARGBToY = ConvertARGBToY_SSE2;
WebPConvertARGBToUV = ConvertARGBToUV_SSE2;
WebPConvertRGB24ToY = ConvertRGB24ToY;
WebPConvertBGR24ToY = ConvertBGR24ToY;
WebPConvertRGB24ToY = ConvertRGB24ToY_SSE2;
WebPConvertBGR24ToY = ConvertBGR24ToY_SSE2;
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV;
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
}
//------------------------------------------------------------------------------

View file

@ -14,12 +14,12 @@
#include <assert.h>
#include <stdlib.h>
#include "./vp8i_enc.h"
#include "../dsp/dsp.h"
#include "../utils/filters_utils.h"
#include "../utils/quant_levels_utils.h"
#include "../utils/utils.h"
#include "../webp/format_constants.h"
#include "src/enc/vp8i_enc.h"
#include "src/dsp/dsp.h"
#include "src/utils/filters_utils.h"
#include "src/utils/quant_levels_utils.h"
#include "src/utils/utils.h"
#include "src/webp/format_constants.h"
// -----------------------------------------------------------------------------
// Encodes the given alpha data via specified compression method 'method'.
@ -44,11 +44,11 @@
// invalid quality or method, or
// memory allocation for the compressed data fails.
#include "../enc/vp8li_enc.h"
#include "src/enc/vp8li_enc.h"
static int EncodeLossless(const uint8_t* const data, int width, int height,
int effort_level, // in [0..6] range
VP8LBitWriter* const bw,
int use_quality_100, VP8LBitWriter* const bw,
WebPAuxStats* const stats) {
int ok = 0;
WebPConfig config;
@ -76,7 +76,10 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
// Set a low default quality for encoding alpha. Ensure that Alpha quality at
// lower methods (3 and below) is less than the threshold for triggering
// costly 'BackwardReferencesTraceBackwards'.
config.quality = 8.f * effort_level;
// If the alpha quality is set to 100 and the method to 6, allow for a high
// lossless quality to trigger the cruncher.
config.quality =
(use_quality_100 && effort_level == 6) ? 100 : 8.f * effort_level;
assert(config.quality >= 0 && config.quality <= 100.f);
// TODO(urvang): Temporary fix to avoid generating images that trigger
@ -134,7 +137,7 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
if (method != ALPHA_NO_COMPRESSION) {
ok = VP8LBitWriterInit(&tmp_bw, data_size >> 3);
ok = ok && EncodeLossless(alpha_src, width, height, effort_level,
&tmp_bw, &result->stats);
!reduce_levels, &tmp_bw, &result->stats);
if (ok) {
output = VP8LBitWriterFinish(&tmp_bw);
output_size = VP8LBitWriterNumBytes(&tmp_bw);
@ -264,6 +267,7 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
reduce_levels, effort_level, NULL, &best);
}
if (ok) {
#if !defined(WEBP_DISABLE_STATS)
if (stats != NULL) {
stats->lossless_features = best.stats.lossless_features;
stats->histogram_bits = best.stats.histogram_bits;
@ -274,6 +278,9 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
stats->lossless_hdr_size = best.stats.lossless_hdr_size;
stats->lossless_data_size = best.stats.lossless_data_size;
}
#else
(void)stats;
#endif
*output_size = VP8BitWriterSize(&best.bw);
*output = VP8BitWriterBuf(&best.bw);
} else {
@ -339,10 +346,12 @@ static int EncodeAlpha(VP8Encoder* const enc,
ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
filter, reduce_levels, effort_level, output,
output_size, pic->stats);
#if !defined(WEBP_DISABLE_STATS)
if (pic->stats != NULL) { // need stats?
pic->stats->coded_size += (int)(*output_size);
enc->sse_[3] = sse;
}
#endif
}
WebPSafeFree(quant_alpha);

View file

@ -15,9 +15,9 @@
#include <string.h>
#include <assert.h>
#include "./vp8i_enc.h"
#include "./cost_enc.h"
#include "../utils/utils.h"
#include "src/enc/vp8i_enc.h"
#include "src/enc/cost_enc.h"
#include "src/utils/utils.h"
#define MAX_ITERS_K_MEANS 6

View file

@ -0,0 +1,790 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Improves a given set of backward references by analyzing its bit cost.
// The algorithm is similar to the Zopfli compression algorithm but tailored to
// images.
//
// Author: Vincent Rabaud (vrabaud@google.com)
//
#include <assert.h>
#include "src/enc/backward_references_enc.h"
#include "src/enc/histogram_enc.h"
#include "src/dsp/lossless_common.h"
#include "src/utils/color_cache_utils.h"
#include "src/utils/utils.h"
#define VALUES_IN_BYTE 256
extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
extern int VP8LDistanceToPlaneCode(int xsize, int dist);
extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
const PixOrCopy v);
typedef struct {
double alpha_[VALUES_IN_BYTE];
double red_[VALUES_IN_BYTE];
double blue_[VALUES_IN_BYTE];
double distance_[NUM_DISTANCE_CODES];
double* literal_;
} CostModel;
static void ConvertPopulationCountTableToBitEstimates(
int num_symbols, const uint32_t population_counts[], double output[]) {
uint32_t sum = 0;
int nonzeros = 0;
int i;
for (i = 0; i < num_symbols; ++i) {
sum += population_counts[i];
if (population_counts[i] > 0) {
++nonzeros;
}
}
if (nonzeros <= 1) {
memset(output, 0, num_symbols * sizeof(*output));
} else {
const double logsum = VP8LFastLog2(sum);
for (i = 0; i < num_symbols; ++i) {
output[i] = logsum - VP8LFastLog2(population_counts[i]);
}
}
}
static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
const VP8LBackwardRefs* const refs) {
int ok = 0;
VP8LRefsCursor c = VP8LRefsCursorInit(refs);
VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
if (histo == NULL) goto Error;
// The following code is similar to VP8LHistogramCreate but converts the
// distance to plane code.
VP8LHistogramInit(histo, cache_bits);
while (VP8LRefsCursorOk(&c)) {
VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, VP8LDistanceToPlaneCode,
xsize);
VP8LRefsCursorNext(&c);
}
ConvertPopulationCountTableToBitEstimates(
VP8LHistogramNumCodes(histo->palette_code_bits_),
histo->literal_, m->literal_);
ConvertPopulationCountTableToBitEstimates(
VALUES_IN_BYTE, histo->red_, m->red_);
ConvertPopulationCountTableToBitEstimates(
VALUES_IN_BYTE, histo->blue_, m->blue_);
ConvertPopulationCountTableToBitEstimates(
VALUES_IN_BYTE, histo->alpha_, m->alpha_);
ConvertPopulationCountTableToBitEstimates(
NUM_DISTANCE_CODES, histo->distance_, m->distance_);
ok = 1;
Error:
VP8LFreeHistogram(histo);
return ok;
}
static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
return m->alpha_[v >> 24] +
m->red_[(v >> 16) & 0xff] +
m->literal_[(v >> 8) & 0xff] +
m->blue_[v & 0xff];
}
static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
return m->literal_[literal_idx];
}
static WEBP_INLINE double GetLengthCost(const CostModel* const m,
uint32_t length) {
int code, extra_bits;
VP8LPrefixEncodeBits(length, &code, &extra_bits);
return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
}
static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
uint32_t distance) {
int code, extra_bits;
VP8LPrefixEncodeBits(distance, &code, &extra_bits);
return m->distance_[code] + extra_bits;
}
static WEBP_INLINE void AddSingleLiteralWithCostModel(
const uint32_t* const argb, VP8LColorCache* const hashers,
const CostModel* const cost_model, int idx, int use_color_cache,
float prev_cost, float* const cost, uint16_t* const dist_array) {
double cost_val = prev_cost;
const uint32_t color = argb[idx];
const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
if (ix >= 0) {
// use_color_cache is true and hashers contains color
const double mul0 = 0.68;
cost_val += GetCacheCost(cost_model, ix) * mul0;
} else {
const double mul1 = 0.82;
if (use_color_cache) VP8LColorCacheInsert(hashers, color);
cost_val += GetLiteralCost(cost_model, color) * mul1;
}
if (cost[idx] > cost_val) {
cost[idx] = (float)cost_val;
dist_array[idx] = 1; // only one is inserted.
}
}
// -----------------------------------------------------------------------------
// CostManager and interval handling
// Empirical value to avoid high memory consumption but good for performance.
#define COST_CACHE_INTERVAL_SIZE_MAX 500
// To perform backward reference every pixel at index index_ is considered and
// the cost for the MAX_LENGTH following pixels computed. Those following pixels
// at index index_ + k (k from 0 to MAX_LENGTH) have a cost of:
// cost_ = distance cost at index + GetLengthCost(cost_model, k)
// and the minimum value is kept. GetLengthCost(cost_model, k) is cached in an
// array of size MAX_LENGTH.
// Instead of performing MAX_LENGTH comparisons per pixel, we keep track of the
// minimal values using intervals of constant cost.
// An interval is defined by the index_ of the pixel that generated it and
// is only useful in a range of indices from start_ to end_ (exclusive), i.e.
// it contains the minimum value for pixels between start_ and end_.
// Intervals are stored in a linked list and ordered by start_. When a new
// interval has a better value, old intervals are split or removed. There are
// therefore no overlapping intervals.
typedef struct CostInterval CostInterval;
struct CostInterval {
float cost_;
int start_;
int end_;
int index_;
CostInterval* previous_;
CostInterval* next_;
};
// The GetLengthCost(cost_model, k) are cached in a CostCacheInterval.
typedef struct {
double cost_;
int start_;
int end_; // Exclusive.
} CostCacheInterval;
// This structure is in charge of managing intervals and costs.
// It caches the different CostCacheInterval, caches the different
// GetLengthCost(cost_model, k) in cost_cache_ and the CostInterval's (whose
// count_ is limited by COST_CACHE_INTERVAL_SIZE_MAX).
#define COST_MANAGER_MAX_FREE_LIST 10
typedef struct {
CostInterval* head_;
int count_; // The number of stored intervals.
CostCacheInterval* cache_intervals_;
size_t cache_intervals_size_;
double cost_cache_[MAX_LENGTH]; // Contains the GetLengthCost(cost_model, k).
float* costs_;
uint16_t* dist_array_;
// Most of the time, we only need few intervals -> use a free-list, to avoid
// fragmentation with small allocs in most common cases.
CostInterval intervals_[COST_MANAGER_MAX_FREE_LIST];
CostInterval* free_intervals_;
// These are regularly malloc'd remains. This list can't grow larger than than
// size COST_CACHE_INTERVAL_SIZE_MAX - COST_MANAGER_MAX_FREE_LIST, note.
CostInterval* recycled_intervals_;
} CostManager;
static void CostIntervalAddToFreeList(CostManager* const manager,
CostInterval* const interval) {
interval->next_ = manager->free_intervals_;
manager->free_intervals_ = interval;
}
static int CostIntervalIsInFreeList(const CostManager* const manager,
const CostInterval* const interval) {
return (interval >= &manager->intervals_[0] &&
interval <= &manager->intervals_[COST_MANAGER_MAX_FREE_LIST - 1]);
}
static void CostManagerInitFreeList(CostManager* const manager) {
int i;
manager->free_intervals_ = NULL;
for (i = 0; i < COST_MANAGER_MAX_FREE_LIST; ++i) {
CostIntervalAddToFreeList(manager, &manager->intervals_[i]);
}
}
static void DeleteIntervalList(CostManager* const manager,
const CostInterval* interval) {
while (interval != NULL) {
const CostInterval* const next = interval->next_;
if (!CostIntervalIsInFreeList(manager, interval)) {
WebPSafeFree((void*)interval);
} // else: do nothing
interval = next;
}
}
static void CostManagerClear(CostManager* const manager) {
if (manager == NULL) return;
WebPSafeFree(manager->costs_);
WebPSafeFree(manager->cache_intervals_);
// Clear the interval lists.
DeleteIntervalList(manager, manager->head_);
manager->head_ = NULL;
DeleteIntervalList(manager, manager->recycled_intervals_);
manager->recycled_intervals_ = NULL;
// Reset pointers, count_ and cache_intervals_size_.
memset(manager, 0, sizeof(*manager));
CostManagerInitFreeList(manager);
}
static int CostManagerInit(CostManager* const manager,
uint16_t* const dist_array, int pix_count,
const CostModel* const cost_model) {
int i;
const int cost_cache_size = (pix_count > MAX_LENGTH) ? MAX_LENGTH : pix_count;
manager->costs_ = NULL;
manager->cache_intervals_ = NULL;
manager->head_ = NULL;
manager->recycled_intervals_ = NULL;
manager->count_ = 0;
manager->dist_array_ = dist_array;
CostManagerInitFreeList(manager);
// Fill in the cost_cache_.
manager->cache_intervals_size_ = 1;
manager->cost_cache_[0] = GetLengthCost(cost_model, 0);
for (i = 1; i < cost_cache_size; ++i) {
manager->cost_cache_[i] = GetLengthCost(cost_model, i);
// Get the number of bound intervals.
if (manager->cost_cache_[i] != manager->cost_cache_[i - 1]) {
++manager->cache_intervals_size_;
}
}
// With the current cost model, we usually have below 20 intervals.
// The worst case scenario with a cost model would be if every length has a
// different cost, hence MAX_LENGTH but that is impossible with the current
// implementation that spirals around a pixel.
assert(manager->cache_intervals_size_ <= MAX_LENGTH);
manager->cache_intervals_ = (CostCacheInterval*)WebPSafeMalloc(
manager->cache_intervals_size_, sizeof(*manager->cache_intervals_));
if (manager->cache_intervals_ == NULL) {
CostManagerClear(manager);
return 0;
}
// Fill in the cache_intervals_.
{
CostCacheInterval* cur = manager->cache_intervals_;
// Consecutive values in cost_cache_ are compared and if a big enough
// difference is found, a new interval is created and bounded.
cur->start_ = 0;
cur->end_ = 1;
cur->cost_ = manager->cost_cache_[0];
for (i = 1; i < cost_cache_size; ++i) {
const double cost_val = manager->cost_cache_[i];
if (cost_val != cur->cost_) {
++cur;
// Initialize an interval.
cur->start_ = i;
cur->cost_ = cost_val;
}
cur->end_ = i + 1;
}
}
manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
if (manager->costs_ == NULL) {
CostManagerClear(manager);
return 0;
}
// Set the initial costs_ high for every pixel as we will keep the minimum.
for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f;
return 1;
}
// Given the cost and the position that define an interval, update the cost at
// pixel 'i' if it is smaller than the previously computed value.
static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
int position, float cost) {
const int k = i - position;
assert(k >= 0 && k < MAX_LENGTH);
if (manager->costs_[i] > cost) {
manager->costs_[i] = cost;
manager->dist_array_[i] = k + 1;
}
}
// Given the cost and the position that define an interval, update the cost for
// all the pixels between 'start' and 'end' excluded.
static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager,
int start, int end, int position,
float cost) {
int i;
for (i = start; i < end; ++i) UpdateCost(manager, i, position, cost);
}
// Given two intervals, make 'prev' be the previous one of 'next' in 'manager'.
static WEBP_INLINE void ConnectIntervals(CostManager* const manager,
CostInterval* const prev,
CostInterval* const next) {
if (prev != NULL) {
prev->next_ = next;
} else {
manager->head_ = next;
}
if (next != NULL) next->previous_ = prev;
}
// Pop an interval in the manager.
static WEBP_INLINE void PopInterval(CostManager* const manager,
CostInterval* const interval) {
if (interval == NULL) return;
ConnectIntervals(manager, interval->previous_, interval->next_);
if (CostIntervalIsInFreeList(manager, interval)) {
CostIntervalAddToFreeList(manager, interval);
} else { // recycle regularly malloc'd intervals too
interval->next_ = manager->recycled_intervals_;
manager->recycled_intervals_ = interval;
}
--manager->count_;
assert(manager->count_ >= 0);
}
// Update the cost at index i by going over all the stored intervals that
// overlap with i.
// If 'do_clean_intervals' is set to something different than 0, intervals that
// end before 'i' will be popped.
static WEBP_INLINE void UpdateCostAtIndex(CostManager* const manager, int i,
int do_clean_intervals) {
CostInterval* current = manager->head_;
while (current != NULL && current->start_ <= i) {
CostInterval* const next = current->next_;
if (current->end_ <= i) {
if (do_clean_intervals) {
// We have an outdated interval, remove it.
PopInterval(manager, current);
}
} else {
UpdateCost(manager, i, current->index_, current->cost_);
}
current = next;
}
}
// Given a current orphan interval and its previous interval, before
// it was orphaned (which can be NULL), set it at the right place in the list
// of intervals using the start_ ordering and the previous interval as a hint.
static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager,
CostInterval* const current,
CostInterval* previous) {
assert(current != NULL);
if (previous == NULL) previous = manager->head_;
while (previous != NULL && current->start_ < previous->start_) {
previous = previous->previous_;
}
while (previous != NULL && previous->next_ != NULL &&
previous->next_->start_ < current->start_) {
previous = previous->next_;
}
if (previous != NULL) {
ConnectIntervals(manager, current, previous->next_);
} else {
ConnectIntervals(manager, current, manager->head_);
}
ConnectIntervals(manager, previous, current);
}
// Insert an interval in the list contained in the manager by starting at
// interval_in as a hint. The intervals are sorted by start_ value.
static WEBP_INLINE void InsertInterval(CostManager* const manager,
CostInterval* const interval_in,
float cost, int position, int start,
int end) {
CostInterval* interval_new;
if (start >= end) return;
if (manager->count_ >= COST_CACHE_INTERVAL_SIZE_MAX) {
// Serialize the interval if we cannot store it.
UpdateCostPerInterval(manager, start, end, position, cost);
return;
}
if (manager->free_intervals_ != NULL) {
interval_new = manager->free_intervals_;
manager->free_intervals_ = interval_new->next_;
} else if (manager->recycled_intervals_ != NULL) {
interval_new = manager->recycled_intervals_;
manager->recycled_intervals_ = interval_new->next_;
} else { // malloc for good
interval_new = (CostInterval*)WebPSafeMalloc(1, sizeof(*interval_new));
if (interval_new == NULL) {
// Write down the interval if we cannot create it.
UpdateCostPerInterval(manager, start, end, position, cost);
return;
}
}
interval_new->cost_ = cost;
interval_new->index_ = position;
interval_new->start_ = start;
interval_new->end_ = end;
PositionOrphanInterval(manager, interval_new, interval_in);
++manager->count_;
}
// Given a new cost interval defined by its start at position, its length value
// and distance_cost, add its contributions to the previous intervals and costs.
// If handling the interval or one of its subintervals becomes to heavy, its
// contribution is added to the costs right away.
static WEBP_INLINE void PushInterval(CostManager* const manager,
double distance_cost, int position,
int len) {
size_t i;
CostInterval* interval = manager->head_;
CostInterval* interval_next;
const CostCacheInterval* const cost_cache_intervals =
manager->cache_intervals_;
// If the interval is small enough, no need to deal with the heavy
// interval logic, just serialize it right away. This constant is empirical.
const int kSkipDistance = 10;
if (len < kSkipDistance) {
int j;
for (j = position; j < position + len; ++j) {
const int k = j - position;
float cost_tmp;
assert(k >= 0 && k < MAX_LENGTH);
cost_tmp = (float)(distance_cost + manager->cost_cache_[k]);
if (manager->costs_[j] > cost_tmp) {
manager->costs_[j] = cost_tmp;
manager->dist_array_[j] = k + 1;
}
}
return;
}
for (i = 0; i < manager->cache_intervals_size_ &&
cost_cache_intervals[i].start_ < len;
++i) {
// Define the intersection of the ith interval with the new one.
int start = position + cost_cache_intervals[i].start_;
const int end = position + (cost_cache_intervals[i].end_ > len
? len
: cost_cache_intervals[i].end_);
const float cost = (float)(distance_cost + cost_cache_intervals[i].cost_);
for (; interval != NULL && interval->start_ < end;
interval = interval_next) {
interval_next = interval->next_;
// Make sure we have some overlap
if (start >= interval->end_) continue;
if (cost >= interval->cost_) {
// When intervals are represented, the lower, the better.
// [**********************************************************[
// start end
// [----------------------------------[
// interval->start_ interval->end_
// If we are worse than what we already have, add whatever we have so
// far up to interval.
const int start_new = interval->end_;
InsertInterval(manager, interval, cost, position, start,
interval->start_);
start = start_new;
if (start >= end) break;
continue;
}
if (start <= interval->start_) {
if (interval->end_ <= end) {
// [----------------------------------[
// interval->start_ interval->end_
// [**************************************************************[
// start end
// We can safely remove the old interval as it is fully included.
PopInterval(manager, interval);
} else {
// [------------------------------------[
// interval->start_ interval->end_
// [*****************************[
// start end
interval->start_ = end;
break;
}
} else {
if (end < interval->end_) {
// [--------------------------------------------------------------[
// interval->start_ interval->end_
// [*****************************[
// start end
// We have to split the old interval as it fully contains the new one.
const int end_original = interval->end_;
interval->end_ = start;
InsertInterval(manager, interval, interval->cost_, interval->index_,
end, end_original);
interval = interval->next_;
break;
} else {
// [------------------------------------[
// interval->start_ interval->end_
// [*****************************[
// start end
interval->end_ = start;
}
}
}
// Insert the remaining interval from start to end.
InsertInterval(manager, interval, cost, position, start, end);
}
}
static int BackwardReferencesHashChainDistanceOnly(
int xsize, int ysize, const uint32_t* const argb, int cache_bits,
const VP8LHashChain* const hash_chain, const VP8LBackwardRefs* const refs,
uint16_t* const dist_array) {
int i;
int ok = 0;
int cc_init = 0;
const int pix_count = xsize * ysize;
const int use_color_cache = (cache_bits > 0);
const size_t literal_array_size =
sizeof(double) * (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
((cache_bits > 0) ? (1 << cache_bits) : 0));
const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
CostModel* const cost_model =
(CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
VP8LColorCache hashers;
CostManager* cost_manager =
(CostManager*)WebPSafeMalloc(1ULL, sizeof(*cost_manager));
int offset_prev = -1, len_prev = -1;
double offset_cost = -1;
int first_offset_is_constant = -1; // initialized with 'impossible' value
int reach = 0;
if (cost_model == NULL || cost_manager == NULL) goto Error;
cost_model->literal_ = (double*)(cost_model + 1);
if (use_color_cache) {
cc_init = VP8LColorCacheInit(&hashers, cache_bits);
if (!cc_init) goto Error;
}
if (!CostModelBuild(cost_model, xsize, cache_bits, refs)) {
goto Error;
}
if (!CostManagerInit(cost_manager, dist_array, pix_count, cost_model)) {
goto Error;
}
// We loop one pixel at a time, but store all currently best points to
// non-processed locations from this point.
dist_array[0] = 0;
// Add first pixel as literal.
AddSingleLiteralWithCostModel(argb, &hashers, cost_model, 0, use_color_cache,
0.f, cost_manager->costs_, dist_array);
for (i = 1; i < pix_count; ++i) {
const float prev_cost = cost_manager->costs_[i - 1];
int offset, len;
VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
// Try adding the pixel as a literal.
AddSingleLiteralWithCostModel(argb, &hashers, cost_model, i,
use_color_cache, prev_cost,
cost_manager->costs_, dist_array);
// If we are dealing with a non-literal.
if (len >= 2) {
if (offset != offset_prev) {
const int code = VP8LDistanceToPlaneCode(xsize, offset);
offset_cost = GetDistanceCost(cost_model, code);
first_offset_is_constant = 1;
PushInterval(cost_manager, prev_cost + offset_cost, i, len);
} else {
assert(offset_cost >= 0);
assert(len_prev >= 0);
assert(first_offset_is_constant == 0 || first_offset_is_constant == 1);
// Instead of considering all contributions from a pixel i by calling:
// PushInterval(cost_manager, prev_cost + offset_cost, i, len);
// we optimize these contributions in case offset_cost stays the same
// for consecutive pixels. This describes a set of pixels similar to a
// previous set (e.g. constant color regions).
if (first_offset_is_constant) {
reach = i - 1 + len_prev - 1;
first_offset_is_constant = 0;
}
if (i + len - 1 > reach) {
// We can only be go further with the same offset if the previous
// length was maxed, hence len_prev == len == MAX_LENGTH.
// TODO(vrabaud), bump i to the end right away (insert cache and
// update cost).
// TODO(vrabaud), check if one of the points in between does not have
// a lower cost.
// Already consider the pixel at "reach" to add intervals that are
// better than whatever we add.
int offset_j, len_j = 0;
int j;
assert(len == MAX_LENGTH || len == pix_count - i);
// Figure out the last consecutive pixel within [i, reach + 1] with
// the same offset.
for (j = i; j <= reach; ++j) {
VP8LHashChainFindCopy(hash_chain, j + 1, &offset_j, &len_j);
if (offset_j != offset) {
VP8LHashChainFindCopy(hash_chain, j, &offset_j, &len_j);
break;
}
}
// Update the cost at j - 1 and j.
UpdateCostAtIndex(cost_manager, j - 1, 0);
UpdateCostAtIndex(cost_manager, j, 0);
PushInterval(cost_manager, cost_manager->costs_[j - 1] + offset_cost,
j, len_j);
reach = j + len_j - 1;
}
}
}
UpdateCostAtIndex(cost_manager, i, 1);
offset_prev = offset;
len_prev = len;
}
ok = !refs->error_;
Error:
if (cc_init) VP8LColorCacheClear(&hashers);
CostManagerClear(cost_manager);
WebPSafeFree(cost_model);
WebPSafeFree(cost_manager);
return ok;
}
// We pack the path at the end of *dist_array and return
// a pointer to this part of the array. Example:
// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
static void TraceBackwards(uint16_t* const dist_array,
int dist_array_size,
uint16_t** const chosen_path,
int* const chosen_path_size) {
uint16_t* path = dist_array + dist_array_size;
uint16_t* cur = dist_array + dist_array_size - 1;
while (cur >= dist_array) {
const int k = *cur;
--path;
*path = k;
cur -= k;
}
*chosen_path = path;
*chosen_path_size = (int)(dist_array + dist_array_size - path);
}
static int BackwardReferencesHashChainFollowChosenPath(
const uint32_t* const argb, int cache_bits,
const uint16_t* const chosen_path, int chosen_path_size,
const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs) {
const int use_color_cache = (cache_bits > 0);
int ix;
int i = 0;
int ok = 0;
int cc_init = 0;
VP8LColorCache hashers;
if (use_color_cache) {
cc_init = VP8LColorCacheInit(&hashers, cache_bits);
if (!cc_init) goto Error;
}
VP8LClearBackwardRefs(refs);
for (ix = 0; ix < chosen_path_size; ++ix) {
const int len = chosen_path[ix];
if (len != 1) {
int k;
const int offset = VP8LHashChainFindOffset(hash_chain, i);
VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
if (use_color_cache) {
for (k = 0; k < len; ++k) {
VP8LColorCacheInsert(&hashers, argb[i + k]);
}
}
i += len;
} else {
PixOrCopy v;
const int idx =
use_color_cache ? VP8LColorCacheContains(&hashers, argb[i]) : -1;
if (idx >= 0) {
// use_color_cache is true and hashers contains argb[i]
// push pixel as a color cache index
v = PixOrCopyCreateCacheIdx(idx);
} else {
if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
v = PixOrCopyCreateLiteral(argb[i]);
}
VP8LBackwardRefsCursorAdd(refs, v);
++i;
}
}
ok = !refs->error_;
Error:
if (cc_init) VP8LColorCacheClear(&hashers);
return ok;
}
// Returns 1 on success.
extern int VP8LBackwardReferencesTraceBackwards(
int xsize, int ysize, const uint32_t* const argb, int cache_bits,
const VP8LHashChain* const hash_chain,
const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
int VP8LBackwardReferencesTraceBackwards(int xsize, int ysize,
const uint32_t* const argb,
int cache_bits,
const VP8LHashChain* const hash_chain,
const VP8LBackwardRefs* const refs_src,
VP8LBackwardRefs* const refs_dst) {
int ok = 0;
const int dist_array_size = xsize * ysize;
uint16_t* chosen_path = NULL;
int chosen_path_size = 0;
uint16_t* dist_array =
(uint16_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
if (dist_array == NULL) goto Error;
if (!BackwardReferencesHashChainDistanceOnly(
xsize, ysize, argb, cache_bits, hash_chain, refs_src, dist_array)) {
goto Error;
}
TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size);
if (!BackwardReferencesHashChainFollowChosenPath(
argb, cache_bits, chosen_path, chosen_path_size, hash_chain,
refs_dst)) {
goto Error;
}
ok = 1;
Error:
WebPSafeFree(dist_array);
return ok;
}

View file

@ -0,0 +1,943 @@
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Jyrki Alakuijala (jyrki@google.com)
//
#include <assert.h>
#include <math.h>
#include "src/enc/backward_references_enc.h"
#include "src/enc/histogram_enc.h"
#include "src/dsp/lossless.h"
#include "src/dsp/lossless_common.h"
#include "src/dsp/dsp.h"
#include "src/utils/color_cache_utils.h"
#include "src/utils/utils.h"
#define MIN_BLOCK_SIZE 256 // minimum block size for backward references
#define MAX_ENTROPY (1e30f)
// 1M window (4M bytes) minus 120 special codes for short distances.
#define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120)
// Minimum number of pixels for which it is cheaper to encode a
// distance + length instead of each pixel as a literal.
#define MIN_LENGTH 4
// -----------------------------------------------------------------------------
static const uint8_t plane_to_code_lut[128] = {
96, 73, 55, 39, 23, 13, 5, 1, 255, 255, 255, 255, 255, 255, 255, 255,
101, 78, 58, 42, 26, 16, 8, 2, 0, 3, 9, 17, 27, 43, 59, 79,
102, 86, 62, 46, 32, 20, 10, 6, 4, 7, 11, 21, 33, 47, 63, 87,
105, 90, 70, 52, 37, 28, 18, 14, 12, 15, 19, 29, 38, 53, 71, 91,
110, 99, 82, 66, 48, 35, 30, 24, 22, 25, 31, 36, 49, 67, 83, 100,
115, 108, 94, 76, 64, 50, 44, 40, 34, 41, 45, 51, 65, 77, 95, 109,
118, 113, 103, 92, 80, 68, 60, 56, 54, 57, 61, 69, 81, 93, 104, 114,
119, 116, 111, 106, 97, 88, 84, 74, 72, 75, 85, 89, 98, 107, 112, 117
};
extern int VP8LDistanceToPlaneCode(int xsize, int dist);
int VP8LDistanceToPlaneCode(int xsize, int dist) {
const int yoffset = dist / xsize;
const int xoffset = dist - yoffset * xsize;
if (xoffset <= 8 && yoffset < 8) {
return plane_to_code_lut[yoffset * 16 + 8 - xoffset] + 1;
} else if (xoffset > xsize - 8 && yoffset < 7) {
return plane_to_code_lut[(yoffset + 1) * 16 + 8 + (xsize - xoffset)] + 1;
}
return dist + 120;
}
// Returns the exact index where array1 and array2 are different. For an index
// inferior or equal to best_len_match, the return value just has to be strictly
// inferior to best_len_match. The current behavior is to return 0 if this index
// is best_len_match, and the index itself otherwise.
// If no two elements are the same, it returns max_limit.
static WEBP_INLINE int FindMatchLength(const uint32_t* const array1,
const uint32_t* const array2,
int best_len_match, int max_limit) {
// Before 'expensive' linear match, check if the two arrays match at the
// current best length index.
if (array1[best_len_match] != array2[best_len_match]) return 0;
return VP8LVectorMismatch(array1, array2, max_limit);
}
// -----------------------------------------------------------------------------
// VP8LBackwardRefs
struct PixOrCopyBlock {
PixOrCopyBlock* next_; // next block (or NULL)
PixOrCopy* start_; // data start
int size_; // currently used size
};
extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs) {
assert(refs != NULL);
if (refs->tail_ != NULL) {
*refs->tail_ = refs->free_blocks_; // recycle all blocks at once
}
refs->free_blocks_ = refs->refs_;
refs->tail_ = &refs->refs_;
refs->last_block_ = NULL;
refs->refs_ = NULL;
}
void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs) {
assert(refs != NULL);
VP8LClearBackwardRefs(refs);
while (refs->free_blocks_ != NULL) {
PixOrCopyBlock* const next = refs->free_blocks_->next_;
WebPSafeFree(refs->free_blocks_);
refs->free_blocks_ = next;
}
}
void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size) {
assert(refs != NULL);
memset(refs, 0, sizeof(*refs));
refs->tail_ = &refs->refs_;
refs->block_size_ =
(block_size < MIN_BLOCK_SIZE) ? MIN_BLOCK_SIZE : block_size;
}
VP8LRefsCursor VP8LRefsCursorInit(const VP8LBackwardRefs* const refs) {
VP8LRefsCursor c;
c.cur_block_ = refs->refs_;
if (refs->refs_ != NULL) {
c.cur_pos = c.cur_block_->start_;
c.last_pos_ = c.cur_pos + c.cur_block_->size_;
} else {
c.cur_pos = NULL;
c.last_pos_ = NULL;
}
return c;
}
void VP8LRefsCursorNextBlock(VP8LRefsCursor* const c) {
PixOrCopyBlock* const b = c->cur_block_->next_;
c->cur_pos = (b == NULL) ? NULL : b->start_;
c->last_pos_ = (b == NULL) ? NULL : b->start_ + b->size_;
c->cur_block_ = b;
}
// Create a new block, either from the free list or allocated
static PixOrCopyBlock* BackwardRefsNewBlock(VP8LBackwardRefs* const refs) {
PixOrCopyBlock* b = refs->free_blocks_;
if (b == NULL) { // allocate new memory chunk
const size_t total_size =
sizeof(*b) + refs->block_size_ * sizeof(*b->start_);
b = (PixOrCopyBlock*)WebPSafeMalloc(1ULL, total_size);
if (b == NULL) {
refs->error_ |= 1;
return NULL;
}
b->start_ = (PixOrCopy*)((uint8_t*)b + sizeof(*b)); // not always aligned
} else { // recycle from free-list
refs->free_blocks_ = b->next_;
}
*refs->tail_ = b;
refs->tail_ = &b->next_;
refs->last_block_ = b;
b->next_ = NULL;
b->size_ = 0;
return b;
}
extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
const PixOrCopy v);
void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
const PixOrCopy v) {
PixOrCopyBlock* b = refs->last_block_;
if (b == NULL || b->size_ == refs->block_size_) {
b = BackwardRefsNewBlock(refs);
if (b == NULL) return; // refs->error_ is set
}
b->start_[b->size_++] = v;
}
// -----------------------------------------------------------------------------
// Hash chains
int VP8LHashChainInit(VP8LHashChain* const p, int size) {
assert(p->size_ == 0);
assert(p->offset_length_ == NULL);
assert(size > 0);
p->offset_length_ =
(uint32_t*)WebPSafeMalloc(size, sizeof(*p->offset_length_));
if (p->offset_length_ == NULL) return 0;
p->size_ = size;
return 1;
}
void VP8LHashChainClear(VP8LHashChain* const p) {
assert(p != NULL);
WebPSafeFree(p->offset_length_);
p->size_ = 0;
p->offset_length_ = NULL;
}
// -----------------------------------------------------------------------------
#define HASH_MULTIPLIER_HI (0xc6a4a793ULL)
#define HASH_MULTIPLIER_LO (0x5bd1e996ULL)
static WEBP_INLINE uint32_t GetPixPairHash64(const uint32_t* const argb) {
uint32_t key;
key = (argb[1] * HASH_MULTIPLIER_HI) & 0xffffffffu;
key += (argb[0] * HASH_MULTIPLIER_LO) & 0xffffffffu;
key = key >> (32 - HASH_BITS);
return key;
}
// Returns the maximum number of hash chain lookups to do for a
// given compression quality. Return value in range [8, 86].
static int GetMaxItersForQuality(int quality) {
return 8 + (quality * quality) / 128;
}
static int GetWindowSizeForHashChain(int quality, int xsize) {
const int max_window_size = (quality > 75) ? WINDOW_SIZE
: (quality > 50) ? (xsize << 8)
: (quality > 25) ? (xsize << 6)
: (xsize << 4);
assert(xsize > 0);
return (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE : max_window_size;
}
static WEBP_INLINE int MaxFindCopyLength(int len) {
return (len < MAX_LENGTH) ? len : MAX_LENGTH;
}
int VP8LHashChainFill(VP8LHashChain* const p, int quality,
const uint32_t* const argb, int xsize, int ysize,
int low_effort) {
const int size = xsize * ysize;
const int iter_max = GetMaxItersForQuality(quality);
const uint32_t window_size = GetWindowSizeForHashChain(quality, xsize);
int pos;
int argb_comp;
uint32_t base_position;
int32_t* hash_to_first_index;
// Temporarily use the p->offset_length_ as a hash chain.
int32_t* chain = (int32_t*)p->offset_length_;
assert(size > 0);
assert(p->size_ != 0);
assert(p->offset_length_ != NULL);
if (size <= 2) {
p->offset_length_[0] = p->offset_length_[size - 1] = 0;
return 1;
}
hash_to_first_index =
(int32_t*)WebPSafeMalloc(HASH_SIZE, sizeof(*hash_to_first_index));
if (hash_to_first_index == NULL) return 0;
// Set the int32_t array to -1.
memset(hash_to_first_index, 0xff, HASH_SIZE * sizeof(*hash_to_first_index));
// Fill the chain linking pixels with the same hash.
argb_comp = (argb[0] == argb[1]);
for (pos = 0; pos < size - 2;) {
uint32_t hash_code;
const int argb_comp_next = (argb[pos + 1] == argb[pos + 2]);
if (argb_comp && argb_comp_next) {
// Consecutive pixels with the same color will share the same hash.
// We therefore use a different hash: the color and its repetition
// length.
uint32_t tmp[2];
uint32_t len = 1;
tmp[0] = argb[pos];
// Figure out how far the pixels are the same.
// The last pixel has a different 64 bit hash, as its next pixel does
// not have the same color, so we just need to get to the last pixel equal
// to its follower.
while (pos + (int)len + 2 < size && argb[pos + len + 2] == argb[pos]) {
++len;
}
if (len > MAX_LENGTH) {
// Skip the pixels that match for distance=1 and length>MAX_LENGTH
// because they are linked to their predecessor and we automatically
// check that in the main for loop below. Skipping means setting no
// predecessor in the chain, hence -1.
memset(chain + pos, 0xff, (len - MAX_LENGTH) * sizeof(*chain));
pos += len - MAX_LENGTH;
len = MAX_LENGTH;
}
// Process the rest of the hash chain.
while (len) {
tmp[1] = len--;
hash_code = GetPixPairHash64(tmp);
chain[pos] = hash_to_first_index[hash_code];
hash_to_first_index[hash_code] = pos++;
}
argb_comp = 0;
} else {
// Just move one pixel forward.
hash_code = GetPixPairHash64(argb + pos);
chain[pos] = hash_to_first_index[hash_code];
hash_to_first_index[hash_code] = pos++;
argb_comp = argb_comp_next;
}
}
// Process the penultimate pixel.
chain[pos] = hash_to_first_index[GetPixPairHash64(argb + pos)];
WebPSafeFree(hash_to_first_index);
// Find the best match interval at each pixel, defined by an offset to the
// pixel and a length. The right-most pixel cannot match anything to the right
// (hence a best length of 0) and the left-most pixel nothing to the left
// (hence an offset of 0).
assert(size > 2);
p->offset_length_[0] = p->offset_length_[size - 1] = 0;
for (base_position = size - 2; base_position > 0;) {
const int max_len = MaxFindCopyLength(size - 1 - base_position);
const uint32_t* const argb_start = argb + base_position;
int iter = iter_max;
int best_length = 0;
uint32_t best_distance = 0;
uint32_t best_argb;
const int min_pos =
(base_position > window_size) ? base_position - window_size : 0;
const int length_max = (max_len < 256) ? max_len : 256;
uint32_t max_base_position;
pos = chain[base_position];
if (!low_effort) {
int curr_length;
// Heuristic: use the comparison with the above line as an initialization.
if (base_position >= (uint32_t)xsize) {
curr_length = FindMatchLength(argb_start - xsize, argb_start,
best_length, max_len);
if (curr_length > best_length) {
best_length = curr_length;
best_distance = xsize;
}
--iter;
}
// Heuristic: compare to the previous pixel.
curr_length =
FindMatchLength(argb_start - 1, argb_start, best_length, max_len);
if (curr_length > best_length) {
best_length = curr_length;
best_distance = 1;
}
--iter;
// Skip the for loop if we already have the maximum.
if (best_length == MAX_LENGTH) pos = min_pos - 1;
}
best_argb = argb_start[best_length];
for (; pos >= min_pos && --iter; pos = chain[pos]) {
int curr_length;
assert(base_position > (uint32_t)pos);
if (argb[pos + best_length] != best_argb) continue;
curr_length = VP8LVectorMismatch(argb + pos, argb_start, max_len);
if (best_length < curr_length) {
best_length = curr_length;
best_distance = base_position - pos;
best_argb = argb_start[best_length];
// Stop if we have reached a good enough length.
if (best_length >= length_max) break;
}
}
// We have the best match but in case the two intervals continue matching
// to the left, we have the best matches for the left-extended pixels.
max_base_position = base_position;
while (1) {
assert(best_length <= MAX_LENGTH);
assert(best_distance <= WINDOW_SIZE);
p->offset_length_[base_position] =
(best_distance << MAX_LENGTH_BITS) | (uint32_t)best_length;
--base_position;
// Stop if we don't have a match or if we are out of bounds.
if (best_distance == 0 || base_position == 0) break;
// Stop if we cannot extend the matching intervals to the left.
if (base_position < best_distance ||
argb[base_position - best_distance] != argb[base_position]) {
break;
}
// Stop if we are matching at its limit because there could be a closer
// matching interval with the same maximum length. Then again, if the
// matching interval is as close as possible (best_distance == 1), we will
// never find anything better so let's continue.
if (best_length == MAX_LENGTH && best_distance != 1 &&
base_position + MAX_LENGTH < max_base_position) {
break;
}
if (best_length < MAX_LENGTH) {
++best_length;
max_base_position = base_position;
}
}
}
return 1;
}
static WEBP_INLINE void AddSingleLiteral(uint32_t pixel, int use_color_cache,
VP8LColorCache* const hashers,
VP8LBackwardRefs* const refs) {
PixOrCopy v;
if (use_color_cache) {
const uint32_t key = VP8LColorCacheGetIndex(hashers, pixel);
if (VP8LColorCacheLookup(hashers, key) == pixel) {
v = PixOrCopyCreateCacheIdx(key);
} else {
v = PixOrCopyCreateLiteral(pixel);
VP8LColorCacheSet(hashers, key, pixel);
}
} else {
v = PixOrCopyCreateLiteral(pixel);
}
VP8LBackwardRefsCursorAdd(refs, v);
}
static int BackwardReferencesRle(int xsize, int ysize,
const uint32_t* const argb,
int cache_bits, VP8LBackwardRefs* const refs) {
const int pix_count = xsize * ysize;
int i, k;
const int use_color_cache = (cache_bits > 0);
VP8LColorCache hashers;
if (use_color_cache && !VP8LColorCacheInit(&hashers, cache_bits)) {
return 0;
}
VP8LClearBackwardRefs(refs);
// Add first pixel as literal.
AddSingleLiteral(argb[0], use_color_cache, &hashers, refs);
i = 1;
while (i < pix_count) {
const int max_len = MaxFindCopyLength(pix_count - i);
const int rle_len = FindMatchLength(argb + i, argb + i - 1, 0, max_len);
const int prev_row_len = (i < xsize) ? 0 :
FindMatchLength(argb + i, argb + i - xsize, 0, max_len);
if (rle_len >= prev_row_len && rle_len >= MIN_LENGTH) {
VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, rle_len));
// We don't need to update the color cache here since it is always the
// same pixel being copied, and that does not change the color cache
// state.
i += rle_len;
} else if (prev_row_len >= MIN_LENGTH) {
VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(xsize, prev_row_len));
if (use_color_cache) {
for (k = 0; k < prev_row_len; ++k) {
VP8LColorCacheInsert(&hashers, argb[i + k]);
}
}
i += prev_row_len;
} else {
AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
i++;
}
}
if (use_color_cache) VP8LColorCacheClear(&hashers);
return !refs->error_;
}
static int BackwardReferencesLz77(int xsize, int ysize,
const uint32_t* const argb, int cache_bits,
const VP8LHashChain* const hash_chain,
VP8LBackwardRefs* const refs) {
int i;
int i_last_check = -1;
int ok = 0;
int cc_init = 0;
const int use_color_cache = (cache_bits > 0);
const int pix_count = xsize * ysize;
VP8LColorCache hashers;
if (use_color_cache) {
cc_init = VP8LColorCacheInit(&hashers, cache_bits);
if (!cc_init) goto Error;
}
VP8LClearBackwardRefs(refs);
for (i = 0; i < pix_count;) {
// Alternative#1: Code the pixels starting at 'i' using backward reference.
int offset = 0;
int len = 0;
int j;
VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
if (len >= MIN_LENGTH) {
const int len_ini = len;
int max_reach = 0;
const int j_max =
(i + len_ini >= pix_count) ? pix_count - 1 : i + len_ini;
// Only start from what we have not checked already.
i_last_check = (i > i_last_check) ? i : i_last_check;
// We know the best match for the current pixel but we try to find the
// best matches for the current pixel AND the next one combined.
// The naive method would use the intervals:
// [i,i+len) + [i+len, length of best match at i+len)
// while we check if we can use:
// [i,j) (where j<=i+len) + [j, length of best match at j)
for (j = i_last_check + 1; j <= j_max; ++j) {
const int len_j = VP8LHashChainFindLength(hash_chain, j);
const int reach =
j + (len_j >= MIN_LENGTH ? len_j : 1); // 1 for single literal.
if (reach > max_reach) {
len = j - i;
max_reach = reach;
if (max_reach >= pix_count) break;
}
}
} else {
len = 1;
}
// Go with literal or backward reference.
assert(len > 0);
if (len == 1) {
AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
} else {
VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
if (use_color_cache) {
for (j = i; j < i + len; ++j) VP8LColorCacheInsert(&hashers, argb[j]);
}
}
i += len;
}
ok = !refs->error_;
Error:
if (cc_init) VP8LColorCacheClear(&hashers);
return ok;
}
// Compute an LZ77 by forcing matches to happen within a given distance cost.
// We therefore limit the algorithm to the lowest 32 values in the PlaneCode
// definition.
#define WINDOW_OFFSETS_SIZE_MAX 32
static int BackwardReferencesLz77Box(int xsize, int ysize,
const uint32_t* const argb, int cache_bits,
const VP8LHashChain* const hash_chain_best,
VP8LHashChain* hash_chain,
VP8LBackwardRefs* const refs) {
int i;
const int pix_count = xsize * ysize;
uint16_t* counts;
int window_offsets[WINDOW_OFFSETS_SIZE_MAX] = {0};
int window_offsets_new[WINDOW_OFFSETS_SIZE_MAX] = {0};
int window_offsets_size = 0;
int window_offsets_new_size = 0;
uint16_t* const counts_ini =
(uint16_t*)WebPSafeMalloc(xsize * ysize, sizeof(*counts_ini));
int best_offset_prev = -1, best_length_prev = -1;
if (counts_ini == NULL) return 0;
// counts[i] counts how many times a pixel is repeated starting at position i.
i = pix_count - 2;
counts = counts_ini + i;
counts[1] = 1;
for (; i >= 0; --i, --counts) {
if (argb[i] == argb[i + 1]) {
// Max out the counts to MAX_LENGTH.
counts[0] = counts[1] + (counts[1] != MAX_LENGTH);
} else {
counts[0] = 1;
}
}
// Figure out the window offsets around a pixel. They are stored in a
// spiraling order around the pixel as defined by VP8LDistanceToPlaneCode.
{
int x, y;
for (y = 0; y <= 6; ++y) {
for (x = -6; x <= 6; ++x) {
const int offset = y * xsize + x;
int plane_code;
// Ignore offsets that bring us after the pixel.
if (offset <= 0) continue;
plane_code = VP8LDistanceToPlaneCode(xsize, offset) - 1;
if (plane_code >= WINDOW_OFFSETS_SIZE_MAX) continue;
window_offsets[plane_code] = offset;
}
}
// For narrow images, not all plane codes are reached, so remove those.
for (i = 0; i < WINDOW_OFFSETS_SIZE_MAX; ++i) {
if (window_offsets[i] == 0) continue;
window_offsets[window_offsets_size++] = window_offsets[i];
}
// Given a pixel P, find the offsets that reach pixels unreachable from P-1
// with any of the offsets in window_offsets[].
for (i = 0; i < window_offsets_size; ++i) {
int j;
int is_reachable = 0;
for (j = 0; j < window_offsets_size && !is_reachable; ++j) {
is_reachable |= (window_offsets[i] == window_offsets[j] + 1);
}
if (!is_reachable) {
window_offsets_new[window_offsets_new_size] = window_offsets[i];
++window_offsets_new_size;
}
}
}
hash_chain->offset_length_[0] = 0;
for (i = 1; i < pix_count; ++i) {
int ind;
int best_length = VP8LHashChainFindLength(hash_chain_best, i);
int best_offset;
int do_compute = 1;
if (best_length >= MAX_LENGTH) {
// Do not recompute the best match if we already have a maximal one in the
// window.
best_offset = VP8LHashChainFindOffset(hash_chain_best, i);
for (ind = 0; ind < window_offsets_size; ++ind) {
if (best_offset == window_offsets[ind]) {
do_compute = 0;
break;
}
}
}
if (do_compute) {
// Figure out if we should use the offset/length from the previous pixel
// as an initial guess and therefore only inspect the offsets in
// window_offsets_new[].
const int use_prev =
(best_length_prev > 1) && (best_length_prev < MAX_LENGTH);
const int num_ind =
use_prev ? window_offsets_new_size : window_offsets_size;
best_length = use_prev ? best_length_prev - 1 : 0;
best_offset = use_prev ? best_offset_prev : 0;
// Find the longest match in a window around the pixel.
for (ind = 0; ind < num_ind; ++ind) {
int curr_length = 0;
int j = i;
int j_offset =
use_prev ? i - window_offsets_new[ind] : i - window_offsets[ind];
if (j_offset < 0 || argb[j_offset] != argb[i]) continue;
// The longest match is the sum of how many times each pixel is
// repeated.
do {
const int counts_j_offset = counts_ini[j_offset];
const int counts_j = counts_ini[j];
if (counts_j_offset != counts_j) {
curr_length +=
(counts_j_offset < counts_j) ? counts_j_offset : counts_j;
break;
}
// The same color is repeated counts_pos times at j_offset and j.
curr_length += counts_j_offset;
j_offset += counts_j_offset;
j += counts_j_offset;
} while (curr_length <= MAX_LENGTH && j < pix_count &&
argb[j_offset] == argb[j]);
if (best_length < curr_length) {
best_offset =
use_prev ? window_offsets_new[ind] : window_offsets[ind];
if (curr_length >= MAX_LENGTH) {
best_length = MAX_LENGTH;
break;
} else {
best_length = curr_length;
}
}
}
}
assert(i + best_length <= pix_count);
assert(best_length <= MAX_LENGTH);
if (best_length <= MIN_LENGTH) {
hash_chain->offset_length_[i] = 0;
best_offset_prev = 0;
best_length_prev = 0;
} else {
hash_chain->offset_length_[i] =
(best_offset << MAX_LENGTH_BITS) | (uint32_t)best_length;
best_offset_prev = best_offset;
best_length_prev = best_length;
}
}
hash_chain->offset_length_[0] = 0;
WebPSafeFree(counts_ini);
return BackwardReferencesLz77(xsize, ysize, argb, cache_bits, hash_chain,
refs);
}
// -----------------------------------------------------------------------------
static void BackwardReferences2DLocality(int xsize,
const VP8LBackwardRefs* const refs) {
VP8LRefsCursor c = VP8LRefsCursorInit(refs);
while (VP8LRefsCursorOk(&c)) {
if (PixOrCopyIsCopy(c.cur_pos)) {
const int dist = c.cur_pos->argb_or_distance;
const int transformed_dist = VP8LDistanceToPlaneCode(xsize, dist);
c.cur_pos->argb_or_distance = transformed_dist;
}
VP8LRefsCursorNext(&c);
}
}
// Evaluate optimal cache bits for the local color cache.
// The input *best_cache_bits sets the maximum cache bits to use (passing 0
// implies disabling the local color cache). The local color cache is also
// disabled for the lower (<= 25) quality.
// Returns 0 in case of memory error.
static int CalculateBestCacheSize(const uint32_t* argb, int quality,
const VP8LBackwardRefs* const refs,
int* const best_cache_bits) {
int i;
const int cache_bits_max = (quality <= 25) ? 0 : *best_cache_bits;
double entropy_min = MAX_ENTROPY;
int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
VP8LRefsCursor c = VP8LRefsCursorInit(refs);
VP8LHistogram* histos[MAX_COLOR_CACHE_BITS + 1] = { NULL };
int ok = 0;
assert(cache_bits_max >= 0 && cache_bits_max <= MAX_COLOR_CACHE_BITS);
if (cache_bits_max == 0) {
*best_cache_bits = 0;
// Local color cache is disabled.
return 1;
}
// Allocate data.
for (i = 0; i <= cache_bits_max; ++i) {
histos[i] = VP8LAllocateHistogram(i);
if (histos[i] == NULL) goto Error;
if (i == 0) continue;
cc_init[i] = VP8LColorCacheInit(&hashers[i], i);
if (!cc_init[i]) goto Error;
}
// Find the cache_bits giving the lowest entropy. The search is done in a
// brute-force way as the function (entropy w.r.t cache_bits) can be
// anything in practice.
while (VP8LRefsCursorOk(&c)) {
const PixOrCopy* const v = c.cur_pos;
if (PixOrCopyIsLiteral(v)) {
const uint32_t pix = *argb++;
const uint32_t a = (pix >> 24) & 0xff;
const uint32_t r = (pix >> 16) & 0xff;
const uint32_t g = (pix >> 8) & 0xff;
const uint32_t b = (pix >> 0) & 0xff;
// The keys of the caches can be derived from the longest one.
int key = VP8LHashPix(pix, 32 - cache_bits_max);
// Do not use the color cache for cache_bits = 0.
++histos[0]->blue_[b];
++histos[0]->literal_[g];
++histos[0]->red_[r];
++histos[0]->alpha_[a];
// Deal with cache_bits > 0.
for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
if (VP8LColorCacheLookup(&hashers[i], key) == pix) {
++histos[i]->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key];
} else {
VP8LColorCacheSet(&hashers[i], key, pix);
++histos[i]->blue_[b];
++histos[i]->literal_[g];
++histos[i]->red_[r];
++histos[i]->alpha_[a];
}
}
} else {
// We should compute the contribution of the (distance,length)
// histograms but those are the same independently from the cache size.
// As those constant contributions are in the end added to the other
// histogram contributions, we can safely ignore them.
int len = PixOrCopyLength(v);
uint32_t argb_prev = *argb ^ 0xffffffffu;
// Update the color caches.
do {
if (*argb != argb_prev) {
// Efficiency: insert only if the color changes.
int key = VP8LHashPix(*argb, 32 - cache_bits_max);
for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
hashers[i].colors_[key] = *argb;
}
argb_prev = *argb;
}
argb++;
} while (--len != 0);
}
VP8LRefsCursorNext(&c);
}
for (i = 0; i <= cache_bits_max; ++i) {
const double entropy = VP8LHistogramEstimateBits(histos[i]);
if (i == 0 || entropy < entropy_min) {
entropy_min = entropy;
*best_cache_bits = i;
}
}
ok = 1;
Error:
for (i = 0; i <= cache_bits_max; ++i) {
if (cc_init[i]) VP8LColorCacheClear(&hashers[i]);
VP8LFreeHistogram(histos[i]);
}
return ok;
}
// Update (in-place) backward references for specified cache_bits.
static int BackwardRefsWithLocalCache(const uint32_t* const argb,
int cache_bits,
VP8LBackwardRefs* const refs) {
int pixel_index = 0;
VP8LColorCache hashers;
VP8LRefsCursor c = VP8LRefsCursorInit(refs);
if (!VP8LColorCacheInit(&hashers, cache_bits)) return 0;
while (VP8LRefsCursorOk(&c)) {
PixOrCopy* const v = c.cur_pos;
if (PixOrCopyIsLiteral(v)) {
const uint32_t argb_literal = v->argb_or_distance;
const int ix = VP8LColorCacheContains(&hashers, argb_literal);
if (ix >= 0) {
// hashers contains argb_literal
*v = PixOrCopyCreateCacheIdx(ix);
} else {
VP8LColorCacheInsert(&hashers, argb_literal);
}
++pixel_index;
} else {
// refs was created without local cache, so it can not have cache indexes.
int k;
assert(PixOrCopyIsCopy(v));
for (k = 0; k < v->len; ++k) {
VP8LColorCacheInsert(&hashers, argb[pixel_index++]);
}
}
VP8LRefsCursorNext(&c);
}
VP8LColorCacheClear(&hashers);
return 1;
}
static VP8LBackwardRefs* GetBackwardReferencesLowEffort(
int width, int height, const uint32_t* const argb,
int* const cache_bits, const VP8LHashChain* const hash_chain,
VP8LBackwardRefs* const refs_lz77) {
*cache_bits = 0;
if (!BackwardReferencesLz77(width, height, argb, 0, hash_chain, refs_lz77)) {
return NULL;
}
BackwardReferences2DLocality(width, refs_lz77);
return refs_lz77;
}
extern int VP8LBackwardReferencesTraceBackwards(
int xsize, int ysize, const uint32_t* const argb, int cache_bits,
const VP8LHashChain* const hash_chain,
const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
static VP8LBackwardRefs* GetBackwardReferences(
int width, int height, const uint32_t* const argb, int quality,
int lz77_types_to_try, int* const cache_bits,
const VP8LHashChain* const hash_chain, VP8LBackwardRefs* best,
VP8LBackwardRefs* worst) {
const int cache_bits_initial = *cache_bits;
double bit_cost_best = -1;
VP8LHistogram* histo = NULL;
int lz77_type, lz77_type_best = 0;
VP8LHashChain hash_chain_box;
memset(&hash_chain_box, 0, sizeof(hash_chain_box));
histo = VP8LAllocateHistogram(MAX_COLOR_CACHE_BITS);
if (histo == NULL) goto Error;
for (lz77_type = 1; lz77_types_to_try;
lz77_types_to_try &= ~lz77_type, lz77_type <<= 1) {
int res = 0;
double bit_cost;
int cache_bits_tmp = cache_bits_initial;
if ((lz77_types_to_try & lz77_type) == 0) continue;
switch (lz77_type) {
case kLZ77RLE:
res = BackwardReferencesRle(width, height, argb, 0, worst);
break;
case kLZ77Standard:
// Compute LZ77 with no cache (0 bits), as the ideal LZ77 with a color
// cache is not that different in practice.
res = BackwardReferencesLz77(width, height, argb, 0, hash_chain, worst);
break;
case kLZ77Box:
if (!VP8LHashChainInit(&hash_chain_box, width * height)) goto Error;
res = BackwardReferencesLz77Box(width, height, argb, 0, hash_chain,
&hash_chain_box, worst);
break;
default:
assert(0);
}
if (!res) goto Error;
// Next, try with a color cache and update the references.
if (!CalculateBestCacheSize(argb, quality, worst, &cache_bits_tmp)) {
goto Error;
}
if (cache_bits_tmp > 0) {
if (!BackwardRefsWithLocalCache(argb, cache_bits_tmp, worst)) {
goto Error;
}
}
// Keep the best backward references.
VP8LHistogramCreate(histo, worst, cache_bits_tmp);
bit_cost = VP8LHistogramEstimateBits(histo);
if (lz77_type_best == 0 || bit_cost < bit_cost_best) {
VP8LBackwardRefs* const tmp = worst;
worst = best;
best = tmp;
bit_cost_best = bit_cost;
*cache_bits = cache_bits_tmp;
lz77_type_best = lz77_type;
}
}
assert(lz77_type_best > 0);
// Improve on simple LZ77 but only for high quality (TraceBackwards is
// costly).
if ((lz77_type_best == kLZ77Standard || lz77_type_best == kLZ77Box) &&
quality >= 25) {
const VP8LHashChain* const hash_chain_tmp =
(lz77_type_best == kLZ77Standard) ? hash_chain : &hash_chain_box;
if (VP8LBackwardReferencesTraceBackwards(width, height, argb, *cache_bits,
hash_chain_tmp, best, worst)) {
double bit_cost_trace;
VP8LHistogramCreate(histo, worst, *cache_bits);
bit_cost_trace = VP8LHistogramEstimateBits(histo);
if (bit_cost_trace < bit_cost_best) best = worst;
}
}
BackwardReferences2DLocality(width, best);
Error:
VP8LHashChainClear(&hash_chain_box);
VP8LFreeHistogram(histo);
return best;
}
VP8LBackwardRefs* VP8LGetBackwardReferences(
int width, int height, const uint32_t* const argb, int quality,
int low_effort, int lz77_types_to_try, int* const cache_bits,
const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1,
VP8LBackwardRefs* const refs_tmp2) {
if (low_effort) {
return GetBackwardReferencesLowEffort(width, height, argb, cache_bits,
hash_chain, refs_tmp1);
} else {
return GetBackwardReferences(width, height, argb, quality,
lz77_types_to_try, cache_bits, hash_chain,
refs_tmp1, refs_tmp2);
}
}

View file

@ -10,13 +10,13 @@
// Author: Jyrki Alakuijala (jyrki@google.com)
//
#ifndef WEBP_ENC_BACKWARD_REFERENCES_H_
#define WEBP_ENC_BACKWARD_REFERENCES_H_
#ifndef WEBP_ENC_BACKWARD_REFERENCES_ENC_H_
#define WEBP_ENC_BACKWARD_REFERENCES_ENC_H_
#include <assert.h>
#include <stdlib.h>
#include "../webp/types.h"
#include "../webp/format_constants.h"
#include "src/webp/types.h"
#include "src/webp/format_constants.h"
#ifdef __cplusplus
extern "C" {
@ -91,11 +91,6 @@ static WEBP_INLINE uint32_t PixOrCopyLength(const PixOrCopy* const p) {
return p->len;
}
static WEBP_INLINE uint32_t PixOrCopyArgb(const PixOrCopy* const p) {
assert(p->mode == kLiteral);
return p->argb_or_distance;
}
static WEBP_INLINE uint32_t PixOrCopyCacheIdx(const PixOrCopy* const p) {
assert(p->mode == kCacheIdx);
assert(p->argb_or_distance < (1U << MAX_COLOR_CACHE_BITS));
@ -113,6 +108,16 @@ static WEBP_INLINE uint32_t PixOrCopyDistance(const PixOrCopy* const p) {
#define HASH_BITS 18
#define HASH_SIZE (1 << HASH_BITS)
// If you change this, you need MAX_LENGTH_BITS + WINDOW_SIZE_BITS <= 32 as it
// is used in VP8LHashChain.
#define MAX_LENGTH_BITS 12
#define WINDOW_SIZE_BITS 20
// We want the max value to be attainable and stored in MAX_LENGTH_BITS bits.
#define MAX_LENGTH ((1 << MAX_LENGTH_BITS) - 1)
#if MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32
#error "MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32"
#endif
typedef struct VP8LHashChain VP8LHashChain;
struct VP8LHashChain {
// The 20 most significant bits contain the offset at which the best match
@ -134,6 +139,24 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,
int low_effort);
void VP8LHashChainClear(VP8LHashChain* const p); // release memory
static WEBP_INLINE int VP8LHashChainFindOffset(const VP8LHashChain* const p,
const int base_position) {
return p->offset_length_[base_position] >> MAX_LENGTH_BITS;
}
static WEBP_INLINE int VP8LHashChainFindLength(const VP8LHashChain* const p,
const int base_position) {
return p->offset_length_[base_position] & ((1U << MAX_LENGTH_BITS) - 1);
}
static WEBP_INLINE void VP8LHashChainFindCopy(const VP8LHashChain* const p,
int base_position,
int* const offset_ptr,
int* const length_ptr) {
*offset_ptr = VP8LHashChainFindOffset(p, base_position);
*length_ptr = VP8LHashChainFindLength(p, base_position);
}
// -----------------------------------------------------------------------------
// VP8LBackwardRefs (block-based backward-references storage)
@ -158,9 +181,6 @@ struct VP8LBackwardRefs {
void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size);
// Release memory for backward references.
void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs);
// Copies the 'src' backward refs to the 'dst'. Returns 0 in case of error.
int VP8LBackwardRefsCopy(const VP8LBackwardRefs* const src,
VP8LBackwardRefs* const dst);
// Cursor for iterating on references content
typedef struct {
@ -189,6 +209,12 @@ static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) {
// -----------------------------------------------------------------------------
// Main entry points
enum VP8LLZ77Type {
kLZ77Standard = 1,
kLZ77RLE = 2,
kLZ77Box = 4
};
// Evaluates best possible backward references for specified quality.
// The input cache_bits to 'VP8LGetBackwardReferences' sets the maximum cache
// bits to use (passing 0 implies disabling the local color cache).
@ -197,11 +223,12 @@ static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) {
// refs[0] or refs[1].
VP8LBackwardRefs* VP8LGetBackwardReferences(
int width, int height, const uint32_t* const argb, int quality,
int low_effort, int* const cache_bits,
const VP8LHashChain* const hash_chain, VP8LBackwardRefs refs[2]);
int low_effort, int lz77_types_to_try, int* const cache_bits,
const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1,
VP8LBackwardRefs* const refs_tmp2);
#ifdef __cplusplus
}
#endif
#endif // WEBP_ENC_BACKWARD_REFERENCES_H_
#endif // WEBP_ENC_BACKWARD_REFERENCES_ENC_H_

View file

@ -12,10 +12,10 @@
// Author: Skal (pascal.massimino@gmail.com)
#ifdef HAVE_CONFIG_H
#include "../webp/config.h"
#include "src/webp/config.h"
#endif
#include "../webp/encode.h"
#include "src/webp/encode.h"
//------------------------------------------------------------------------------
// WebPConfig

View file

@ -11,7 +11,7 @@
//
// Author: Skal (pascal.massimino@gmail.com)
#include "./cost_enc.h"
#include "src/enc/cost_enc.h"
//------------------------------------------------------------------------------
// Level cost tables

Some files were not shown because too many files have changed in this diff Show more