/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"

#include "vpx_dsp/ppc/types_vsx.h"

uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride) {
  int distortion;

  const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride));
  const int16x8_t a1 =
      unpack_to_s16_h(read4x2(src_ptr + src_stride * 2, src_stride));
  const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride));
  const int16x8_t b1 =
      unpack_to_s16_h(read4x2(ref_ptr + ref_stride * 2, ref_stride));
  const int16x8_t d0 = vec_sub(a0, b0);
  const int16x8_t d1 = vec_sub(a1, b1);
  const int32x4_t ds = vec_msum(d1, d1, vec_msum(d0, d0, vec_splat_s32(0)));
  const int32x4_t d = vec_splat(vec_sums(ds, vec_splat_s32(0)), 3);

  vec_ste(d, 0, &distortion);

  return distortion;
}
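
/* For reference, a scalar sketch of what the vector code above computes
 * (illustrative only, not part of the build; the helper name is made up):
 * the 4x4 block of src - ref differences is formed and the squares of the
 * differences are accumulated.
 *
 *   uint32_t get4x4sse_cs_ref(const uint8_t *src_ptr, int src_stride,
 *                             const uint8_t *ref_ptr, int ref_stride) {
 *     int distortion = 0;
 *     int r, c;
 *     for (r = 0; r < 4; ++r) {
 *       for (c = 0; c < 4; ++c) {
 *         const int diff = src_ptr[c] - ref_ptr[c];
 *         distortion += diff * diff;  // accumulate squared difference
 *       }
 *       src_ptr += src_stride;
 *       ref_ptr += ref_stride;
 *     }
 *     return distortion;
 *   }
 */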

// TODO(lu_zero): Unroll
uint32_t vpx_get_mb_ss_vsx(const int16_t *src_ptr) {
  unsigned int i, sum = 0;
  int32x4_t s = vec_splat_s32(0);

  for (i = 0; i < 256; i += 8) {
    const int16x8_t v = vec_vsx_ld(0, src_ptr + i);
    s = vec_msum(v, v, s);
  }

  s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3);

  vec_ste((uint32x4_t)s, 0, &sum);

  return sum;
}
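
/* Scalar sketch of the sum of squares above (illustrative only, not part of
 * the build): the input is treated as a flat array of 256 int16_t values,
 * i.e. one 16x16 block.
 *
 *   uint32_t get_mb_ss_ref(const int16_t *src_ptr) {
 *     unsigned int i, sum = 0;
 *     for (i = 0; i < 256; ++i) sum += src_ptr[i] * src_ptr[i];
 *     return sum;
 *   }
 */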

void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width,
                           int height, const uint8_t *ref, int ref_stride) {
  int i, j;
  /* comp_pred and pred must be 16 byte aligned. */
  assert(((intptr_t)comp_pred & 0xf) == 0);
  assert(((intptr_t)pred & 0xf) == 0);
  if (width >= 16) {
    for (i = 0; i < height; ++i) {
      for (j = 0; j < width; j += 16) {
        const uint8x16_t v = vec_avg(vec_vsx_ld(j, pred), vec_vsx_ld(j, ref));
        vec_vsx_st(v, j, comp_pred);
      }
      comp_pred += width;
      pred += width;
      ref += ref_stride;
    }
  } else if (width == 8) {
    // Process two lines at a time.
    for (i = 0; i < height / 2; ++i) {
      const uint8x16_t r0 = vec_vsx_ld(0, ref);
      const uint8x16_t r1 = vec_vsx_ld(0, ref + ref_stride);
      const uint8x16_t r = xxpermdi(r0, r1, 0);
      const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r);
      vec_vsx_st(v, 0, comp_pred);
      comp_pred += 16;  // width * 2;
      pred += 16;       // width * 2;
      ref += ref_stride * 2;
    }
  } else {
    assert(width == 4);
    // Process four lines at a time.
    for (i = 0; i < height / 4; ++i) {
      const uint32x4_t r0 = (uint32x4_t)vec_vsx_ld(0, ref);
      const uint32x4_t r1 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride);
      const uint32x4_t r2 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 2);
      const uint32x4_t r3 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 3);
      const uint8x16_t r =
          (uint8x16_t)xxpermdi(vec_mergeh(r0, r1), vec_mergeh(r2, r3), 0);
      const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r);
      vec_vsx_st(v, 0, comp_pred);
      comp_pred += 16;  // width * 4;
      pred += 16;       // width * 4;
      ref += ref_stride * 4;
    }
  }
}
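
/* vec_avg() on unsigned chars is a rounding average, so each output pixel is
 * computed as in this scalar sketch (illustrative only):
 *
 *   comp_pred[j] = (pred[j] + ref[j] + 1) >> 1;
 */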

// Accumulates the per-pixel differences (sum) and squared differences
// (sum_squared) for 32 consecutive pixels of a single row.
static INLINE void variance_inner_32(const uint8_t *src_ptr,
                                     const uint8_t *ref_ptr,
                                     int32x4_t *sum_squared, int32x4_t *sum) {
  int32x4_t s = *sum;
  int32x4_t ss = *sum_squared;

  const uint8x16_t va0 = vec_vsx_ld(0, src_ptr);
  const uint8x16_t vb0 = vec_vsx_ld(0, ref_ptr);
  const uint8x16_t va1 = vec_vsx_ld(16, src_ptr);
  const uint8x16_t vb1 = vec_vsx_ld(16, ref_ptr);

  const int16x8_t a0 = unpack_to_s16_h(va0);
  const int16x8_t b0 = unpack_to_s16_h(vb0);
  const int16x8_t a1 = unpack_to_s16_l(va0);
  const int16x8_t b1 = unpack_to_s16_l(vb0);
  const int16x8_t a2 = unpack_to_s16_h(va1);
  const int16x8_t b2 = unpack_to_s16_h(vb1);
  const int16x8_t a3 = unpack_to_s16_l(va1);
  const int16x8_t b3 = unpack_to_s16_l(vb1);
  const int16x8_t d0 = vec_sub(a0, b0);
  const int16x8_t d1 = vec_sub(a1, b1);
  const int16x8_t d2 = vec_sub(a2, b2);
  const int16x8_t d3 = vec_sub(a3, b3);

  s = vec_sum4s(d0, s);
  ss = vec_msum(d0, d0, ss);
  s = vec_sum4s(d1, s);
  ss = vec_msum(d1, d1, ss);
  s = vec_sum4s(d2, s);
  ss = vec_msum(d2, d2, ss);
  s = vec_sum4s(d3, s);
  ss = vec_msum(d3, d3, ss);
  *sum = s;
  *sum_squared = ss;
}

static INLINE void variance(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *ref_ptr, int ref_stride, int w,
                            int h, uint32_t *sse, int *sum) {
  int i;

  int32x4_t s = vec_splat_s32(0);
  int32x4_t ss = vec_splat_s32(0);

  switch (w) {
    case 4:
      for (i = 0; i < h / 2; ++i) {
        const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride));
        const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride));
        const int16x8_t d = vec_sub(a0, b0);
        s = vec_sum4s(d, s);
        ss = vec_msum(d, d, ss);
        src_ptr += src_stride * 2;
        ref_ptr += ref_stride * 2;
      }
      break;
    case 8:
      for (i = 0; i < h; ++i) {
        const int16x8_t a0 = unpack_to_s16_h(vec_vsx_ld(0, src_ptr));
        const int16x8_t b0 = unpack_to_s16_h(vec_vsx_ld(0, ref_ptr));
        const int16x8_t d = vec_sub(a0, b0);

        s = vec_sum4s(d, s);
        ss = vec_msum(d, d, ss);
        src_ptr += src_stride;
        ref_ptr += ref_stride;
      }
      break;
    case 16:
      for (i = 0; i < h; ++i) {
        const uint8x16_t va = vec_vsx_ld(0, src_ptr);
        const uint8x16_t vb = vec_vsx_ld(0, ref_ptr);
        const int16x8_t a0 = unpack_to_s16_h(va);
        const int16x8_t b0 = unpack_to_s16_h(vb);
        const int16x8_t a1 = unpack_to_s16_l(va);
        const int16x8_t b1 = unpack_to_s16_l(vb);
        const int16x8_t d0 = vec_sub(a0, b0);
        const int16x8_t d1 = vec_sub(a1, b1);

        s = vec_sum4s(d0, s);
        ss = vec_msum(d0, d0, ss);
        s = vec_sum4s(d1, s);
        ss = vec_msum(d1, d1, ss);

        src_ptr += src_stride;
        ref_ptr += ref_stride;
      }
      break;
    case 32:
      for (i = 0; i < h; ++i) {
        variance_inner_32(src_ptr, ref_ptr, &ss, &s);
        src_ptr += src_stride;
        ref_ptr += ref_stride;
      }
      break;
    case 64:
      for (i = 0; i < h; ++i) {
        variance_inner_32(src_ptr, ref_ptr, &ss, &s);
        variance_inner_32(src_ptr + 32, ref_ptr + 32, &ss, &s);

        src_ptr += src_stride;
        ref_ptr += ref_stride;
      }
      break;
  }

  s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3);

  vec_ste(s, 0, sum);

  ss = vec_splat(vec_sums(ss, vec_splat_s32(0)), 3);

  vec_ste((uint32x4_t)ss, 0, sse);
}
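
/* Scalar sketch of variance() above (illustrative only, not part of the
 * build): accumulate the sum of differences and the sum of squared
 * differences over a w x h block.
 *
 *   static void variance_ref(const uint8_t *src_ptr, int src_stride,
 *                            const uint8_t *ref_ptr, int ref_stride, int w,
 *                            int h, uint32_t *sse, int *sum) {
 *     int i, j;
 *     *sum = 0;
 *     *sse = 0;
 *     for (i = 0; i < h; ++i) {
 *       for (j = 0; j < w; ++j) {
 *         const int diff = src_ptr[j] - ref_ptr[j];
 *         *sum += diff;
 *         *sse += diff * diff;
 *       }
 *       src_ptr += src_stride;
 *       ref_ptr += ref_stride;
 *     }
 *   }
 */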

/* Identical to the variance call except it takes an additional parameter, sum,
 * and returns that value using pass-by-reference instead of returning
 * sse - sum^2 / (w * h).
 */
#define GET_VAR(W, H)                                                     \
  void vpx_get##W##x##H##var_vsx(const uint8_t *src_ptr, int src_stride,  \
                                 const uint8_t *ref_ptr, int ref_stride,  \
                                 uint32_t *sse, int *sum) {               \
    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum);   \
  }

/* Identical to the variance call except it does not calculate the
 * sse - sum^2 / (w * h) term and returns sse in addition to modifying the
 * passed-in variable.
 */
#define MSE(W, H)                                                         \
  uint32_t vpx_mse##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \
                                  const uint8_t *ref_ptr, int ref_stride, \
                                  uint32_t *sse) {                        \
    int sum;                                                              \
    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum);  \
    return *sse;                                                          \
  }

#define VAR(W, H)                                                              \
  uint32_t vpx_variance##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \
                                       const uint8_t *ref_ptr, int ref_stride, \
                                       uint32_t *sse) {                        \
    int sum;                                                                   \
    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum);       \
    return *sse - (uint32_t)(((int64_t)sum * sum) / ((W) * (H)));              \
}
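
/* Illustrative expansion: VAR(16, 16) defines vpx_variance16x16_vsx(), which
 * fills in sse and sum for the 16x16 block and returns the variance
 *   sse - (sum * sum) / (16 * 16)
 * with the sum * sum product computed in 64-bit arithmetic to avoid overflow.
 */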

#define VARIANCES(W, H) VAR(W, H)

VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)

GET_VAR(16, 16)
GET_VAR(8, 8)

MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)