365 lines
13 KiB
C++
365 lines
13 KiB
C++
/*
|
|
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
|
|
*
|
|
* Use of this source code is governed by a BSD-style license
|
|
* that can be found in the LICENSE file in the root of the source
|
|
* tree. An additional intellectual property rights grant can be found
|
|
* in the file PATENTS. All contributing project authors may
|
|
* be found in the AUTHORS file in the root of the source tree.
|
|
*/
|
|
|
|
#include "../util/ssim.h" // NOLINT
|
|
|
|
#include <string.h>
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
typedef unsigned int uint32_t; // NOLINT
|
|
typedef unsigned short uint16_t; // NOLINT
|
|
|
|
#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
|
|
(defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
|
|
#define __SSE2__
|
|
#endif
|
|
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
|
|
#include <emmintrin.h>
|
|
#endif
|
|
|
|
#ifdef _OPENMP
|
|
#include <omp.h>
|
|
#endif
|
|
|
|
// SSIM
|
|
enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };
|
|
|
|
// Symmetric Gaussian kernel: K[i] = ~11 * exp(-0.3 * i * i)
|
|
// The maximum value (11 x 11) must be less than 128 to avoid sign
|
|
// problems during the calls to _mm_mullo_epi16().
|
|
static const int K[KERNEL_SIZE] = {
|
|
1, 3, 7, 11, 7, 3, 1 // ~11 * exp(-0.3 * i * i)
|
|
};
|
|
static const double kiW[KERNEL + 1 + 1] = {
|
|
1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j]
|
|
1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j]
|
|
1. / 1056., // 1 / sum(i:0..5, j..6) K[i]*K[j]
|
|
1. / 957., // 1 / sum(i:0..4, j..6) K[i]*K[j]
|
|
1. / 726., // 1 / sum(i:0..3, j..6) K[i]*K[j]
|
|
};
|
|
|
|
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
|
|
|
|
#define PWEIGHT(A, B) static_cast<uint16_t>(K[(A)] * K[(B)]) // weight product
|
|
#define MAKE_WEIGHT(L) \
|
|
{ \
|
|
{ \
|
|
{ \
|
|
PWEIGHT(L, 0) \
|
|
, PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), PWEIGHT(L, 4), \
|
|
PWEIGHT(L, 5), PWEIGHT(L, 6), 0 \
|
|
} \
|
|
} \
|
|
}
|
|
|
|
// We need this union trick to be able to initialize constant static __m128i
|
|
// values. We can't call _mm_set_epi16() for static compile-time initialization.
|
|
static const struct {
|
|
union {
|
|
uint16_t i16_[8];
|
|
__m128i m_;
|
|
} values_;
|
|
} W0 = MAKE_WEIGHT(0), W1 = MAKE_WEIGHT(1), W2 = MAKE_WEIGHT(2),
|
|
W3 = MAKE_WEIGHT(3);
|
|
// ... the rest is symmetric.
|
|
#undef MAKE_WEIGHT
|
|
#undef PWEIGHT
|
|
#endif
|
|
|
|
// Common final expression for SSIM, once the weighted sums are known.
|
|
static double FinalizeSSIM(double iw,
|
|
double xm,
|
|
double ym,
|
|
double xxm,
|
|
double xym,
|
|
double yym) {
|
|
const double iwx = xm * iw;
|
|
const double iwy = ym * iw;
|
|
double sxx = xxm * iw - iwx * iwx;
|
|
double syy = yym * iw - iwy * iwy;
|
|
// small errors are possible, due to rounding. Clamp to zero.
|
|
if (sxx < 0.) {
|
|
sxx = 0.;
|
|
}
|
|
if (syy < 0.) {
|
|
syy = 0.;
|
|
}
|
|
const double sxsy = sqrt(sxx * syy);
|
|
const double sxy = xym * iw - iwx * iwy;
|
|
static const double C11 = (0.01 * 0.01) * (255 * 255);
|
|
static const double C22 = (0.03 * 0.03) * (255 * 255);
|
|
static const double C33 = (0.015 * 0.015) * (255 * 255);
|
|
const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
|
|
const double c = (2. * sxsy + C22) / (sxx + syy + C22);
|
|
const double s = (sxy + C33) / (sxsy + C33);
|
|
return l * c * s;
|
|
}
|
|
|
|
// GetSSIM() does clipping. GetSSIMFullKernel() does not
|
|
|
|
// TODO(skal): use summed tables?
|
|
// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
|
|
// with a diff of 255, squared. The maximum error is thus 0x4388241,
|
|
// which fits into 32 bits integers.
|
|
double GetSSIM(const uint8_t* org,
|
|
const uint8_t* rec,
|
|
int xo,
|
|
int yo,
|
|
int W,
|
|
int H,
|
|
int stride) {
|
|
uint32_t ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
|
|
org += (yo - KERNEL) * stride;
|
|
org += (xo - KERNEL);
|
|
rec += (yo - KERNEL) * stride;
|
|
rec += (xo - KERNEL);
|
|
for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
|
|
if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) {
|
|
continue;
|
|
}
|
|
const int Wy = K[y_];
|
|
for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
|
|
const int Wxy = Wy * K[x_];
|
|
if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
|
|
const int org_x = org[x_];
|
|
const int rec_x = rec[x_];
|
|
ws += Wxy;
|
|
xm += Wxy * org_x;
|
|
ym += Wxy * rec_x;
|
|
xxm += Wxy * org_x * org_x;
|
|
xym += Wxy * org_x * rec_x;
|
|
yym += Wxy * rec_x * rec_x;
|
|
}
|
|
}
|
|
}
|
|
return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
|
|
}
|
|
|
|
double GetSSIMFullKernel(const uint8_t* org,
|
|
const uint8_t* rec,
|
|
int xo,
|
|
int yo,
|
|
int stride,
|
|
double area_weight) {
|
|
uint32_t xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
|
|
|
|
#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)
|
|
|
|
org += yo * stride + xo;
|
|
rec += yo * stride + xo;
|
|
for (int y = 1; y <= KERNEL; y++) {
|
|
const int dy1 = y * stride;
|
|
const int dy2 = y * stride;
|
|
const int Wy = K[KERNEL + y];
|
|
|
|
for (int x = 1; x <= KERNEL; x++) {
|
|
// Compute the contributions of upper-left (ul), upper-right (ur)
|
|
// lower-left (ll) and lower-right (lr) points (see the diagram below).
|
|
// Symmetric Kernel will have same weight on those points.
|
|
// - - - - - - -
|
|
// - ul - - - ur -
|
|
// - - - - - - -
|
|
// - - - 0 - - -
|
|
// - - - - - - -
|
|
// - ll - - - lr -
|
|
// - - - - - - -
|
|
const int Wxy = Wy * K[KERNEL + x];
|
|
const int ul1 = org[-dy1 - x];
|
|
const int ur1 = org[-dy1 + x];
|
|
const int ll1 = org[dy1 - x];
|
|
const int lr1 = org[dy1 + x];
|
|
|
|
const int ul2 = rec[-dy2 - x];
|
|
const int ur2 = rec[-dy2 + x];
|
|
const int ll2 = rec[dy2 - x];
|
|
const int lr2 = rec[dy2 + x];
|
|
|
|
xm += Wxy * (ul1 + ur1 + ll1 + lr1);
|
|
ym += Wxy * (ul2 + ur2 + ll2 + lr2);
|
|
xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
|
|
xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
|
|
yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
|
|
}
|
|
|
|
// Compute the contributions of up (u), down (d), left (l) and right (r)
|
|
// points across the main axes (see the diagram below).
|
|
// Symmetric Kernel will have same weight on those points.
|
|
// - - - - - - -
|
|
// - - - u - - -
|
|
// - - - - - - -
|
|
// - l - 0 - r -
|
|
// - - - - - - -
|
|
// - - - d - - -
|
|
// - - - - - - -
|
|
const int Wxy = Wy * K[KERNEL];
|
|
const int u1 = org[-dy1];
|
|
const int d1 = org[dy1];
|
|
const int l1 = org[-y];
|
|
const int r1 = org[y];
|
|
|
|
const int u2 = rec[-dy2];
|
|
const int d2 = rec[dy2];
|
|
const int l2 = rec[-y];
|
|
const int r2 = rec[y];
|
|
|
|
xm += Wxy * (u1 + d1 + l1 + r1);
|
|
ym += Wxy * (u2 + d2 + l2 + r2);
|
|
xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
|
|
xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
|
|
yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
|
|
}
|
|
|
|
// Lastly the contribution of (x0, y0) point.
|
|
const int Wxy = K[KERNEL] * K[KERNEL];
|
|
const int s1 = org[0];
|
|
const int s2 = rec[0];
|
|
|
|
xm += Wxy * s1;
|
|
ym += Wxy * s2;
|
|
xxm += Wxy * s1 * s1;
|
|
xym += Wxy * s1 * s2;
|
|
yym += Wxy * s2 * s2;
|
|
|
|
#else // __SSE2__
|
|
|
|
org += (yo - KERNEL) * stride + (xo - KERNEL);
|
|
rec += (yo - KERNEL) * stride + (xo - KERNEL);
|
|
|
|
const __m128i zero = _mm_setzero_si128();
|
|
__m128i x = zero;
|
|
__m128i y = zero;
|
|
__m128i xx = zero;
|
|
__m128i xy = zero;
|
|
__m128i yy = zero;
|
|
|
|
// Read 8 pixels at line #L, and convert to 16bit, perform weighting
|
|
// and acccumulate.
|
|
#define LOAD_LINE_PAIR(L, WEIGHT) \
|
|
do { \
|
|
const __m128i v0 = \
|
|
_mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L)*stride)); \
|
|
const __m128i v1 = \
|
|
_mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L)*stride)); \
|
|
const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \
|
|
const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \
|
|
const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \
|
|
const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \
|
|
x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \
|
|
y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \
|
|
x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \
|
|
y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \
|
|
xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \
|
|
xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \
|
|
yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \
|
|
} while (0)
|
|
|
|
#define ADD_AND_STORE_FOUR_EPI32(M, OUT) \
|
|
do { \
|
|
uint32_t tmp[4]; \
|
|
_mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
|
|
(OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \
|
|
} while (0)
|
|
|
|
LOAD_LINE_PAIR(0, W0);
|
|
LOAD_LINE_PAIR(1, W1);
|
|
LOAD_LINE_PAIR(2, W2);
|
|
LOAD_LINE_PAIR(3, W3);
|
|
LOAD_LINE_PAIR(4, W2);
|
|
LOAD_LINE_PAIR(5, W1);
|
|
LOAD_LINE_PAIR(6, W0);
|
|
|
|
ADD_AND_STORE_FOUR_EPI32(x, xm);
|
|
ADD_AND_STORE_FOUR_EPI32(y, ym);
|
|
ADD_AND_STORE_FOUR_EPI32(xx, xxm);
|
|
ADD_AND_STORE_FOUR_EPI32(xy, xym);
|
|
ADD_AND_STORE_FOUR_EPI32(yy, yym);
|
|
|
|
#undef LOAD_LINE_PAIR
|
|
#undef ADD_AND_STORE_FOUR_EPI32
|
|
#endif
|
|
|
|
return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
|
|
}
|
|
|
|
static int start_max(int x, int y) {
|
|
return (x > y) ? x : y;
|
|
}
|
|
|
|
double CalcSSIM(const uint8_t* org,
|
|
const uint8_t* rec,
|
|
const int image_width,
|
|
const int image_height) {
|
|
double SSIM = 0.;
|
|
const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
|
|
const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
|
|
const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
|
|
const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
|
|
const int stride = image_width;
|
|
|
|
for (int j = 0; j < KERNEL_Y; ++j) {
|
|
for (int i = 0; i < image_width; ++i) {
|
|
SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
|
|
}
|
|
}
|
|
|
|
#ifdef _OPENMP
|
|
#pragma omp parallel for reduction(+ : SSIM)
|
|
#endif
|
|
for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
|
|
for (int i = 0; i < KERNEL_X; ++i) {
|
|
SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
|
|
}
|
|
for (int i = KERNEL_X; i < start_x; ++i) {
|
|
SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
|
|
}
|
|
if (start_x < image_width) {
|
|
// GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we
|
|
// copy the 8 rightmost pixels on a cache area, and pad this area with
|
|
// zeros which won't contribute to the overall SSIM value (but we need
|
|
// to pass the correct normalizing constant!). By using this cache, we can
|
|
// still call GetSSIMFullKernel() instead of the slower GetSSIM().
|
|
// NOTE: we could use similar method for the left-most pixels too.
|
|
const int kScratchWidth = 8;
|
|
const int kScratchStride = kScratchWidth + KERNEL + 1;
|
|
uint8_t scratch_org[KERNEL_SIZE * kScratchStride] = {0};
|
|
uint8_t scratch_rec[KERNEL_SIZE * kScratchStride] = {0};
|
|
|
|
for (int k = 0; k < KERNEL_SIZE; ++k) {
|
|
const int offset =
|
|
(j - KERNEL + k) * stride + image_width - kScratchWidth;
|
|
memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
|
|
memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
|
|
}
|
|
for (int k = 0; k <= KERNEL_X + 1; ++k) {
|
|
SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, KERNEL + k, KERNEL,
|
|
kScratchStride, kiW[k]);
|
|
}
|
|
}
|
|
}
|
|
|
|
for (int j = start_y; j < image_height; ++j) {
|
|
for (int i = 0; i < image_width; ++i) {
|
|
SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
|
|
}
|
|
}
|
|
return SSIM;
|
|
}
|
|
|
|
double CalcLSSIM(double ssim) {
|
|
return -10.0 * log10(1.0 - ssim);
|
|
}
|
|
|
|
#ifdef __cplusplus
|
|
} // extern "C"
|
|
#endif
|