Anthony Minessale
2019-09-11 15:53:30 +00:00
committed by Andrey Volk
parent 6175c55b2f
commit 1b1c66aae4
45 changed files with 3684 additions and 574 deletions

@@ -1154,6 +1154,48 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
}
#endif // HAS_ARGBTOYROW_AVX2
#ifdef HAS_ABGRTOYROW_AVX2
// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x40(%0),%%ymm2 \n"
"vmovdqu 0x60(%0),%%ymm3 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"lea 0x80(%0),%0 \n"
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
"vpsrlw $0x7,%%ymm0,%%ymm0 \n"
"vpsrlw $0x7,%%ymm2,%%ymm2 \n"
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
"vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
"vmovdqu %%ymm0,(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kABGRToY), // %3
"m"(kAddY16), // %4
"m"(kPermdARGBToY_AVX) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_ABGRTOYROW_AVX2
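For reference, the per-pixel math the AVX2 loop above vectorizes can be sketched in scalar C. The coefficients and the +16 luma offset are assumptions based on libyuv's usual BT.601 studio-range constants (kABGRToY and kAddY16 are not shown in this hunk), and the helper name is illustrative only:

// Scalar sketch of ABGR -> Y, assuming BT.601 coefficients and a +16 offset.
// In libyuv naming, ABGR means the bytes are laid out R, G, B, A in memory.
static void ABGRToYRow_Sketch(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t r = src_abgr[0];
    uint8_t g = src_abgr[1];
    uint8_t b = src_abgr[2];
    // 0x1080 folds in the +128 rounding term and the +16 luma offset.
    dst_y[x] = (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
    src_abgr += 4;  // Skip alpha.
  }
}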
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@@ -1328,6 +1370,69 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
}
#endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ABGRTOUVROW_AVX2
void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x40(%0),%%ymm2 \n"
"vmovdqu 0x60(%0),%%ymm3 \n"
"vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
"vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
"vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
"vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
"lea 0x80(%0),%0 \n"
"vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
"vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
"vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
"vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
"vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
"vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
"vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
"vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
"vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpsraw $0x8,%%ymm1,%%ymm1 \n"
"vpsraw $0x8,%%ymm0,%%ymm0 \n"
"vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpshufb %8,%%ymm0,%%ymm0 \n"
"vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_abgr0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"((intptr_t)(src_stride_abgr)), // %4
"m"(kAddUV128), // %5
"m"(kABGRToV), // %6
"m"(kABGRToU), // %7
"m"(kShufARGBToUV_AVX) // %8
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // HAS_ABGRTOUVROW_AVX2
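The UV path subsamples before converting: vpavgb averages the current row with the next row (via the stride operand), the vshufps/vpavgb pair averages horizontal pixel pairs, and the result is one U and one V per 2x2 block. A scalar sketch of that flow, with assumed BT.601 chroma coefficients (the real ones live in kABGRToU/kABGRToV) and an illustrative helper name:

// Scalar sketch of ABGR -> UV: average a 2x2 block, then convert.
static void ABGRToUVRow_Sketch(const uint8_t* src_abgr, int src_stride_abgr,
                               uint8_t* dst_u, uint8_t* dst_v, int width) {
  const uint8_t* next = src_abgr + src_stride_abgr;
  int x;
  for (x = 0; x < width; x += 2) {
    // ABGR bytes are R, G, B, A; average each channel over the 2x2 block.
    int r = (src_abgr[0] + src_abgr[4] + next[0] + next[4] + 2) >> 2;
    int g = (src_abgr[1] + src_abgr[5] + next[1] + next[5] + 2) >> 2;
    int b = (src_abgr[2] + src_abgr[6] + next[2] + next[6] + 2) >> 2;
    *dst_u++ = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    *dst_v++ = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
    src_abgr += 8;
    next += 8;
  }
}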
#ifdef HAS_ARGBTOUVJROW_AVX2
void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
int src_stride_argb,
@@ -5238,7 +5343,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
,
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
);
}
#endif // HAS_ARGBMULTIPLYROW_AVX2
@@ -6120,24 +6225,24 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
int width) {
asm volatile(
"sub %1,%2 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movq (%1),%%xmm2 \n"
"movq 0x00(%1,%2,1),%%xmm1 \n"
"add $0x8,%1 \n"
"punpcklbw %%xmm1,%%xmm2 \n"
"movdqu (%0),%%xmm0 \n"
"add $0x10,%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n"
"punpckhbw %%xmm2,%%xmm1 \n"
"movdqu %%xmm0,(%3) \n"
"movdqu %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
"1: \n"
"movq (%1),%%xmm2 \n"
"movq 0x00(%1,%2,1),%%xmm1 \n"
"add $0x8,%1 \n"
"punpcklbw %%xmm1,%%xmm2 \n"
"movdqu (%0),%%xmm0 \n"
"add $0x10,%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n"
"punpckhbw %%xmm2,%%xmm1 \n"
"movdqu %%xmm0,(%3) \n"
"movdqu %%xmm1,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6156,24 +6261,24 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y,
int width) {
asm volatile(
"sub %1,%2 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movq (%1),%%xmm2 \n"
"movq 0x00(%1,%2,1),%%xmm1 \n"
"add $0x8,%1 \n"
"punpcklbw %%xmm1,%%xmm2 \n"
"movdqu (%0),%%xmm0 \n"
"movdqa %%xmm2,%%xmm1 \n"
"add $0x10,%0 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n"
"movdqu %%xmm1,(%3) \n"
"movdqu %%xmm2,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
"1: \n"
"movq (%1),%%xmm2 \n"
"movq 0x00(%1,%2,1),%%xmm1 \n"
"add $0x8,%1 \n"
"punpcklbw %%xmm1,%%xmm2 \n"
"movdqu (%0),%%xmm0 \n"
"movdqa %%xmm2,%%xmm1 \n"
"add $0x10,%0 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n"
"movdqu %%xmm1,(%3) \n"
"movdqu %%xmm2,0x10(%3) \n"
"lea 0x20(%3),%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6192,27 +6297,27 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
int width) {
asm volatile(
"sub %1,%2 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"vpmovzxbw (%1),%%ymm1 \n"
"vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
"add $0x10,%1 \n"
"vpsllw $0x8,%%ymm2,%%ymm2 \n"
"vpor %%ymm1,%%ymm2,%%ymm2 \n"
"vmovdqu (%0),%%ymm0 \n"
"add $0x20,%0 \n"
"vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
"vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
"vextractf128 $0x0,%%ymm1,(%3) \n"
"vextractf128 $0x0,%%ymm2,0x10(%3) \n"
"vextractf128 $0x1,%%ymm1,0x20(%3) \n"
"vextractf128 $0x1,%%ymm2,0x30(%3) \n"
"lea 0x40(%3),%3 \n"
"sub $0x20,%4 \n"
"jg 1b \n"
"vzeroupper \n"
"1: \n"
"vpmovzxbw (%1),%%ymm1 \n"
"vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
"add $0x10,%1 \n"
"vpsllw $0x8,%%ymm2,%%ymm2 \n"
"vpor %%ymm1,%%ymm2,%%ymm2 \n"
"vmovdqu (%0),%%ymm0 \n"
"add $0x20,%0 \n"
"vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
"vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
"vextractf128 $0x0,%%ymm1,(%3) \n"
"vextractf128 $0x0,%%ymm2,0x10(%3) \n"
"vextractf128 $0x1,%%ymm1,0x20(%3) \n"
"vextractf128 $0x1,%%ymm2,0x30(%3) \n"
"lea 0x40(%3),%3 \n"
"sub $0x20,%4 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6231,27 +6336,27 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y,
int width) {
asm volatile(
"sub %1,%2 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"vpmovzxbw (%1),%%ymm1 \n"
"vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
"add $0x10,%1 \n"
"vpsllw $0x8,%%ymm2,%%ymm2 \n"
"vpor %%ymm1,%%ymm2,%%ymm2 \n"
"vmovdqu (%0),%%ymm0 \n"
"add $0x20,%0 \n"
"vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
"vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
"vextractf128 $0x0,%%ymm1,(%3) \n"
"vextractf128 $0x0,%%ymm2,0x10(%3) \n"
"vextractf128 $0x1,%%ymm1,0x20(%3) \n"
"vextractf128 $0x1,%%ymm2,0x30(%3) \n"
"lea 0x40(%3),%3 \n"
"sub $0x20,%4 \n"
"jg 1b \n"
"vzeroupper \n"
"1: \n"
"vpmovzxbw (%1),%%ymm1 \n"
"vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
"add $0x10,%1 \n"
"vpsllw $0x8,%%ymm2,%%ymm2 \n"
"vpor %%ymm1,%%ymm2,%%ymm2 \n"
"vmovdqu (%0),%%ymm0 \n"
"add $0x20,%0 \n"
"vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
"vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
"vextractf128 $0x0,%%ymm1,(%3) \n"
"vextractf128 $0x0,%%ymm2,0x10(%3) \n"
"vextractf128 $0x1,%%ymm1,0x20(%3) \n"
"vextractf128 $0x1,%%ymm2,0x30(%3) \n"
"lea 0x40(%3),%3 \n"
"sub $0x20,%4 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -6669,6 +6774,186 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
#ifdef HAS_NV21TOYUV24ROW_AVX2
// NV21ToYUV24Row_AVX2 constants
static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,
0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};
static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};
static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};
static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};
static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};
static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};
static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};
static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};
static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};
// Convert 32 NV21 pixels per loop to packed 24-bit YUV (3 bytes per pixel).
void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_yuv24,
int width) {
uint8_t* src_y_ptr;
uint64_t src_offset = 0;
uint64_t width64;
width64 = width;
src_y_ptr = (uint8_t*)src_y;
asm volatile(
"vmovdqu %5, %%ymm0 \n" // init blend value
"vmovdqu %6, %%ymm1 \n" // init blend value
"vmovdqu %7, %%ymm2 \n" // init blend value
// "sub $0x20, %3 \n" //sub 32 from width for final loop
LABELALIGN
"1: \n" // label 1
"vmovdqu (%0,%4), %%ymm3 \n" // src_y
"vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1
"vmovdqu (%1), %%ymm5 \n" // src_uv
"vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf
"vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for
// shuf
"vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for
// shuf
"vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf
"vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for
// shuf
"vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0
"vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0
"vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2
"vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1
"vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const
"vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results
"vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h
"vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results
"add $0x20, %4 \n" // add to src buffer
// ptr
"vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert
"vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert
"vmovdqu %%ymm4, (%2) \n" // store dst_yuv
"vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h
"add $0x60,%2 \n" // add to dst buffer
// ptr
// "cmp %3, %4 \n" //(width64 -
// 32 bytes) and src_offset
"sub $0x20,%3 \n" // 32 pixels per loop
"jg 1b \n"
"vzeroupper \n" // sse-avx2
// transistions
: "+r"(src_y), //%0
"+r"(src_vu), //%1
"+r"(dst_yuv24), //%2
"+r"(width64), //%3
"+r"(src_offset) //%4
: "m"(kBLEND0), //%5
"m"(kBLEND1), //%6
"m"(kBLEND2), //%7
"m"(kSHUF0), //%8
"m"(kSHUF1), //%9
"m"(kSHUF2), //%10
"m"(kSHUF3), //%11
"m"(kSHUF4), //%12
"m"(kSHUF5) //%13
: "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12",
"xmm13", "xmm14", "xmm15");
}
#endif // HAS_NV21TOYUV24ROW_AVX2
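A scalar sketch of what this routine produces: each output pixel is three bytes taken from the half-resolution VU plane and the full-resolution Y plane, with two adjacent pixels sharing one V/U pair. The exact output byte order is determined by the kSHUF/kBLEND tables above; the V, U, Y ordering shown below is an assumption taken from the NV21 chroma layout, and the helper name is illustrative:

// Scalar sketch of NV21 -> packed 24-bit YUV (byte order assumed to be V, U, Y).
static void NV21ToYUV24Row_Sketch(const uint8_t* src_y, const uint8_t* src_vu,
                                  uint8_t* dst_yuv24, int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    dst_yuv24[0] = src_vu[0];  // V
    dst_yuv24[1] = src_vu[1];  // U
    dst_yuv24[2] = src_y[0];   // Y0
    dst_yuv24[3] = src_vu[0];  // V (shared by both pixels)
    dst_yuv24[4] = src_vu[1];  // U
    dst_yuv24[5] = src_y[1];   // Y1
    src_y += 2;
    src_vu += 2;
    dst_yuv24 += 6;  // Advance two pixels.
  }
}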
#ifdef HAS_SWAPUVROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
// Convert UV plane of NV12 to VU of NV21.
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
asm volatile(
"movdqu %3,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"lea 0x20(%0),%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
"movdqu %%xmm0,(%1) \n"
"movdqu %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
"+r"(width) // %2
: "m"(kShuffleUVToVU) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif // HAS_SWAPUVROW_SSSE3
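The kShuffleUVToVU mask simply swaps each adjacent byte pair, which in scalar form is just the following (helper name is illustrative only):

// Scalar sketch of the UV -> VU swap performed by the pshufb above.
static void SwapUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t u = src_uv[0];
    uint8_t v = src_uv[1];
    dst_vu[0] = v;
    dst_vu[1] = u;
    src_uv += 2;
    dst_vu += 2;
  }
}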
#ifdef HAS_SWAPUVROW_AVX2
void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"lea 0x40(%0),%0 \n"
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
"vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm0,(%1) \n"
"vmovdqu %%ymm1,0x20(%1) \n"
"lea 0x40(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
"+r"(width) // %2
: "m"(kShuffleUVToVU) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif // HAS_SWAPUVROW_AVX2
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus