#include <immintrin.h>

#include <cstddef>
#include <cstdint>

void yuyv2rgb_avx(unsigned char *YUV, unsigned char *RGB, int NumPixels);
#define SIMD_INLINE inline __attribute__((always_inline))
template <class T> SIMD_INLINE char GetChar(T value, size_t index) {
  return (reinterpret_cast<char *>(&value))[index];
}
#define SIMD_CHAR_AS_LONGLONG(a) (((int64_t)a) & 0xFF)

#define SIMD_SHORT_AS_LONGLONG(a) (((int64_t)a) & 0xFFFF)

#define SIMD_INT_AS_LONGLONG(a) (((int64_t)a) & 0xFFFFFFFF)

#define SIMD_LL_SET1_EPI8(a) \
  SIMD_CHAR_AS_LONGLONG(a) | (SIMD_CHAR_AS_LONGLONG(a) << 8) | \
      (SIMD_CHAR_AS_LONGLONG(a) << 16) | (SIMD_CHAR_AS_LONGLONG(a) << 24) | \
      (SIMD_CHAR_AS_LONGLONG(a) << 32) | (SIMD_CHAR_AS_LONGLONG(a) << 40) | \
      (SIMD_CHAR_AS_LONGLONG(a) << 48) | (SIMD_CHAR_AS_LONGLONG(a) << 56)

#define SIMD_LL_SET2_EPI8(a, b) \
  SIMD_CHAR_AS_LONGLONG(a) | (SIMD_CHAR_AS_LONGLONG(b) << 8) | \
      (SIMD_CHAR_AS_LONGLONG(a) << 16) | (SIMD_CHAR_AS_LONGLONG(b) << 24) | \
      (SIMD_CHAR_AS_LONGLONG(a) << 32) | (SIMD_CHAR_AS_LONGLONG(b) << 40) | \
      (SIMD_CHAR_AS_LONGLONG(a) << 48) | (SIMD_CHAR_AS_LONGLONG(b) << 56)

#define SIMD_LL_SETR_EPI8(a, b, c, d, e, f, g, h) \
  SIMD_CHAR_AS_LONGLONG(a) | (SIMD_CHAR_AS_LONGLONG(b) << 8) | \
      (SIMD_CHAR_AS_LONGLONG(c) << 16) | (SIMD_CHAR_AS_LONGLONG(d) << 24) | \
      (SIMD_CHAR_AS_LONGLONG(e) << 32) | (SIMD_CHAR_AS_LONGLONG(f) << 40) | \
      (SIMD_CHAR_AS_LONGLONG(g) << 48) | (SIMD_CHAR_AS_LONGLONG(h) << 56)

#define SIMD_LL_SET1_EPI16(a) \
  SIMD_SHORT_AS_LONGLONG(a) | (SIMD_SHORT_AS_LONGLONG(a) << 16) | \
      (SIMD_SHORT_AS_LONGLONG(a) << 32) | (SIMD_SHORT_AS_LONGLONG(a) << 48)

#define SIMD_LL_SET2_EPI16(a, b) \
  SIMD_SHORT_AS_LONGLONG(a) | (SIMD_SHORT_AS_LONGLONG(b) << 16) | \
      (SIMD_SHORT_AS_LONGLONG(a) << 32) | (SIMD_SHORT_AS_LONGLONG(b) << 48)

#define SIMD_LL_SETR_EPI16(a, b, c, d) \
  SIMD_SHORT_AS_LONGLONG(a) | (SIMD_SHORT_AS_LONGLONG(b) << 16) | \
      (SIMD_SHORT_AS_LONGLONG(c) << 32) | (SIMD_SHORT_AS_LONGLONG(d) << 48)

#define SIMD_LL_SET1_EPI32(a) \
  SIMD_INT_AS_LONGLONG(a) | (SIMD_INT_AS_LONGLONG(a) << 32)

#define SIMD_LL_SET2_EPI32(a, b) \
  SIMD_INT_AS_LONGLONG(a) | (SIMD_INT_AS_LONGLONG(b) << 32)
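// Each SIMD_LL_* macro packs 8-, 16-, or 32-bit fields into one 64-bit lane
// at compile time, so __m256i constants can be built without runtime set
// intrinsics. A worked example:
//   SIMD_LL_SET2_EPI16(2, 3) == 0x0003000200030002
// i.e. the shorts 2, 3, 2, 3 laid out from the least significant end upward.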
#define SIMD_MM256_SET1_EPI8(a) \
  { \
    SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a), \
        SIMD_LL_SET1_EPI8(a) \
  }

#define SIMD_MM256_SET2_EPI8(a0, a1) \
  { \
    SIMD_LL_SET2_EPI8(a0, a1), SIMD_LL_SET2_EPI8(a0, a1), \
        SIMD_LL_SET2_EPI8(a0, a1), SIMD_LL_SET2_EPI8(a0, a1) \
  }

#define SIMD_MM256_SETR_EPI8(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, \
                             ac, ad, ae, af, b0, b1, b2, b3, b4, b5, b6, b7, \
                             b8, b9, ba, bb, bc, bd, be, bf) \
  { \
    SIMD_LL_SETR_EPI8(a0, a1, a2, a3, a4, a5, a6, a7), \
        SIMD_LL_SETR_EPI8(a8, a9, aa, ab, ac, ad, ae, af), \
        SIMD_LL_SETR_EPI8(b0, b1, b2, b3, b4, b5, b6, b7), \
        SIMD_LL_SETR_EPI8(b8, b9, ba, bb, bc, bd, be, bf) \
  }

#define SIMD_MM256_SET1_EPI16(a) \
  { \
    SIMD_LL_SET1_EPI16(a), SIMD_LL_SET1_EPI16(a), SIMD_LL_SET1_EPI16(a), \
        SIMD_LL_SET1_EPI16(a) \
  }

#define SIMD_MM256_SET2_EPI16(a0, a1) \
  { \
    SIMD_LL_SET2_EPI16(a0, a1), SIMD_LL_SET2_EPI16(a0, a1), \
        SIMD_LL_SET2_EPI16(a0, a1), SIMD_LL_SET2_EPI16(a0, a1) \
  }

#define SIMD_MM256_SETR_EPI16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, \
                              ac, ad, ae, af) \
  { \
    SIMD_LL_SETR_EPI16(a0, a1, a2, a3), SIMD_LL_SETR_EPI16(a4, a5, a6, a7), \
        SIMD_LL_SETR_EPI16(a8, a9, aa, ab), \
        SIMD_LL_SETR_EPI16(ac, ad, ae, af) \
  }

#define SIMD_MM256_SET1_EPI32(a) \
  { \
    SIMD_LL_SET1_EPI32(a), SIMD_LL_SET1_EPI32(a), SIMD_LL_SET1_EPI32(a), \
        SIMD_LL_SET1_EPI32(a) \
  }

#define SIMD_MM256_SET2_EPI32(a0, a1) \
  { \
    SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a0, a1), \
        SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a0, a1) \
  }

#define SIMD_MM256_SETR_EPI32(a0, a1, a2, a3, a4, a5, a6, a7) \
  { \
    SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a2, a3), \
        SIMD_LL_SET2_EPI32(a4, a5), SIMD_LL_SET2_EPI32(a6, a7) \
  }
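// The SIMD_MM256_* macros expand to brace initializers for __m256i (four
// 64-bit lanes), which GCC/Clang accept for static constants. For example,
//   const __m256i K8_01 = SIMD_MM256_SET1_EPI8(1);
// initializes all 32 bytes to 1 with no runtime cost. (K8_01 is only an
// illustrative name here, not one this file defines.)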
const size_t A = sizeof(__m256i);
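// ---------------------------------------------------------------------------
// The declarations that originally followed here (debug print helpers and the
// conversion constants) were lost from this copy. The block below is a
// reconstruction: the identifiers are the ones referenced by the surviving
// code, and the values follow the Simd library's fixed-point YUV/BGR
// constants, which this code closely mirrors. Treat it as an informed
// assumption rather than the verbatim original.
// ---------------------------------------------------------------------------

const __m256i K_ZERO = SIMD_MM256_SET1_EPI8(0);
const __m256i K16_0001 = SIMD_MM256_SET1_EPI16(0x0001);
const __m256i K16_00FF = SIMD_MM256_SET1_EPI16(0x00FF);

const __m256i K32_00000001 = SIMD_MM256_SET1_EPI32(0x00000001);
const __m256i K32_00000002 = SIMD_MM256_SET1_EPI32(0x00000002);
const __m256i K32_00000004 = SIMD_MM256_SET1_EPI32(0x00000004);
const __m256i K32_00000008 = SIMD_MM256_SET1_EPI32(0x00000008);
const __m256i K32_000000FF = SIMD_MM256_SET1_EPI32(0x000000FF);
const __m256i K32_0000FFFF = SIMD_MM256_SET1_EPI32(0x0000FFFF);
const __m256i K32_00010000 = SIMD_MM256_SET1_EPI32(0x00010000);
const __m256i K32_01000000 = SIMD_MM256_SET1_EPI32(0x01000000);
const __m256i K32_FFFFFF00 = SIMD_MM256_SET1_EPI32(0xFFFFFF00);

// BT.601 YUV -> RGB weights in Q13 fixed point:
//   R = 1.164*(Y-16) + 1.596*(V-128)
//   G = 1.164*(Y-16) - 0.391*(U-128) - 0.813*(V-128)
//   B = 1.164*(Y-16) + 2.018*(U-128)
const int Y_ADJUST = 16;
const int UV_ADJUST = 128;
const int YUV_TO_BGR_AVERAGING_SHIFT = 13;
const int YUV_TO_BGR_ROUND_TERM = 1 << (YUV_TO_BGR_AVERAGING_SHIFT - 1);
const int Y_TO_RGB_WEIGHT = int(1.164 * (1 << YUV_TO_BGR_AVERAGING_SHIFT) + 0.5);
const int U_TO_BLUE_WEIGHT = int(2.018 * (1 << YUV_TO_BGR_AVERAGING_SHIFT) + 0.5);
const int U_TO_GREEN_WEIGHT =
    -int(0.391 * (1 << YUV_TO_BGR_AVERAGING_SHIFT) + 0.5);
const int V_TO_GREEN_WEIGHT =
    -int(0.813 * (1 << YUV_TO_BGR_AVERAGING_SHIFT) + 0.5);
const int V_TO_RED_WEIGHT = int(1.596 * (1 << YUV_TO_BGR_AVERAGING_SHIFT) + 0.5);

const __m256i K16_Y_ADJUST = SIMD_MM256_SET1_EPI16(Y_ADJUST);
const __m256i K16_UV_ADJUST = SIMD_MM256_SET1_EPI16(UV_ADJUST);

// (weight, companion) pairs consumed by _mm256_madd_epi16 below.
const __m256i K16_YRGB_RT =
    SIMD_MM256_SET2_EPI16(Y_TO_RGB_WEIGHT, YUV_TO_BGR_ROUND_TERM);
const __m256i K16_VR_0 = SIMD_MM256_SET2_EPI16(V_TO_RED_WEIGHT, 0);
const __m256i K16_UG_VG =
    SIMD_MM256_SET2_EPI16(U_TO_GREEN_WEIGHT, V_TO_GREEN_WEIGHT);
const __m256i K16_UB_0 = SIMD_MM256_SET2_EPI16(U_TO_BLUE_WEIGHT, 0);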
// Shuffle tables that gather the blue bytes out of three consecutive 32-byte
// interleaved BGR blocks.
const __m256i K8_SHUFFLE_BGR0_TO_BLUE = SIMD_MM256_SETR_EPI8(
    0x0, 0x3, 0x6, 0x9, 0xC, 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, 0x2, 0x5, 0x8, 0xB, 0xE, -1, -1, -1, -1, -1);

const __m256i K8_SHUFFLE_BGR1_TO_BLUE = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, 0x0,
    0x3, 0x6, 0x9, 0xC, 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);

const __m256i K8_SHUFFLE_BGR2_TO_BLUE = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, 0x2, 0x5, 0x8, 0xB, 0xE, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD);
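// _mm256_shuffle_epi8 indexes bytes within each 128-bit lane independently,
// and any index with its top bit set (-1 here) writes a zero byte. That is
// why every table is expressed per lane, and why the gathered planes still
// need a cross-lane _mm256_permute2x128_si256 in BgrToBlue/Green/Red below.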
// NOTE: the identifiers of the following YUYV deinterleaving tables were lost
// from this copy; the names below are descriptive reconstructions.

// Gather the Y samples (even bytes of a YUYV block) into the low or high
// 8 bytes of each 128-bit lane.
const __m256i K8_SHUFFLE_YUYV_TO_Y_LO = SIMD_MM256_SETR_EPI8(
    0x0, 0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, -1, -1, -1, -1, -1, -1, -1, -1, 0x0,
    0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, -1, -1, -1, -1, -1, -1, -1, -1);

const __m256i K8_SHUFFLE_YUYV_TO_Y_HI = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, -1,
    -1, -1, -1, -1, -1, -1, -1, 0x0, 0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe);

// Gather the U samples (every fourth byte, starting at offset 1) into one of
// four 4-byte slots of each 128-bit lane.
const __m256i K8_SHUFFLE_YUYV_TO_U0 = SIMD_MM256_SETR_EPI8(
    0x1, 0x5, 0x9, 0xd, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1,
    0x5, 0x9, 0xd, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);

const __m256i K8_SHUFFLE_YUYV_TO_U1 = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, 0x1, 0x5, 0x9, 0xd, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, 0x1, 0x5, 0x9, 0xd, -1, -1, -1, -1, -1, -1, -1, -1);

const __m256i K8_SHUFFLE_YUYV_TO_U2 = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, -1, -1, 0x1, 0x5, 0x9, 0xd, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, 0x1, 0x5, 0x9, 0xd, -1, -1, -1, -1);

const __m256i K8_SHUFFLE_YUYV_TO_U3 = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, 0x5, 0x9, 0xd, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, 0x5, 0x9, 0xd);
// 32-bit permutation pattern (0, 4, 1, 5, 2, 6, 3, 7), presumably for use
// with _mm256_permutevar8x32_epi32 to interleave the two 128-bit halves; the
// original identifier was lost, so this name is a reconstruction.
const __m256i K32_PERMUTE_INTERLEAVE =
    SIMD_MM256_SETR_EPI8(0x0, 0x0, 0x0, 0x0, 0x4, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0,
                         0x0, 0x5, 0x0, 0x0, 0x0, 0x2, 0x0, 0x0, 0x0, 0x6, 0x0,
                         0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x7, 0x0, 0x0, 0x0);
// Gather the V samples (every fourth byte, starting at offset 3); names
// reconstructed as above.
const __m256i K8_SHUFFLE_YUYV_TO_V0 = SIMD_MM256_SETR_EPI8(
    0x3, 0x7, 0xb, 0xf, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x3,
    0x7, 0xb, 0xf, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);

const __m256i K8_SHUFFLE_YUYV_TO_V1 = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, 0x3, 0x7, 0xb, 0xf, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, 0x3, 0x7, 0xb, 0xf, -1, -1, -1, -1, -1, -1, -1, -1);

const __m256i K8_SHUFFLE_YUYV_TO_V2 = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, -1, -1, 0x3, 0x7, 0xb, 0xf, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, 0x3, 0x7, 0xb, 0xf, -1, -1, -1, -1);

const __m256i K8_SHUFFLE_YUYV_TO_V3 = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x3, 0x7, 0xb, 0xf, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x3, 0x7, 0xb, 0xf);
// Shuffle tables that scatter one color plane into its positions inside the
// three interleaved BGR output blocks; the inputs are pre-permuted with
// _mm256_permute4x64_epi64 (see InterleaveBgr below).
const __m256i K8_SHUFFLE_PERMUTED_BLUE_TO_BGR0 = SIMD_MM256_SETR_EPI8(
    0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1, 0x5, -1,
    -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, -1);

const __m256i K8_SHUFFLE_PERMUTED_BLUE_TO_BGR1 = SIMD_MM256_SETR_EPI8(
    -1, 0x3, -1, -1, 0x4, -1, -1, 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8,
    -1, -1, 0x9, -1, -1, 0xA, -1, -1, 0xB, -1, -1, 0xC, -1, -1, 0xD);

const __m256i K8_SHUFFLE_PERMUTED_BLUE_TO_BGR2 = SIMD_MM256_SETR_EPI8(
    -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, -1, -1,
    0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, -1, -1);

const __m256i K8_SHUFFLE_PERMUTED_GREEN_TO_BGR0 = SIMD_MM256_SETR_EPI8(
    -1, 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1, 0x5,
    -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA);

const __m256i K8_SHUFFLE_PERMUTED_GREEN_TO_BGR1 = SIMD_MM256_SETR_EPI8(
    -1, -1, 0x3, -1, -1, 0x4, -1, -1, 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1,
    0x8, -1, -1, 0x9, -1, -1, 0xA, -1, -1, 0xB, -1, -1, 0xC, -1, -1);

const __m256i K8_SHUFFLE_PERMUTED_GREEN_TO_BGR2 = SIMD_MM256_SETR_EPI8(
    0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, -1,
    -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, -1);

const __m256i K8_SHUFFLE_PERMUTED_RED_TO_BGR0 = SIMD_MM256_SETR_EPI8(
    -1, -1, 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1,
    0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1);

const __m256i K8_SHUFFLE_PERMUTED_RED_TO_BGR1 = SIMD_MM256_SETR_EPI8(
    0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1, 0x5, -1, -1, 0x6, -1, -1, 0x7, -1,
    -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, -1, -1, 0xB, -1, -1, 0xC, -1);

const __m256i K8_SHUFFLE_PERMUTED_RED_TO_BGR2 = SIMD_MM256_SETR_EPI8(
    -1, 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA,
    -1, -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF);
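// _mm256_shuffle_epi8 cannot move bytes across 128-bit lanes, so InterleaveBgr
// first replicates the relevant 64-bit quarters of each plane into both lanes
// with _mm256_permute4x64_epi64 (0x44 selects quarters 0,1,0,1; 0x99 selects
// 1,2,1,2; 0xEE selects 2,3,2,3) and then lets the tables above place the
// bytes within each lane.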
// Shuffle tables that gather the green and red bytes out of three consecutive
// interleaved BGR blocks.
const __m256i K8_SHUFFLE_BGR0_TO_GREEN = SIMD_MM256_SETR_EPI8(
    0x1, 0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF, -1, -1, -1, -1, -1);

const __m256i K8_SHUFFLE_BGR1_TO_GREEN = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x2, 0x5, 0x8, 0xB, 0xE, 0x1,
    0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);

const __m256i K8_SHUFFLE_BGR2_TO_GREEN = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x2, 0x5, 0x8, 0xB, 0xE);

const __m256i K8_SHUFFLE_BGR0_TO_RED = SIMD_MM256_SETR_EPI8(
    0x2, 0x5, 0x8, 0xB, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1);

const __m256i K8_SHUFFLE_BGR1_TO_RED = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF, 0x2,
    0x5, 0x8, 0xB, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);

const __m256i K8_SHUFFLE_BGR2_TO_RED = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF);
template <bool align> SIMD_INLINE __m256i Load(const __m256i *p);

template <> SIMD_INLINE __m256i Load<false>(const __m256i *p) {
  return _mm256_loadu_si256(p);
}

template <> SIMD_INLINE __m256i Load<true>(const __m256i *p) {
  return _mm256_load_si256(p);
}
SIMD_INLINE void *AlignLo(const void *ptr, size_t align) {
  // Round the pointer down to the nearest multiple of align (a power of two).
  return reinterpret_cast<void *>(((size_t)ptr) & ~(align - 1));
}

SIMD_INLINE bool Aligned(const void *ptr, size_t align = sizeof(__m256)) {
  return ptr == AlignLo(ptr, align);
}
template <bool align> SIMD_INLINE void Store(__m256i *p, __m256i a);

template <> SIMD_INLINE void Store<false>(__m256i *p, __m256i a) {
  _mm256_storeu_si256(p, a);
}

template <> SIMD_INLINE void Store<true>(__m256i *p, __m256i a) {
  _mm256_store_si256(p, a);
}
SIMD_INLINE __m256i SaturateI16ToU8(__m256i value) {
  // Clamp each signed 16-bit element into [0, 255].
  return _mm256_min_epi16(K16_00FF, _mm256_max_epi16(value, K_ZERO));
}
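// The definitions of AdjustY16 and AdjustUV16 did not survive in this copy;
// only their signatures are known. The bodies below are reconstructed after
// the Simd library's versions: subtract the Y/UV offsets with signed
// saturation before the weighted conversion.
SIMD_INLINE __m256i AdjustY16(__m256i y16) {
  return _mm256_subs_epi16(y16, K16_Y_ADJUST);
}

SIMD_INLINE __m256i AdjustUV16(__m256i uv16) {
  return _mm256_subs_epi16(uv16, K16_UV_ADJUST);
}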
SIMD_INLINE __m256i AdjustedYuvToRed32(__m256i y16_1, __m256i v16_0) {
  return _mm256_srai_epi32(
      _mm256_add_epi32(_mm256_madd_epi16(y16_1, K16_YRGB_RT),
                       _mm256_madd_epi16(v16_0, K16_VR_0)),
      YUV_TO_BGR_AVERAGING_SHIFT);
}
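// How the fixed-point math works out: y16_1 holds (y, 1) pairs and v16_0
// holds (v, 0) pairs, so the two madds above evaluate to
//   y*Y_TO_RGB_WEIGHT + 1*YUV_TO_BGR_ROUND_TERM   and   v*V_TO_RED_WEIGHT.
// With the reconstructed Q13 weights that is
//   R = (9535*y + 13074*v + 4096) >> 13, i.e. R ~= 1.164*y + 1.596*v,
// rounded to nearest.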
SIMD_INLINE __m256i AdjustedYuvToRed16(__m256i y16, __m256i v16) {
  // The lines interleaving y16 with ones were lost here; K16_0001 follows the
  // Simd library's pattern.
  return SaturateI16ToU8(_mm256_packs_epi32(
      AdjustedYuvToRed32(_mm256_unpacklo_epi16(y16, K16_0001),
                         _mm256_unpacklo_epi16(v16, K_ZERO)),
      AdjustedYuvToRed32(_mm256_unpackhi_epi16(y16, K16_0001),
                         _mm256_unpackhi_epi16(v16, K_ZERO))));
}
SIMD_INLINE __m256i AdjustedYuvToGreen32(__m256i y16_1, __m256i u16_v16) {
  // The second madd operand was lost here; K16_UG_VG (the U/V-to-green weight
  // pair) follows the Simd library's pattern.
  return _mm256_srai_epi32(
      _mm256_add_epi32(_mm256_madd_epi16(y16_1, K16_YRGB_RT),
                       _mm256_madd_epi16(u16_v16, K16_UG_VG)),
      YUV_TO_BGR_AVERAGING_SHIFT);
}

SIMD_INLINE __m256i AdjustedYuvToGreen16(__m256i y16, __m256i u16,
                                         __m256i v16) {
  return SaturateI16ToU8(_mm256_packs_epi32(
      AdjustedYuvToGreen32(_mm256_unpacklo_epi16(y16, K16_0001),
                           _mm256_unpacklo_epi16(u16, v16)),
      AdjustedYuvToGreen32(_mm256_unpackhi_epi16(y16, K16_0001),
                           _mm256_unpackhi_epi16(u16, v16))));
}
SIMD_INLINE __m256i AdjustedYuvToBlue32(__m256i y16_1, __m256i u16_0) {
  return _mm256_srai_epi32(
      _mm256_add_epi32(_mm256_madd_epi16(y16_1, K16_YRGB_RT),
                       _mm256_madd_epi16(u16_0, K16_UB_0)),
      YUV_TO_BGR_AVERAGING_SHIFT);
}

SIMD_INLINE __m256i AdjustedYuvToBlue16(__m256i y16, __m256i u16) {
  return SaturateI16ToU8(_mm256_packs_epi32(
      AdjustedYuvToBlue32(_mm256_unpacklo_epi16(y16, K16_0001),
                          _mm256_unpacklo_epi16(u16, K_ZERO)),
      AdjustedYuvToBlue32(_mm256_unpackhi_epi16(y16, K16_0001),
                          _mm256_unpackhi_epi16(u16, K_ZERO))));
}
// Only the final pack of each YuvTo* function survives in this copy; the
// bodies below are reconstructed to match the Simd library's equivalents:
// widen the bytes to 16 bits, remove the Y/UV offsets, convert each half,
// and repack to unsigned bytes.
SIMD_INLINE __m256i YuvToRed(__m256i y, __m256i v) {
  __m256i lo =
      AdjustedYuvToRed16(AdjustY16(_mm256_unpacklo_epi8(y, K_ZERO)),
                         AdjustUV16(_mm256_unpacklo_epi8(v, K_ZERO)));
  __m256i hi =
      AdjustedYuvToRed16(AdjustY16(_mm256_unpackhi_epi8(y, K_ZERO)),
                         AdjustUV16(_mm256_unpackhi_epi8(v, K_ZERO)));
  return _mm256_packus_epi16(lo, hi);
}

SIMD_INLINE __m256i YuvToGreen(__m256i y, __m256i u, __m256i v) {
  __m256i lo =
      AdjustedYuvToGreen16(AdjustY16(_mm256_unpacklo_epi8(y, K_ZERO)),
                           AdjustUV16(_mm256_unpacklo_epi8(u, K_ZERO)),
                           AdjustUV16(_mm256_unpacklo_epi8(v, K_ZERO)));
  __m256i hi =
      AdjustedYuvToGreen16(AdjustY16(_mm256_unpackhi_epi8(y, K_ZERO)),
                           AdjustUV16(_mm256_unpackhi_epi8(u, K_ZERO)),
                           AdjustUV16(_mm256_unpackhi_epi8(v, K_ZERO)));
  return _mm256_packus_epi16(lo, hi);
}

SIMD_INLINE __m256i YuvToBlue(__m256i y, __m256i u) {
  __m256i lo =
      AdjustedYuvToBlue16(AdjustY16(_mm256_unpacklo_epi8(y, K_ZERO)),
                          AdjustUV16(_mm256_unpacklo_epi8(u, K_ZERO)));
  __m256i hi =
      AdjustedYuvToBlue16(AdjustY16(_mm256_unpackhi_epi8(y, K_ZERO)),
                          AdjustUV16(_mm256_unpackhi_epi8(u, K_ZERO)));
  return _mm256_packus_epi16(lo, hi);
}
template <int index>
__m256i InterleaveBgr(__m256i blue, __m256i green, __m256i red);

template <>
SIMD_INLINE __m256i InterleaveBgr<0>(__m256i blue, __m256i green,
                                     __m256i red) {
  return _mm256_or_si256(
      _mm256_shuffle_epi8(_mm256_permute4x64_epi64(blue, 0x44),
                          K8_SHUFFLE_PERMUTED_BLUE_TO_BGR0),
      _mm256_or_si256(
          _mm256_shuffle_epi8(_mm256_permute4x64_epi64(green, 0x44),
                              K8_SHUFFLE_PERMUTED_GREEN_TO_BGR0),
          _mm256_shuffle_epi8(_mm256_permute4x64_epi64(red, 0x44),
                              K8_SHUFFLE_PERMUTED_RED_TO_BGR0)));
}

template <>
SIMD_INLINE __m256i InterleaveBgr<1>(__m256i blue, __m256i green,
                                     __m256i red) {
  return _mm256_or_si256(
      _mm256_shuffle_epi8(_mm256_permute4x64_epi64(blue, 0x99),
                          K8_SHUFFLE_PERMUTED_BLUE_TO_BGR1),
      _mm256_or_si256(
          _mm256_shuffle_epi8(_mm256_permute4x64_epi64(green, 0x99),
                              K8_SHUFFLE_PERMUTED_GREEN_TO_BGR1),
          _mm256_shuffle_epi8(_mm256_permute4x64_epi64(red, 0x99),
                              K8_SHUFFLE_PERMUTED_RED_TO_BGR1)));
}

template <>
SIMD_INLINE __m256i InterleaveBgr<2>(__m256i blue, __m256i green,
                                     __m256i red) {
  return _mm256_or_si256(
      _mm256_shuffle_epi8(_mm256_permute4x64_epi64(blue, 0xEE),
                          K8_SHUFFLE_PERMUTED_BLUE_TO_BGR2),
      _mm256_or_si256(
          _mm256_shuffle_epi8(_mm256_permute4x64_epi64(green, 0xEE),
                              K8_SHUFFLE_PERMUTED_GREEN_TO_BGR2),
          _mm256_shuffle_epi8(_mm256_permute4x64_epi64(red, 0xEE),
                              K8_SHUFFLE_PERMUTED_RED_TO_BGR2)));
}
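// Typical use, following the Simd library this interleaving comes from: given
// one register per color plane (32 pixels), three calls produce the 96
// interleaved B,G,R,B,G,R,... output bytes. With bgr a __m256i* into the
// output row (an illustrative name, not one defined by this file):
//   Store<false>(bgr + 0, InterleaveBgr<0>(blue, green, red));
//   Store<false>(bgr + 1, InterleaveBgr<1>(blue, green, red));
//   Store<false>(bgr + 2, InterleaveBgr<2>(blue, green, red));
// Swapping the blue and red arguments yields RGB order instead.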
// Inverse direction: gather one color plane back out of three interleaved BGR
// registers. The b0/b2 (g0/g2, r0/r2) shuffle lines were lost from this copy
// and are reconstructed after the Simd library's pattern.
SIMD_INLINE __m256i BgrToBlue(__m256i bgr[3]) {
  __m256i b0 = _mm256_shuffle_epi8(bgr[0], K8_SHUFFLE_BGR0_TO_BLUE);
  __m256i b2 = _mm256_shuffle_epi8(bgr[2], K8_SHUFFLE_BGR2_TO_BLUE);
  return _mm256_or_si256(
      _mm256_permute2x128_si256(b0, b2, 0x20),
      _mm256_or_si256(_mm256_shuffle_epi8(bgr[1], K8_SHUFFLE_BGR1_TO_BLUE),
                      _mm256_permute2x128_si256(b0, b2, 0x31)));
}

SIMD_INLINE __m256i BgrToGreen(__m256i bgr[3]) {
  __m256i g0 = _mm256_shuffle_epi8(bgr[0], K8_SHUFFLE_BGR0_TO_GREEN);
  __m256i g2 = _mm256_shuffle_epi8(bgr[2], K8_SHUFFLE_BGR2_TO_GREEN);
  return _mm256_or_si256(
      _mm256_permute2x128_si256(g0, g2, 0x20),
      _mm256_or_si256(_mm256_shuffle_epi8(bgr[1], K8_SHUFFLE_BGR1_TO_GREEN),
                      _mm256_permute2x128_si256(g0, g2, 0x31)));
}

SIMD_INLINE __m256i BgrToRed(__m256i bgr[3]) {
  __m256i r0 = _mm256_shuffle_epi8(bgr[0], K8_SHUFFLE_BGR0_TO_RED);
  __m256i r2 = _mm256_shuffle_epi8(bgr[2], K8_SHUFFLE_BGR2_TO_RED);
  return _mm256_or_si256(
      _mm256_permute2x128_si256(r0, r2, 0x20),
      _mm256_or_si256(_mm256_shuffle_epi8(bgr[1], K8_SHUFFLE_BGR1_TO_RED),
                      _mm256_permute2x128_si256(r0, r2, 0x31)));
}
template <bool align> SIMD_INLINE __m256i LoadPermuted(const __m256i *p) {
  // Load, then swap the middle 64-bit quarters (0xD8 = quarters 0,2,1,3),
  // undoing the lane split introduced by in-lane unpack/pack operations.
  return _mm256_permute4x64_epi64(Load<align>(p), 0xD8);
}
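// A minimal sketch of how yuyv2rgb_avx plausibly combines the helpers above;
// its loop body did not survive in this copy, so the structure (unrolling,
// tail handling, choice of aligned vs. unaligned paths) is an assumption:
//
//   void yuyv2rgb_avx(unsigned char *YUV, unsigned char *RGB, int NumPixels) {
//     for (int i = 0; i + 32 <= NumPixels; i += 32) {  // 32 pixels/iteration
//       // 1. Load 2 x 32 bytes of YUYV and deinterleave them with the
//       //    K8_SHUFFLE_YUYV_TO_* tables into y, u, v registers, duplicating
//       //    each chroma sample for its two pixels.
//       // 2. Convert with YuvToRed / YuvToGreen / YuvToBlue.
//       // 3. Interleave with InterleaveBgr<0..2> (red and blue swapped for
//       //    RGB order) and Store the three result registers.
//     }
//     // Remaining pixels would fall back to a scalar conversion.
//   }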