#define SIMD_MM256_SET1_EPI8(a) \
  { \
    SIMD_LL_SET1_EPI8(a) \
    , SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a), SIMD_LL_SET1_EPI8(a) \
  }

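// Each SIMD_MM256_* initializer macro expands to four SIMD_LL_* 64-bit lane
// initializers: SET1 repeats one value, SET2 repeats a two-element pattern,
// and SETR lists every element in low-to-high (memory) order.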
#define SIMD_MM256_SET2_EPI8(a0, a1) \
  { \
    SIMD_LL_SET2_EPI8(a0, a1) \
    , SIMD_LL_SET2_EPI8(a0, a1), SIMD_LL_SET2_EPI8(a0, a1), \
        SIMD_LL_SET2_EPI8(a0, a1) \
  }

#define SIMD_MM256_SETR_EPI8(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, \
                             ac, ad, ae, af, b0, b1, b2, b3, b4, b5, b6, b7, \
                             b8, b9, ba, bb, bc, bd, be, bf) \
  { \
    SIMD_LL_SETR_EPI8(a0, a1, a2, a3, a4, a5, a6, a7) \
    , SIMD_LL_SETR_EPI8(a8, a9, aa, ab, ac, ad, ae, af), \
        SIMD_LL_SETR_EPI8(b0, b1, b2, b3, b4, b5, b6, b7), \
        SIMD_LL_SETR_EPI8(b8, b9, ba, bb, bc, bd, be, bf) \
  }

#define SIMD_MM256_SET1_EPI16(a) \
  { \
    SIMD_LL_SET1_EPI16(a) \
    , SIMD_LL_SET1_EPI16(a), SIMD_LL_SET1_EPI16(a), SIMD_LL_SET1_EPI16(a) \
  }

#define SIMD_MM256_SET2_EPI16(a0, a1) \
  { \
    SIMD_LL_SET2_EPI16(a0, a1) \
    , SIMD_LL_SET2_EPI16(a0, a1), SIMD_LL_SET2_EPI16(a0, a1), \
        SIMD_LL_SET2_EPI16(a0, a1) \
  }

#define SIMD_MM256_SETR_EPI16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab, \
                              ac, ad, ae, af) \
  { \
    SIMD_LL_SETR_EPI16(a0, a1, a2, a3) \
    , SIMD_LL_SETR_EPI16(a4, a5, a6, a7), SIMD_LL_SETR_EPI16(a8, a9, aa, ab), \
        SIMD_LL_SETR_EPI16(ac, ad, ae, af) \
  }

#define SIMD_MM256_SET1_EPI32(a) \
  { \
    SIMD_LL_SET1_EPI32(a) \
    , SIMD_LL_SET1_EPI32(a), SIMD_LL_SET1_EPI32(a), SIMD_LL_SET1_EPI32(a) \
  }

#define SIMD_MM256_SET2_EPI32(a0, a1) \
  { \
    SIMD_LL_SET2_EPI32(a0, a1) \
    , SIMD_LL_SET2_EPI32(a0, a1), SIMD_LL_SET2_EPI32(a0, a1), \
        SIMD_LL_SET2_EPI32(a0, a1) \
  }

#define SIMD_MM256_SETR_EPI32(a0, a1, a2, a3, a4, a5, a6, a7) \
  { \
    SIMD_LL_SET2_EPI32(a0, a1) \
    , SIMD_LL_SET2_EPI32(a2, a3), SIMD_LL_SET2_EPI32(a4, a5), \
        SIMD_LL_SET2_EPI32(a6, a7) \
  }

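// Step sizes in bytes: A is one __m256i vector, DA/QA/OA are two, four and
// eight vectors, and HA is half a vector.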
const size_t A = sizeof(__m256i);
const size_t DA = 2 * A;
const size_t QA = 4 * A;
const size_t OA = 8 * A;
const size_t HA = A / 2;

// The constant definitions in this region were lost in extraction; they are
// reconstructed here from the declaration list that accompanied this header
// and from the identifiers used below. The numeric values are the standard
// BT.601 fixed-point weights (as in the Simd library) and should be treated
// as an assumption.
const __m256i K_ZERO = SIMD_MM256_SET1_EPI8(0);
const __m256i K16_0001 = SIMD_MM256_SET1_EPI16(0x0001);
const __m256i K16_00FF = SIMD_MM256_SET1_EPI16(0x00FF);

const __m256i K32_00000001 = SIMD_MM256_SET1_EPI32(0x00000001);
const __m256i K32_00000002 = SIMD_MM256_SET1_EPI32(0x00000002);
const __m256i K32_00000004 = SIMD_MM256_SET1_EPI32(0x00000004);
const __m256i K32_00000008 = SIMD_MM256_SET1_EPI32(0x00000008);
const __m256i K32_000000FF = SIMD_MM256_SET1_EPI32(0x000000FF);
const __m256i K32_0000FFFF = SIMD_MM256_SET1_EPI32(0x0000FFFF);
const __m256i K32_00010000 = SIMD_MM256_SET1_EPI32(0x00010000);
const __m256i K32_01000000 = SIMD_MM256_SET1_EPI32(0x01000000);
const __m256i K32_FFFFFF00 = SIMD_MM256_SET1_EPI32(0xFFFFFF00);

const int YUV_TO_BGR_AVERAGING_SHIFT = 13;
const int YUV_TO_BGR_ROUND_TERM = 1 << (YUV_TO_BGR_AVERAGING_SHIFT - 1);
const int Y_TO_RGB_WEIGHT =
    int(1.164 * (1 << YUV_TO_BGR_AVERAGING_SHIFT) + 0.5);
const int U_TO_BLUE_WEIGHT =
    int(2.018 * (1 << YUV_TO_BGR_AVERAGING_SHIFT) + 0.5);
const int U_TO_GREEN_WEIGHT =
    -int(0.391 * (1 << YUV_TO_BGR_AVERAGING_SHIFT) + 0.5);
const int V_TO_GREEN_WEIGHT =
    -int(0.813 * (1 << YUV_TO_BGR_AVERAGING_SHIFT) + 0.5);
const int V_TO_RED_WEIGHT =
    int(1.596 * (1 << YUV_TO_BGR_AVERAGING_SHIFT) + 0.5);

const __m256i K16_Y_ADJUST = SIMD_MM256_SET1_EPI16(16);
const __m256i K16_UV_ADJUST = SIMD_MM256_SET1_EPI16(128);

const __m256i K16_YRGB_RT =
    SIMD_MM256_SET2_EPI16(Y_TO_RGB_WEIGHT, YUV_TO_BGR_ROUND_TERM);
const __m256i K16_VR_0 = SIMD_MM256_SET2_EPI16(V_TO_RED_WEIGHT, 0);
const __m256i K16_UG_VG =
    SIMD_MM256_SET2_EPI16(U_TO_GREEN_WEIGHT, V_TO_GREEN_WEIGHT);
const __m256i K16_UB_0 = SIMD_MM256_SET2_EPI16(U_TO_BLUE_WEIGHT, 0);

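// The K8_SHUFFLE_BGR*_TO_* tables gather one color channel (every third
// byte) from a 32-byte block of packed BGR pixels; -1 entries make
// _mm256_shuffle_epi8 write zero to that destination byte.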
const __m256i K8_SHUFFLE_BGR0_TO_BLUE = SIMD_MM256_SETR_EPI8(
    0x0, 0x3, 0x6, 0x9, 0xC, 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, 0x2, 0x5, 0x8, 0xB, 0xE, -1, -1, -1, -1, -1);
const __m256i K8_SHUFFLE_BGR1_TO_BLUE = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, 0x0,
    0x3, 0x6, 0x9, 0xC, 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
const __m256i K8_SHUFFLE_BGR2_TO_BLUE = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, 0x2, 0x5, 0x8, 0xB, 0xE, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD);

// The declarations for the next group of shuffle tables were lost in
// extraction and their original identifiers are unknown; the names below are
// reconstructed placeholders chosen after the bytes each table selects.

// Even source bytes of each 128-bit lane -> low (resp. high) 8 bytes of that
// lane.
const __m256i K8_SHUFFLE_EVEN_TO_LOW = SIMD_MM256_SETR_EPI8(
    0x0, 0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, -1, -1, -1, -1, -1, -1, -1, -1, 0x0,
    0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, -1, -1, -1, -1, -1, -1, -1, -1);
const __m256i K8_SHUFFLE_EVEN_TO_HIGH = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe, -1,
    -1, -1, -1, -1, -1, -1, -1, 0x0, 0x2, 0x4, 0x6, 0x8, 0xa, 0xc, 0xe);

// Byte 1 of each dword -> bytes 0..3, 4..7, 8..11 or 12..15 of each lane.
const __m256i K8_SHUFFLE_BYTE1_TO_0 = SIMD_MM256_SETR_EPI8(
    0x1, 0x5, 0x9, 0xd, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1,
    0x5, 0x9, 0xd, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
const __m256i K8_SHUFFLE_BYTE1_TO_1 = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, 0x1, 0x5, 0x9, 0xd, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, 0x1, 0x5, 0x9, 0xd, -1, -1, -1, -1, -1, -1, -1, -1);
const __m256i K8_SHUFFLE_BYTE1_TO_2 = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, -1, -1, 0x1, 0x5, 0x9, 0xd, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, 0x1, 0x5, 0x9, 0xd, -1, -1, -1, -1);

const __m256i K8_SHUFFLE_BYTE1_TO_3 = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, 0x5, 0x9, 0xd, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, 0x5, 0x9, 0xd);

// Read as 32-bit lanes this is the permutation 0,4,1,5,2,6,3,7, which
// interleaves the two halves of a register when used as an index vector,
// presumably with _mm256_permutevar8x32_epi32.
const __m256i K32_PERMUTE_0415_2637 =
    SIMD_MM256_SETR_EPI8(0x0, 0x0, 0x0, 0x0, 0x4, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0,
                         0x0, 0x5, 0x0, 0x0, 0x0, 0x2, 0x0, 0x0, 0x0, 0x6, 0x0,
                         0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x7, 0x0, 0x0, 0x0);

// Byte 3 of each dword -> bytes 0..3, 4..7, 8..11 or 12..15 of each lane.
const __m256i K8_SHUFFLE_BYTE3_TO_0 = SIMD_MM256_SETR_EPI8(
    0x3, 0x7, 0xb, 0xf, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x3,
    0x7, 0xb, 0xf, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
const __m256i K8_SHUFFLE_BYTE3_TO_1 = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, 0x3, 0x7, 0xb, 0xf, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, 0x3, 0x7, 0xb, 0xf, -1, -1, -1, -1, -1, -1, -1, -1);
const __m256i K8_SHUFFLE_BYTE3_TO_2 = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, -1, -1, 0x3, 0x7, 0xb, 0xf, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, 0x3, 0x7, 0xb, 0xf, -1, -1, -1, -1);

const __m256i K8_SHUFFLE_BYTE3_TO_3 = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x3, 0x7, 0xb, 0xf, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x3, 0x7, 0xb, 0xf);

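// The K8_SHUFFLE_PERMUTED_*_TO_BGR* tables scatter one color plane into
// every third byte of a packed BGR block; InterleaveBgr() below first
// replicates the relevant 64-bit lanes with _mm256_permute4x64_epi64 and
// then ORs the three shuffled planes together.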
const __m256i K8_SHUFFLE_PERMUTED_BLUE_TO_BGR0 = SIMD_MM256_SETR_EPI8(
    0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1, 0x5, -1,
    -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, -1);
const __m256i K8_SHUFFLE_PERMUTED_BLUE_TO_BGR1 = SIMD_MM256_SETR_EPI8(
    -1, 0x3, -1, -1, 0x4, -1, -1, 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8,
    -1, -1, 0x9, -1, -1, 0xA, -1, -1, 0xB, -1, -1, 0xC, -1, -1, 0xD);
const __m256i K8_SHUFFLE_PERMUTED_BLUE_TO_BGR2 = SIMD_MM256_SETR_EPI8(
    -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, -1, -1,
    0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, -1, -1);

const __m256i K8_SHUFFLE_PERMUTED_GREEN_TO_BGR0 = SIMD_MM256_SETR_EPI8(
    -1, 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1, 0x5,
    -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA);
const __m256i K8_SHUFFLE_PERMUTED_GREEN_TO_BGR1 = SIMD_MM256_SETR_EPI8(
    -1, -1, 0x3, -1, -1, 0x4, -1, -1, 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1,
    0x8, -1, -1, 0x9, -1, -1, 0xA, -1, -1, 0xB, -1, -1, 0xC, -1, -1);
const __m256i K8_SHUFFLE_PERMUTED_GREEN_TO_BGR2 = SIMD_MM256_SETR_EPI8(
    0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, -1,
    -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF, -1);

const __m256i K8_SHUFFLE_PERMUTED_RED_TO_BGR0 = SIMD_MM256_SETR_EPI8(
    -1, -1, 0x0, -1, -1, 0x1, -1, -1, 0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1,
    0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1);
const __m256i K8_SHUFFLE_PERMUTED_RED_TO_BGR1 = SIMD_MM256_SETR_EPI8(
    0x2, -1, -1, 0x3, -1, -1, 0x4, -1, -1, 0x5, -1, -1, 0x6, -1, -1, 0x7, -1,
    -1, 0x8, -1, -1, 0x9, -1, -1, 0xA, -1, -1, 0xB, -1, -1, 0xC, -1);
const __m256i K8_SHUFFLE_PERMUTED_RED_TO_BGR2 = SIMD_MM256_SETR_EPI8(
    -1, 0x5, -1, -1, 0x6, -1, -1, 0x7, -1, -1, 0x8, -1, -1, 0x9, -1, -1, 0xA,
    -1, -1, 0xB, -1, -1, 0xC, -1, -1, 0xD, -1, -1, 0xE, -1, -1, 0xF);

const __m256i K8_SHUFFLE_BGR0_TO_GREEN = SIMD_MM256_SETR_EPI8(
    0x1, 0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF, -1, -1, -1, -1, -1);
const __m256i K8_SHUFFLE_BGR1_TO_GREEN = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x2, 0x5, 0x8, 0xB, 0xE, 0x1,
    0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
const __m256i K8_SHUFFLE_BGR2_TO_GREEN = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x2, 0x5, 0x8, 0xB, 0xE);

const __m256i K8_SHUFFLE_BGR0_TO_RED = SIMD_MM256_SETR_EPI8(
    0x2, 0x5, 0x8, 0xB, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1);
const __m256i K8_SHUFFLE_BGR1_TO_RED = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF, 0x2,
    0x5, 0x8, 0xB, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
const __m256i K8_SHUFFLE_BGR2_TO_RED = SIMD_MM256_SETR_EPI8(
    -1, -1, -1, -1, -1, 0x1, 0x4, 0x7, 0xA, 0xD, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x3, 0x6, 0x9, 0xC, 0xF);


template <bool align>
SIMD_INLINE __m256i Load(const __m256i *p);

template <>
SIMD_INLINE __m256i Load<false>(const __m256i *p) {
  return _mm256_loadu_si256(p);
}

template <>
SIMD_INLINE __m256i Load<true>(const __m256i *p) {
  return _mm256_load_si256(p);
}

SIMD_INLINE void *AlignLo(const void *ptr, size_t align) {
  return reinterpret_cast<void *>(((size_t)ptr) & ~(align - 1));
}

SIMD_INLINE bool Aligned(const void *ptr, size_t align = sizeof(__m256)) {
  return ptr == AlignLo(ptr, align);
}
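// AlignLo rounds a pointer down to a multiple of align (a power of two);
// Aligned tests for exact alignment, defaulting to the 32-byte AVX width.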
346
347template <bool align>
349
template <>
SIMD_INLINE void Store<false>(__m256i *p, __m256i a) {
  _mm256_storeu_si256(p, a);
}

template <>
SIMD_INLINE void Store<true>(__m256i *p, __m256i a) {
  _mm256_store_si256(p, a);
}

SIMD_INLINE __m256i SaturateI16ToU8(__m256i value) {
  return _mm256_min_epi16(K16_00FF, _mm256_max_epi16(value, K_ZERO));
}
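// Clamps signed 16-bit values into [0, 255] so a later pack cannot wrap.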

SIMD_INLINE __m256i AdjustY16(__m256i y16) {
  return _mm256_subs_epi16(y16, K16_Y_ADJUST);
}

SIMD_INLINE __m256i AdjustUV16(__m256i uv16) {
  return _mm256_subs_epi16(uv16, K16_UV_ADJUST);
}
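// Removes the studio-range offsets before conversion: Y is biased by 16 and
// the chroma channels by 128 (assuming the usual BT.601 constants above).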

SIMD_INLINE __m256i AdjustedYuvToRed32(__m256i y16_1, __m256i v16_0) {
  return _mm256_srai_epi32(
      _mm256_add_epi32(_mm256_madd_epi16(y16_1, K16_YRGB_RT),
                       _mm256_madd_epi16(v16_0, K16_VR_0)),
      YUV_TO_BGR_AVERAGING_SHIFT);
}
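// y16_1 interleaves Y with the constant 1, so _mm256_madd_epi16 against
// K16_YRGB_RT yields Y*Y_TO_RGB_WEIGHT + YUV_TO_BGR_ROUND_TERM per pixel;
// the second madd adds the chroma term, and the arithmetic shift divides
// by 2^YUV_TO_BGR_AVERAGING_SHIFT.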

SIMD_INLINE __m256i AdjustedYuvToRed16(__m256i y16, __m256i v16) {
  return SaturateI16ToU8(_mm256_packs_epi32(
      AdjustedYuvToRed32(_mm256_unpacklo_epi16(y16, K16_0001),
                         _mm256_unpacklo_epi16(v16, K_ZERO)),
      AdjustedYuvToRed32(_mm256_unpackhi_epi16(y16, K16_0001),
                         _mm256_unpackhi_epi16(v16, K_ZERO))));
}

SIMD_INLINE __m256i AdjustedYuvToGreen32(__m256i y16_1, __m256i u16_v16) {
  return _mm256_srai_epi32(
      _mm256_add_epi32(_mm256_madd_epi16(y16_1, K16_YRGB_RT),
                       _mm256_madd_epi16(u16_v16, K16_UG_VG)),
      YUV_TO_BGR_AVERAGING_SHIFT);
}

SIMD_INLINE __m256i AdjustedYuvToGreen16(__m256i y16, __m256i u16,
                                         __m256i v16) {
  return SaturateI16ToU8(_mm256_packs_epi32(
      AdjustedYuvToGreen32(_mm256_unpacklo_epi16(y16, K16_0001),
                           _mm256_unpacklo_epi16(u16, v16)),
      AdjustedYuvToGreen32(_mm256_unpackhi_epi16(y16, K16_0001),
                           _mm256_unpackhi_epi16(u16, v16))));
}

SIMD_INLINE __m256i AdjustedYuvToBlue32(__m256i y16_1, __m256i u16_0) {
  return _mm256_srai_epi32(
      _mm256_add_epi32(_mm256_madd_epi16(y16_1, K16_YRGB_RT),
                       _mm256_madd_epi16(u16_0, K16_UB_0)),
      YUV_TO_BGR_AVERAGING_SHIFT);
}

SIMD_INLINE __m256i AdjustedYuvToBlue16(__m256i y16, __m256i u16) {
  return SaturateI16ToU8(_mm256_packs_epi32(
      AdjustedYuvToBlue32(_mm256_unpacklo_epi16(y16, K16_0001),
                          _mm256_unpacklo_epi16(u16, K_ZERO)),
      AdjustedYuvToBlue32(_mm256_unpackhi_epi16(y16, K16_0001),
                          _mm256_unpackhi_epi16(u16, K_ZERO))));
}

SIMD_INLINE __m256i YuvToRed(__m256i y, __m256i v) {
  __m256i lo =
      AdjustedYuvToRed16(AdjustY16(_mm256_unpacklo_epi8(y, K_ZERO)),
                         AdjustUV16(_mm256_unpacklo_epi8(v, K_ZERO)));
  __m256i hi =
      AdjustedYuvToRed16(AdjustY16(_mm256_unpackhi_epi8(y, K_ZERO)),
                         AdjustUV16(_mm256_unpackhi_epi8(v, K_ZERO)));
  return _mm256_packus_epi16(lo, hi);
}

SIMD_INLINE __m256i YuvToGreen(__m256i y, __m256i u, __m256i v) {
  __m256i lo =
      AdjustedYuvToGreen16(AdjustY16(_mm256_unpacklo_epi8(y, K_ZERO)),
                           AdjustUV16(_mm256_unpacklo_epi8(u, K_ZERO)),
                           AdjustUV16(_mm256_unpacklo_epi8(v, K_ZERO)));
  __m256i hi =
      AdjustedYuvToGreen16(AdjustY16(_mm256_unpackhi_epi8(y, K_ZERO)),
                           AdjustUV16(_mm256_unpackhi_epi8(u, K_ZERO)),
                           AdjustUV16(_mm256_unpackhi_epi8(v, K_ZERO)));
  return _mm256_packus_epi16(lo, hi);
}

SIMD_INLINE __m256i YuvToBlue(__m256i y, __m256i u) {
  __m256i lo =
      AdjustedYuvToBlue16(AdjustY16(_mm256_unpacklo_epi8(y, K_ZERO)),
                          AdjustUV16(_mm256_unpacklo_epi8(u, K_ZERO)));
  __m256i hi =
      AdjustedYuvToBlue16(AdjustY16(_mm256_unpackhi_epi8(y, K_ZERO)),
                          AdjustUV16(_mm256_unpackhi_epi8(u, K_ZERO)));
  return _mm256_packus_epi16(lo, hi);
}

template <int index>
__m256i InterleaveBgr(__m256i blue, __m256i green, __m256i red);

template <>
SIMD_INLINE __m256i InterleaveBgr<0>(__m256i blue, __m256i green, __m256i red) {
  return _mm256_or_si256(
      _mm256_shuffle_epi8(_mm256_permute4x64_epi64(blue, 0x44),
                          K8_SHUFFLE_PERMUTED_BLUE_TO_BGR0),
      _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(green, 0x44),
                                          K8_SHUFFLE_PERMUTED_GREEN_TO_BGR0),
                      _mm256_shuffle_epi8(_mm256_permute4x64_epi64(red, 0x44),
                                          K8_SHUFFLE_PERMUTED_RED_TO_BGR0)));
}

template <>
SIMD_INLINE __m256i InterleaveBgr<1>(__m256i blue, __m256i green, __m256i red) {
  return _mm256_or_si256(
      _mm256_shuffle_epi8(_mm256_permute4x64_epi64(blue, 0x99),
                          K8_SHUFFLE_PERMUTED_BLUE_TO_BGR1),
      _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(green, 0x99),
                                          K8_SHUFFLE_PERMUTED_GREEN_TO_BGR1),
                      _mm256_shuffle_epi8(_mm256_permute4x64_epi64(red, 0x99),
                                          K8_SHUFFLE_PERMUTED_RED_TO_BGR1)));
}

template <>
SIMD_INLINE __m256i InterleaveBgr<2>(__m256i blue, __m256i green, __m256i red) {
  return _mm256_or_si256(
      _mm256_shuffle_epi8(_mm256_permute4x64_epi64(blue, 0xEE),
                          K8_SHUFFLE_PERMUTED_BLUE_TO_BGR2),
      _mm256_or_si256(_mm256_shuffle_epi8(_mm256_permute4x64_epi64(green, 0xEE),
                                          K8_SHUFFLE_PERMUTED_GREEN_TO_BGR2),
                      _mm256_shuffle_epi8(_mm256_permute4x64_epi64(red, 0xEE),
                                          K8_SHUFFLE_PERMUTED_RED_TO_BGR2)));
}

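// For InterleaveBgr<0/1/2> the permute immediates 0x44, 0x99 and 0xEE
// broadcast the 64-bit lane pairs (0,1), (1,2) and (2,3) respectively, so
// each shuffle table finds the source bytes it needs within its own 128-bit
// lane.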
SIMD_INLINE __m256i BgrToBlue(__m256i bgr[3]) {
  __m256i b0 = _mm256_shuffle_epi8(bgr[0], K8_SHUFFLE_BGR0_TO_BLUE);
  __m256i b2 = _mm256_shuffle_epi8(bgr[2], K8_SHUFFLE_BGR2_TO_BLUE);
  return _mm256_or_si256(
      _mm256_permute2x128_si256(b0, b2, 0x20),
      _mm256_or_si256(_mm256_shuffle_epi8(bgr[1], K8_SHUFFLE_BGR1_TO_BLUE),
                      _mm256_permute2x128_si256(b0, b2, 0x31)));
}
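// bgr[0] and bgr[2] contribute to both halves of the output, so their
// shuffled lanes are recombined with _mm256_permute2x128_si256 (0x20 pairs
// the low lanes, 0x31 the high lanes) before ORing in the bgr[1] bytes;
// BgrToGreen and BgrToRed below use the same lane fix-up.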

SIMD_INLINE __m256i BgrToGreen(__m256i bgr[3]) {
  __m256i g0 = _mm256_shuffle_epi8(bgr[0], K8_SHUFFLE_BGR0_TO_GREEN);
  __m256i g2 = _mm256_shuffle_epi8(bgr[2], K8_SHUFFLE_BGR2_TO_GREEN);
  return _mm256_or_si256(
      _mm256_permute2x128_si256(g0, g2, 0x20),
      _mm256_or_si256(_mm256_shuffle_epi8(bgr[1], K8_SHUFFLE_BGR1_TO_GREEN),
                      _mm256_permute2x128_si256(g0, g2, 0x31)));
}

SIMD_INLINE __m256i BgrToRed(__m256i bgr[3]) {
  __m256i r0 = _mm256_shuffle_epi8(bgr[0], K8_SHUFFLE_BGR0_TO_RED);
  __m256i r2 = _mm256_shuffle_epi8(bgr[2], K8_SHUFFLE_BGR2_TO_RED);
  return _mm256_or_si256(
      _mm256_permute2x128_si256(r0, r2, 0x20),
      _mm256_or_si256(_mm256_shuffle_epi8(bgr[1], K8_SHUFFLE_BGR1_TO_RED),
                      _mm256_permute2x128_si256(r0, r2, 0x31)));
}

template <bool align>
SIMD_INLINE __m256i LoadPermuted(const __m256i *p) {
  return _mm256_permute4x64_epi64(Load<align>(p), 0xD8);
}
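// 0xD8 reorders the 64-bit lanes as 0,2,1,3; loading inputs this way
// compensates for the lane-wise behaviour of the AVX2 unpack/pack
// instructions used in the kernels above.

// Usage sketch (an addition, not part of the original header), following the
// pattern of the Simd library's Yuv444pToBgr: convert one 32-pixel run of
// planar 8-bit Y, U, V to 96 bytes of packed BGR. The function name is
// illustrative and <cstdint> is assumed to be available.
template <bool align>
SIMD_INLINE void YuvToBgrSketch(const uint8_t *y, const uint8_t *u,
                                const uint8_t *v, uint8_t *bgr) {
  __m256i _y = Load<align>((const __m256i *)y);
  __m256i _u = Load<align>((const __m256i *)u);
  __m256i _v = Load<align>((const __m256i *)v);
  __m256i blue = YuvToBlue(_y, _u);
  __m256i green = YuvToGreen(_y, _u, _v);
  __m256i red = YuvToRed(_y, _v);
  Store<align>((__m256i *)bgr + 0, InterleaveBgr<0>(blue, green, red));
  Store<align>((__m256i *)bgr + 1, InterleaveBgr<1>(blue, green, red));
  Store<align>((__m256i *)bgr + 2, InterleaveBgr<2>(blue, green, red));
}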

}
}
}