63 __m256i* u0, __m256i* v0) {
67 yuv_m256[0] =
Load<true>(
reinterpret_cast<__m256i*
>(y));
68 yuv_m256[1] =
Load<true>(
reinterpret_cast<__m256i*
>(y) + 1);
69 yuv_m256[2] =
Load<true>(
reinterpret_cast<__m256i*
>(y) + 2);
70 yuv_m256[3] =
Load<true>(
reinterpret_cast<__m256i*
>(y) + 3);
72 yuv_m256[0] =
Load<false>(
reinterpret_cast<__m256i*
>(y));
73 yuv_m256[1] =
Load<false>(
reinterpret_cast<__m256i*
>(y) + 1);
74 yuv_m256[2] =
Load<false>(
reinterpret_cast<__m256i*
>(y) + 2);
75 yuv_m256[3] =
Load<false>(
reinterpret_cast<__m256i*
>(y) + 3);
79 _mm256_or_si256(_mm256_permute4x64_epi64(
80 _mm256_shuffle_epi8(yuv_m256[0],
Y_SHUFFLE0), 0xD8),
81 _mm256_permute4x64_epi64(
82 _mm256_shuffle_epi8(yuv_m256[1],
Y_SHUFFLE1), 0xD8));
84 _mm256_or_si256(_mm256_permute4x64_epi64(
85 _mm256_shuffle_epi8(yuv_m256[2],
Y_SHUFFLE0), 0xD8),
86 _mm256_permute4x64_epi64(
87 _mm256_shuffle_epi8(yuv_m256[3],
Y_SHUFFLE1), 0xD8));
89 *u0 = _mm256_permutevar8x32_epi32(
91 _mm256_or_si256(_mm256_shuffle_epi8(yuv_m256[0],
U_SHUFFLE0),
93 _mm256_or_si256(_mm256_shuffle_epi8(yuv_m256[2],
U_SHUFFLE2),
94 _mm256_shuffle_epi8(yuv_m256[3],
U_SHUFFLE3))),
96 *v0 = _mm256_permutevar8x32_epi32(
98 _mm256_or_si256(_mm256_shuffle_epi8(yuv_m256[0],
V_SHUFFLE0),
100 _mm256_or_si256(_mm256_shuffle_epi8(yuv_m256[2],
V_SHUFFLE2),
101 _mm256_shuffle_epi8(yuv_m256[3],
V_SHUFFLE3))),
121 __m256i y0, y1, u0, v0;
123 yuv_separate_avx2<align>(yuv, &y0, &y1, &u0, &v0);
124 __m256i u0_u0 = _mm256_permute4x64_epi64(u0, 0xD8);
125 __m256i v0_v0 = _mm256_permute4x64_epi64(v0, 0xD8);
126 yuv2rgb_avx2<align>(y0, _mm256_unpacklo_epi8(u0_u0, u0_u0),
127 _mm256_unpacklo_epi8(v0_v0, v0_v0), rgb);
128 yuv2rgb_avx2<align>(y1, _mm256_unpackhi_epi8(u0_u0, u0_u0),
129 _mm256_unpackhi_epi8(v0_v0, v0_v0),
130 rgb + 3 *
sizeof(__m256i));
133void yuyv2rgb_avx(
unsigned char* YUV,
unsigned char* RGB,
int NumPixels) {
134 assert(NumPixels == (1920 * 1080));
136 uint8_t* yuv_offset = YUV;
137 uint8_t* rgb_offset = RGB;
139 for (
int i = 0; i < NumPixels;
140 i = i +
static_cast<int>(2 *
sizeof(__m256i)),
141 yuv_offset += 4 *
static_cast<int>(
sizeof(__m256i)),
142 rgb_offset += 6 *
static_cast<int>(
sizeof(__m256i))) {
143 yuv2rgb_avx2<true>(yuv_offset, rgb_offset);
146 for (
int i = 0; i < NumPixels;
147 i = i +
static_cast<int>(2 *
sizeof(__m256i)),
148 yuv_offset += 4 *
static_cast<int>(
sizeof(__m256i)),
149 rgb_offset += 6 *
static_cast<int>(
sizeof(__m256i))) {
150 yuv2rgb_avx2<false>(yuv_offset, rgb_offset);