Skip to content

Commit 6152a01

Browse files
committed
Add speed_rgb2yuv_sse.cpp
1 parent 77c1588 commit 6152a01

File tree

1 file changed

+107
-1
lines changed

1 file changed

+107
-1
lines changed

speed_rgb2yuv_sse.cpp

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,112 @@ void YUVToRGB(unsigned char *Y, unsigned char *U, unsigned char *V, unsigned cha
264264
}
265265
}
266266

267+
void YUVToRGBSSE_1(unsigned char *Y, unsigned char *U, unsigned char *V, unsigned char *RGB, int Width, int Height, int Stride) {
268+
const int Shift = 13;
269+
const int HalfV = 1 << (Shift - 1);
270+
const int B_Y_WT = 1 << Shift, B_U_WT = 2.03211f * (1 << Shift), B_V_WT = 0;
271+
const int G_Y_WT = 1 << Shift, G_U_WT = -0.39465f * (1 << Shift), G_V_WT = -0.58060f * (1 << Shift);
272+
const int R_Y_WT = 1 << Shift, R_U_WT = 0, R_V_WT = 1.13983 * (1 << Shift);
273+
__m128i Weight_B_Y = _mm_set1_epi32(B_Y_WT), Weight_B_U = _mm_set1_epi32(B_U_WT), Weight_B_V = _mm_set1_epi32(B_V_WT);
274+
__m128i Weight_G_Y = _mm_set1_epi32(G_Y_WT), Weight_G_U = _mm_set1_epi32(G_U_WT), Weight_G_V = _mm_set1_epi32(G_V_WT);
275+
__m128i Weight_R_Y = _mm_set1_epi32(R_Y_WT), Weight_R_U = _mm_set1_epi32(R_U_WT), Weight_R_V = _mm_set1_epi32(R_V_WT);
276+
__m128i Half = _mm_set1_epi32(HalfV);
277+
__m128i C128 = _mm_set1_epi32(128);
278+
__m128i Zero = _mm_setzero_si128();
279+
280+
const int BlockSize = 16, Block = Width / BlockSize;
281+
for (int YY = 0; YY < Height; YY++) {
282+
unsigned char *LinePD = RGB + YY * Stride;
283+
unsigned char *LinePY = Y + YY * Width;
284+
unsigned char *LinePU = U + YY * Width;
285+
unsigned char *LinePV = V + YY * Width;
286+
for (int XX = 0; XX < Block * BlockSize; XX += BlockSize, LinePY += BlockSize, LinePU += BlockSize, LinePV += BlockSize) {
287+
__m128i Blue, Green, Red, YV, UV, VV, Dest1, Dest2, Dest3;
288+
YV = _mm_loadu_si128((__m128i *)(LinePY + 0));
289+
UV = _mm_loadu_si128((__m128i *)(LinePU + 0));
290+
VV = _mm_loadu_si128((__m128i *)(LinePV + 0));
291+
//UV = _mm_sub_epi32(UV, C128);
292+
//VV = _mm_sub_epi32(VV, C128);
293+
294+
__m128i YV16L = _mm_unpacklo_epi8(YV, Zero);
295+
__m128i YV16H = _mm_unpackhi_epi8(YV, Zero);
296+
__m128i YV32LL = _mm_unpacklo_epi16(YV16L, Zero);
297+
__m128i YV32LH = _mm_unpackhi_epi16(YV16L, Zero);
298+
__m128i YV32HL = _mm_unpacklo_epi16(YV16H, Zero);
299+
__m128i YV32HH = _mm_unpackhi_epi16(YV16H, Zero);
300+
301+
302+
__m128i UV16L = _mm_unpacklo_epi8(UV, Zero);
303+
__m128i UV16H = _mm_unpackhi_epi8(UV, Zero);
304+
__m128i UV32LL = _mm_unpacklo_epi16(UV16L, Zero);
305+
__m128i UV32LH = _mm_unpackhi_epi16(UV16L, Zero);
306+
__m128i UV32HL = _mm_unpacklo_epi16(UV16H, Zero);
307+
__m128i UV32HH = _mm_unpackhi_epi16(UV16H, Zero);
308+
UV32LL = _mm_sub_epi32(UV32LL, C128);
309+
UV32LH = _mm_sub_epi32(UV32LH, C128);
310+
UV32HL = _mm_sub_epi32(UV32HL, C128);
311+
UV32HH = _mm_sub_epi32(UV32HH, C128);
312+
313+
__m128i VV16L = _mm_unpacklo_epi8(VV, Zero);
314+
__m128i VV16H = _mm_unpackhi_epi8(VV, Zero);
315+
__m128i VV32LL = _mm_unpacklo_epi16(VV16L, Zero);
316+
__m128i VV32LH = _mm_unpackhi_epi16(VV16L, Zero);
317+
__m128i VV32HL = _mm_unpacklo_epi16(VV16H, Zero);
318+
__m128i VV32HH = _mm_unpackhi_epi16(VV16H, Zero);
319+
VV32LL = _mm_sub_epi32(VV32LL, C128);
320+
VV32LH = _mm_sub_epi32(VV32LH, C128);
321+
VV32HL = _mm_sub_epi32(VV32HL, C128);
322+
VV32HH = _mm_sub_epi32(VV32HH, C128);
323+
324+
__m128i LL_B = _mm_add_epi32(YV32LL, _mm_srai_epi32(_mm_add_epi32(Half, _mm_mullo_epi32(UV32LL, Weight_B_U)), Shift));
325+
__m128i LH_B = _mm_add_epi32(YV32LH, _mm_srai_epi32(_mm_add_epi32(Half, _mm_mullo_epi32(UV32LH, Weight_B_U)), Shift));
326+
__m128i HL_B = _mm_add_epi32(YV32HL, _mm_srai_epi32(_mm_add_epi32(Half, _mm_mullo_epi32(UV32HL, Weight_B_U)), Shift));
327+
__m128i HH_B = _mm_add_epi32(YV32HH, _mm_srai_epi32(_mm_add_epi32(Half, _mm_mullo_epi32(UV32HH, Weight_B_U)), Shift));
328+
Blue = _mm_packus_epi16(_mm_packus_epi32(LL_B, LH_B), _mm_packus_epi32(HL_B, HH_B));
329+
330+
__m128i LL_G = _mm_add_epi32(YV32LL, _mm_srai_epi32(_mm_add_epi32(Half, _mm_add_epi32(_mm_mullo_epi32(Weight_G_U, UV32LL), _mm_mullo_epi32(Weight_G_V, VV32LL))), Shift));
331+
__m128i LH_G = _mm_add_epi32(YV32LH, _mm_srai_epi32(_mm_add_epi32(Half, _mm_add_epi32(_mm_mullo_epi32(Weight_G_U, UV32LH), _mm_mullo_epi32(Weight_G_V, VV32LH))), Shift));
332+
__m128i HL_G = _mm_add_epi32(YV32HL, _mm_srai_epi32(_mm_add_epi32(Half, _mm_add_epi32(_mm_mullo_epi32(Weight_G_U, UV32HL), _mm_mullo_epi32(Weight_G_V, VV32HL))), Shift));
333+
__m128i HH_G = _mm_add_epi32(YV32HH, _mm_srai_epi32(_mm_add_epi32(Half, _mm_add_epi32(_mm_mullo_epi32(Weight_G_U, UV32HH), _mm_mullo_epi32(Weight_G_V, VV32HH))), Shift));
334+
Green = _mm_packus_epi16(_mm_packus_epi32(LL_G, LH_G), _mm_packus_epi32(HL_G, HH_G));
335+
336+
__m128i LL_R = _mm_add_epi32(YV32LL, _mm_srai_epi32(_mm_add_epi32(Half, _mm_mullo_epi32(VV32LL, Weight_R_V)), Shift));
337+
__m128i LH_R = _mm_add_epi32(YV32LH, _mm_srai_epi32(_mm_add_epi32(Half, _mm_mullo_epi32(VV32LH, Weight_R_V)), Shift));
338+
__m128i HL_R = _mm_add_epi32(YV32HL, _mm_srai_epi32(_mm_add_epi32(Half, _mm_mullo_epi32(VV32HL, Weight_R_V)), Shift));
339+
__m128i HH_R = _mm_add_epi32(YV32HH, _mm_srai_epi32(_mm_add_epi32(Half, _mm_mullo_epi32(VV32HH, Weight_R_V)), Shift));
340+
Red = _mm_packus_epi16(_mm_packus_epi32(LL_R, LH_R), _mm_packus_epi32(HL_R, HH_R));
341+
342+
Dest1 = _mm_shuffle_epi8(Blue, _mm_setr_epi8(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5));
343+
Dest1 = _mm_or_si128(Dest1, _mm_shuffle_epi8(Green, _mm_setr_epi8(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1)));
344+
Dest1 = _mm_or_si128(Dest1, _mm_shuffle_epi8(Red, _mm_setr_epi8(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1)));
345+
346+
Dest2 = _mm_shuffle_epi8(Blue, _mm_setr_epi8(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1));
347+
Dest2 = _mm_or_si128(Dest2, _mm_shuffle_epi8(Green, _mm_setr_epi8(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10)));
348+
Dest2 = _mm_or_si128(Dest2, _mm_shuffle_epi8(Red, _mm_setr_epi8(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1)));
349+
350+
Dest3 = _mm_shuffle_epi8(Blue, _mm_setr_epi8(-1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1));
351+
Dest3 = _mm_or_si128(Dest3, _mm_shuffle_epi8(Green, _mm_setr_epi8(-1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1)));
352+
Dest3 = _mm_or_si128(Dest3, _mm_shuffle_epi8(Red, _mm_setr_epi8(10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15)));
353+
354+
_mm_storeu_si128((__m128i*)(LinePD + (XX / BlockSize) * BlockSize * 3), Dest1);
355+
_mm_storeu_si128((__m128i*)(LinePD + (XX / BlockSize) * BlockSize * 3 + BlockSize), Dest2);
356+
_mm_storeu_si128((__m128i*)(LinePD + (XX / BlockSize) * BlockSize * 3 + BlockSize * 2), Dest3);
357+
}
358+
for (int XX = Block * BlockSize; XX < Width; XX++, LinePU++, LinePV++, LinePY++) {
359+
int YV = LinePY[XX], UV = LinePU[XX] - 128, VV = LinePV[XX] - 128;
360+
LinePD[XX + 0] = ClampToByte(YV + ((B_U_WT * UV + HalfV) >> Shift));
361+
LinePD[XX + 1] = ClampToByte(YV + ((G_U_WT * UV + G_V_WT * VV + HalfV) >> Shift));
362+
LinePD[XX + 2] = ClampToByte(YV + ((R_V_WT * VV + HalfV) >> Shift));
363+
}
364+
}
365+
}
366+
367+
// Stub for a second SSE YUV -> RGB variant. The body is currently empty: no input is
// read and nothing is written to RGB, so callers get their buffers back unchanged.
void YUVToRGBSSE_2(unsigned char *Y, unsigned char *U, unsigned char *V, unsigned char *RGB, int Width, int Height, int Stride) {
	// Mark every parameter as deliberately unused until an implementation is added.
	(void)Y;
	(void)U;
	(void)V;
	(void)RGB;
	(void)Width;
	(void)Height;
	(void)Stride;
}
370+
371+
372+
267373
int main() {
268374
Mat src = imread("F:\\car.jpg");
269375
int Height = src.rows;
@@ -282,7 +388,7 @@ int main() {
282388
double duration = (cv::getTickCount() - st) / cv::getTickFrequency() * 100;
283389
printf("%.5f\n", duration);
284390
RGBToYUVSSE_2(Src, Y, U, V, Width, Height, Stride);
285-
YUVToRGB(Y, U, V, Dest, Width, Height, Stride);
391+
YUVToRGBSSE_1(Y, U, V, Dest, Width, Height, Stride);
286392
Mat dst(Height, Width, CV_8UC3, Dest);
287393
imshow("origin", src);
288394
imshow("result", dst);

0 commit comments

Comments
 (0)