@@ -264,6 +264,112 @@ void YUVToRGB(unsigned char *Y, unsigned char *U, unsigned char *V, unsigned cha
264264	}
265265}
266266
267+ void  YUVToRGBSSE_1 (unsigned  char  *Y, unsigned  char  *U, unsigned  char  *V, unsigned  char  *RGB, int  Width, int  Height, int  Stride) {
268+ 	const  int  Shift = 13 ;
269+ 	const  int  HalfV = 1  << (Shift - 1 );
270+ 	const  int  B_Y_WT = 1  << Shift, B_U_WT = 2 .03211f  * (1  << Shift), B_V_WT = 0 ;
271+ 	const  int  G_Y_WT = 1  << Shift, G_U_WT = -0 .39465f  * (1  << Shift), G_V_WT = -0 .58060f  * (1  << Shift);
272+ 	const  int  R_Y_WT = 1  << Shift, R_U_WT = 0 , R_V_WT = 1.13983  * (1  << Shift);
273+ 	__m128i Weight_B_Y = _mm_set1_epi32 (B_Y_WT), Weight_B_U = _mm_set1_epi32 (B_U_WT), Weight_B_V = _mm_set1_epi32 (B_V_WT);
274+ 	__m128i Weight_G_Y = _mm_set1_epi32 (G_Y_WT), Weight_G_U = _mm_set1_epi32 (G_U_WT), Weight_G_V = _mm_set1_epi32 (G_V_WT);
275+ 	__m128i Weight_R_Y = _mm_set1_epi32 (R_Y_WT), Weight_R_U = _mm_set1_epi32 (R_U_WT), Weight_R_V = _mm_set1_epi32 (R_V_WT);
276+ 	__m128i Half = _mm_set1_epi32 (HalfV);
277+ 	__m128i C128 = _mm_set1_epi32 (128 );
278+ 	__m128i Zero = _mm_setzero_si128 ();
279+ 
280+ 	const  int  BlockSize = 16 , Block = Width / BlockSize;
281+ 	for  (int  YY = 0 ; YY < Height; YY++) {
282+ 		unsigned  char  *LinePD = RGB + YY * Stride;
283+ 		unsigned  char  *LinePY = Y + YY * Width;
284+ 		unsigned  char  *LinePU = U + YY * Width;
285+ 		unsigned  char  *LinePV = V + YY * Width;
286+ 		for  (int  XX = 0 ; XX < Block * BlockSize; XX += BlockSize, LinePY += BlockSize, LinePU += BlockSize, LinePV += BlockSize) {
287+ 			__m128i Blue, Green, Red, YV, UV, VV, Dest1, Dest2, Dest3;
288+ 			YV = _mm_loadu_si128 ((__m128i *)(LinePY + 0 ));
289+ 			UV = _mm_loadu_si128 ((__m128i *)(LinePU + 0 ));
290+ 			VV = _mm_loadu_si128 ((__m128i *)(LinePV + 0 ));
291+ 			// UV = _mm_sub_epi32(UV, C128);
292+ 			// VV = _mm_sub_epi32(VV, C128);
293+ 
294+ 			__m128i YV16L = _mm_unpacklo_epi8 (YV, Zero);
295+ 			__m128i YV16H = _mm_unpackhi_epi8 (YV, Zero);
296+ 			__m128i YV32LL = _mm_unpacklo_epi16 (YV16L, Zero);
297+ 			__m128i YV32LH = _mm_unpackhi_epi16 (YV16L, Zero);
298+ 			__m128i YV32HL = _mm_unpacklo_epi16 (YV16H, Zero);
299+ 			__m128i YV32HH = _mm_unpackhi_epi16 (YV16H, Zero);
300+ 
301+ 
302+ 			__m128i UV16L = _mm_unpacklo_epi8 (UV, Zero);
303+ 			__m128i UV16H = _mm_unpackhi_epi8 (UV, Zero);
304+ 			__m128i UV32LL = _mm_unpacklo_epi16 (UV16L, Zero);
305+ 			__m128i UV32LH = _mm_unpackhi_epi16 (UV16L, Zero);
306+ 			__m128i UV32HL = _mm_unpacklo_epi16 (UV16H, Zero);
307+ 			__m128i UV32HH = _mm_unpackhi_epi16 (UV16H, Zero);
308+ 			UV32LL = _mm_sub_epi32 (UV32LL, C128);
309+ 			UV32LH = _mm_sub_epi32 (UV32LH, C128);
310+ 			UV32HL = _mm_sub_epi32 (UV32HL, C128);
311+ 			UV32HH = _mm_sub_epi32 (UV32HH, C128);
312+ 
313+ 			__m128i VV16L = _mm_unpacklo_epi8 (VV, Zero);
314+ 			__m128i VV16H = _mm_unpackhi_epi8 (VV, Zero);
315+ 			__m128i VV32LL = _mm_unpacklo_epi16 (VV16L, Zero);
316+ 			__m128i VV32LH = _mm_unpackhi_epi16 (VV16L, Zero);
317+ 			__m128i VV32HL = _mm_unpacklo_epi16 (VV16H, Zero);
318+ 			__m128i VV32HH = _mm_unpackhi_epi16 (VV16H, Zero);
319+ 			VV32LL = _mm_sub_epi32 (VV32LL, C128);
320+ 			VV32LH = _mm_sub_epi32 (VV32LH, C128);
321+ 			VV32HL = _mm_sub_epi32 (VV32HL, C128);
322+ 			VV32HH = _mm_sub_epi32 (VV32HH, C128);
323+ 
324+ 			__m128i LL_B = _mm_add_epi32 (YV32LL, _mm_srai_epi32 (_mm_add_epi32 (Half, _mm_mullo_epi32 (UV32LL, Weight_B_U)), Shift));
325+ 			__m128i LH_B = _mm_add_epi32 (YV32LH, _mm_srai_epi32 (_mm_add_epi32 (Half, _mm_mullo_epi32 (UV32LH, Weight_B_U)), Shift));
326+ 			__m128i HL_B = _mm_add_epi32 (YV32HL, _mm_srai_epi32 (_mm_add_epi32 (Half, _mm_mullo_epi32 (UV32HL, Weight_B_U)), Shift));
327+ 			__m128i HH_B = _mm_add_epi32 (YV32HH, _mm_srai_epi32 (_mm_add_epi32 (Half, _mm_mullo_epi32 (UV32HH, Weight_B_U)), Shift));
328+ 			Blue =  _mm_packus_epi16 (_mm_packus_epi32 (LL_B, LH_B), _mm_packus_epi32 (HL_B, HH_B));
329+ 
330+ 			__m128i LL_G = _mm_add_epi32 (YV32LL, _mm_srai_epi32 (_mm_add_epi32 (Half, _mm_add_epi32 (_mm_mullo_epi32 (Weight_G_U, UV32LL), _mm_mullo_epi32 (Weight_G_V, VV32LL))), Shift));
331+ 			__m128i LH_G = _mm_add_epi32 (YV32LH, _mm_srai_epi32 (_mm_add_epi32 (Half, _mm_add_epi32 (_mm_mullo_epi32 (Weight_G_U, UV32LH), _mm_mullo_epi32 (Weight_G_V, VV32LH))), Shift));
332+ 			__m128i HL_G = _mm_add_epi32 (YV32HL, _mm_srai_epi32 (_mm_add_epi32 (Half, _mm_add_epi32 (_mm_mullo_epi32 (Weight_G_U, UV32HL), _mm_mullo_epi32 (Weight_G_V, VV32HL))), Shift));
333+ 			__m128i HH_G = _mm_add_epi32 (YV32HH, _mm_srai_epi32 (_mm_add_epi32 (Half, _mm_add_epi32 (_mm_mullo_epi32 (Weight_G_U, UV32HH), _mm_mullo_epi32 (Weight_G_V, VV32HH))), Shift));
334+ 			Green = _mm_packus_epi16 (_mm_packus_epi32 (LL_G, LH_G), _mm_packus_epi32 (HL_G, HH_G));
335+ 
336+ 			__m128i LL_R = _mm_add_epi32 (YV32LL, _mm_srai_epi32 (_mm_add_epi32 (Half, _mm_mullo_epi32 (VV32LL, Weight_R_V)), Shift));
337+ 			__m128i LH_R = _mm_add_epi32 (YV32LH, _mm_srai_epi32 (_mm_add_epi32 (Half, _mm_mullo_epi32 (VV32LH, Weight_R_V)), Shift));
338+ 			__m128i HL_R = _mm_add_epi32 (YV32HL, _mm_srai_epi32 (_mm_add_epi32 (Half, _mm_mullo_epi32 (VV32HL, Weight_R_V)), Shift));
339+ 			__m128i HH_R = _mm_add_epi32 (YV32HH, _mm_srai_epi32 (_mm_add_epi32 (Half, _mm_mullo_epi32 (VV32HH, Weight_R_V)), Shift));
340+ 			Red = _mm_packus_epi16 (_mm_packus_epi32 (LL_R, LH_R), _mm_packus_epi32 (HL_R, HH_R));
341+ 
342+ 			Dest1 = _mm_shuffle_epi8 (Blue, _mm_setr_epi8 (0 , -1 , -1 , 1 , -1 , -1 , 2 , -1 , -1 , 3 , -1 , -1 , 4 , -1 , -1 , 5 ));
343+ 			Dest1 = _mm_or_si128 (Dest1, _mm_shuffle_epi8 (Green, _mm_setr_epi8 (-1 , 0 , -1 , -1 , 1 , -1 , -1 , 2 , -1 , -1 , 3 , -1 , -1 , 4 , -1 , -1 )));
344+ 			Dest1 = _mm_or_si128 (Dest1, _mm_shuffle_epi8 (Red, _mm_setr_epi8 (-1 , -1 , 0 , -1 , -1 , 1 , -1 , -1 , 2 , -1 , -1 , 3 , -1 , -1 , 4 , -1 )));
345+ 
346+ 			Dest2 = _mm_shuffle_epi8 (Blue, _mm_setr_epi8 (-1 , -1 , 6 , -1 , -1 , 7 , -1 , -1 , 8 , -1 , -1 , 9 , -1 , -1 , 10 , -1 ));
347+ 			Dest2 = _mm_or_si128 (Dest2, _mm_shuffle_epi8 (Green, _mm_setr_epi8 (5 , -1 , -1 , 6 , -1 , -1 , 7 , -1 , -1 , 8 , -1 , -1 , 9 , -1 , -1 , 10 )));
348+ 			Dest2 = _mm_or_si128 (Dest2, _mm_shuffle_epi8 (Red, _mm_setr_epi8 (-1 , 5 , -1 , -1 , 6 , -1 , -1 , 7 , -1 , -1 , 8 , -1 , -1 , 9 , -1 , -1 )));
349+ 
350+ 			Dest3 = _mm_shuffle_epi8 (Blue, _mm_setr_epi8 (-1 , 11 , -1 , -1 , 12 , -1 , -1 , 13 , -1 , -1 , 14 , -1 , -1 , 15 , -1 , -1 ));
351+ 			Dest3 = _mm_or_si128 (Dest3, _mm_shuffle_epi8 (Green, _mm_setr_epi8 (-1 , -1 , 11 , -1 , -1 , 12 , -1 , -1 , 13 , -1 , -1 , 14 , -1 , -1 , 15 , -1 )));
352+ 			Dest3 = _mm_or_si128 (Dest3, _mm_shuffle_epi8 (Red, _mm_setr_epi8 (10 , -1 , -1 , 11 , -1 , -1 , 12 , -1 , -1 , 13 , -1 , -1 , 14 , -1 , -1 , 15 )));
353+ 
354+ 			_mm_storeu_si128 ((__m128i*)(LinePD + (XX / BlockSize) * BlockSize * 3 ), Dest1);
355+ 			_mm_storeu_si128 ((__m128i*)(LinePD + (XX / BlockSize) * BlockSize * 3  + BlockSize), Dest2);
356+ 			_mm_storeu_si128 ((__m128i*)(LinePD + (XX / BlockSize) * BlockSize * 3  + BlockSize * 2 ), Dest3);
357+ 		}
358+ 		for  (int  XX = Block * BlockSize; XX < Width; XX++, LinePU++, LinePV++, LinePY++) {
359+ 			int  YV = LinePY[XX], UV = LinePU[XX] - 128 , VV = LinePV[XX] - 128 ;
360+ 			LinePD[XX + 0 ] = ClampToByte (YV + ((B_U_WT * UV + HalfV) >> Shift));
361+ 			LinePD[XX + 1 ] = ClampToByte (YV + ((G_U_WT * UV + G_V_WT * VV + HalfV) >> Shift));
362+ 			LinePD[XX + 2 ] = ClampToByte (YV + ((R_V_WT * VV + HalfV) >> Shift));
363+ 		}
364+ 	}
365+ }
366+ 
/*
 * Stub with the same signature as YUVToRGBSSE_1. The body is empty: no
 * input is read and nothing is written to RGB.
 * NOTE(review): presumably a placeholder for a second SSE variant that has
 * not been written yet -- confirm before calling it.
 */
void YUVToRGBSSE_2(unsigned char *Y, unsigned char *U, unsigned char *V, unsigned char *RGB, int Width, int Height, int Stride) {
	/* Intentionally a no-op; the casts only mark the parameters as unused. */
	(void)Y;
	(void)U;
	(void)V;
	(void)RGB;
	(void)Width;
	(void)Height;
	(void)Stride;
}
370+ 
371+ 
372+ 
267373int  main () {
268374	Mat src = imread (" F:\\ car.jpg" 
269375	int  Height = src.rows ;
@@ -282,7 +388,7 @@ int main() {
282388	double  duration = (cv::getTickCount () - st) / cv::getTickFrequency () * 100 ;
283389	printf (" %.5f\n " 
284390	RGBToYUVSSE_2 (Src, Y, U, V, Width, Height, Stride);
285- 	YUVToRGB (Y, U, V, Dest, Width, Height, Stride);
391+ 	YUVToRGBSSE_1 (Y, U, V, Dest, Width, Height, Stride);
286392	Mat dst (Height, Width, CV_8UC3, Dest);
287393	imshow (" origin" 
288394	imshow (" result" 
0 commit comments