#include <stdio.h>
#include <vector>
#include <opencv2/opencv.hpp>
#include "../../OpencvTest/OpencvTest/Core.h"
#include "../../OpencvTest/OpencvTest/MaxFilter.h"
#include "../../OpencvTest/OpencvTest/Utility.h"
#include "../../OpencvTest/OpencvTest/BoxFilter.h"
7+ using namespace std ;
8+ using namespace cv ;
9+ #define __SSSE3__ 1
10+
11+ void BoxBlur_SSE (unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Channel, int Radius) {
12+ TMatrix a, b;
13+ TMatrix *p1 = &a, *p2 = &b;
14+ TMatrix **p3 = &p1, **p4 = &p2;
15+ IS_CreateMatrix (Width, Height, IS_DEPTH_8U, Channel, p3);
16+ IS_CreateMatrix (Width, Height, IS_DEPTH_8U, Channel, p4);
17+ (p1)->Data = Src;
18+ (p2)->Data = Dest;
19+ BoxBlur_SSE (p1, p2, Radius, EdgeMode::Smear);
20+ }
21+
// Returns the sign of X: -1 for negative values, 0 for zero, +1 for positive.
//
// Written with comparisons instead of the shift/negate bit trick: negating
// INT_MIN overflows (undefined behaviour) and right-shifting a negative int
// is implementation-defined before C++20. The comparison form is well defined
// for every int.
int IM_Sign(int X) {
    return (X > 0) - (X < 0);
}
25+
// Saturates Value to the range of an unsigned byte: anything below 0 becomes 0,
// anything above 255 becomes 255, everything else is returned unchanged.
inline unsigned char IM_ClampToByte(int Value)
{
    // A branch-free variant is possible:
    //   (Value | ((signed int)(255 - Value) >> 31)) & ~((signed int)Value >> 31)
    // The plain comparisons are kept for readability.
    if (Value > 255)
        return 255;
    if (Value < 0)
        return 0;
    return (unsigned char)Value;
}
36+
37+
// Per-lane sign of eight signed 16-bit integers: each lane of the result is
// -1, 0 or +1 according to the sign of the matching lane of v.
inline __m128i _mm_sgn_epi16(__m128i v) {
#ifdef __SSSE3__
    // SSSE3 and later: PSIGNW applied to a vector of ones yields the sign directly.
    const __m128i Ones = _mm_set1_epi16(1);
    return _mm_sign_epi16(Ones, v);
#else
    // SSE2/SSE3: clamp every lane to [-1, 1] with PMINSW followed by PMAXSW.
    // The two constants could also be derived from a compare:
    //   _mm_set1_epi16(1)  = _mm_srli_epi16(_mm_cmpeq_epi16(v, v), 15)
    //   _mm_set1_epi16(-1) = _mm_cmpeq_epi16(v, v)
    const __m128i Upper = _mm_set1_epi16(1);
    const __m128i Lower = _mm_set1_epi16(-1);
    const __m128i Capped = _mm_min_epi16(v, Upper);
    return _mm_max_epi16(Capped, Lower);
#endif
}
50+
51+ void MultiScaleSharpen (unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Radius) {
52+ int Channel = Stride / Width;
53+ unsigned char *B1 = (unsigned char *)malloc (Height * Stride * sizeof (unsigned char ));
54+ unsigned char *B2 = (unsigned char *)malloc (Height * Stride * sizeof (unsigned char ));
55+ unsigned char *B3 = (unsigned char *)malloc (Height * Stride * sizeof (unsigned char ));
56+ BoxBlur_SSE (Src, B1, Width, Height, Channel, Stride, Radius);
57+ BoxBlur_SSE (Src, B2, Width, Height, Channel, Stride, Radius * 2 );
58+ BoxBlur_SSE (Src, B3, Width, Height, Channel, Stride, Radius * 4 );
59+ for (int Y = 0 ; Y < Height * Stride; Y++) {
60+ int DiffB1 = Src[Y] - B1[Y];
61+ int DiffB2 = B1[Y] - B2[Y];
62+ int DiffB3 = B2[Y] - B3[Y];
63+ Dest[Y] = IM_ClampToByte (((4 - 2 * IM_Sign (DiffB1)) * DiffB1 + 2 * DiffB2 + DiffB3) / 4 + Src[Y]);
64+ }
65+ }
66+
67+ void MultiScaleSharpen_SSE (unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Radius) {
68+ int Channel = Stride / Width;
69+ unsigned char *B1 = (unsigned char *)malloc (Height * Stride * sizeof (unsigned char ));
70+ unsigned char *B2 = (unsigned char *)malloc (Height * Stride * sizeof (unsigned char ));
71+ unsigned char *B3 = (unsigned char *)malloc (Height * Stride * sizeof (unsigned char ));
72+ BoxBlur_SSE (Src, B1, Width, Height, Channel, Stride, Radius);
73+ BoxBlur_SSE (Src, B2, Width, Height, Channel, Stride, Radius * 2 );
74+ BoxBlur_SSE (Src, B3, Width, Height, Channel, Stride, Radius * 4 );
75+ int BlockSize = 8 , Block = (Height * Stride) / BlockSize;
76+ __m128i Zero = _mm_setzero_si128 ();
77+ __m128i Four = _mm_set1_epi16 (4 );
78+ for (int Y = 0 ; Y < Block * BlockSize; Y += BlockSize) {
79+ __m128i SrcV = _mm_unpacklo_epi8 (_mm_loadl_epi64 ((__m128i *)(Src + Y)), Zero);
80+ __m128i SrcB1 = _mm_unpacklo_epi8 (_mm_loadl_epi64 ((__m128i *)(B1 + Y)), Zero);
81+ __m128i SrcB2 = _mm_unpacklo_epi8 (_mm_loadl_epi64 ((__m128i *)(B2 + Y)), Zero);
82+ __m128i SrcB3 = _mm_unpacklo_epi8 (_mm_loadl_epi64 ((__m128i *)(B3 + Y)), Zero);
83+ __m128i DiffB1 = _mm_sub_epi16 (SrcV, SrcB1);
84+ __m128i DiffB2 = _mm_sub_epi16 (SrcB1, SrcB2);
85+ __m128i DiffB3 = _mm_sub_epi16 (SrcB2, SrcB3);
86+ // __m128i Offset = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(Four, _mm_slli_epi16(_mm_sgn_epi16(DiffB1), 1)), DiffB1), _mm_slli_epi16(DiffB2, 1)), DiffB3), 2);
87+ __m128i Offset = _mm_add_epi16 (_mm_srai_epi16 (_mm_sub_epi16 (_mm_slli_epi16 (_mm_sub_epi16 (SrcB1, _mm_sign_epi16 (DiffB1, DiffB1)), 1 ), _mm_add_epi16 (SrcB2, SrcB3)), 2 ), DiffB1);
88+ _mm_storel_epi64 ((__m128i *)(Dest + Y), _mm_packus_epi16 (_mm_add_epi16 (SrcV, Offset), Zero));
89+ }
90+ for (int Y = Block * BlockSize; Y < Height * Stride; Y++) {
91+ int DiffB1 = Src[Y] - B1[Y];
92+ int DiffB2 = B1[Y] - B2[Y];
93+ int DiffB3 = B2[Y] - B3[Y];
94+ Dest[Y] = IM_ClampToByte (((4 - 2 * IM_Sign (DiffB1)) * DiffB1 + 2 * DiffB2 + DiffB3) / 4 + Src[Y]);
95+ }
96+ }
97+
98+ int main () {
99+ Mat src = imread (" F:\\ car.jpg" );
100+ int Height = src.rows ;
101+ int Width = src.cols ;
102+ unsigned char *Src = src.data ;
103+ unsigned char *Dest = new unsigned char [Height * Width * 3 ];
104+ int Stride = Width * 3 ;
105+ int Radius = 5 ;
106+ int64 st = cvGetTickCount ();
107+ for (int i = 0 ; i <10 ; i++) {
108+ // Mat temp = MaxFilter(src, Radius);
109+ MultiScaleSharpen_SSE (Src, Dest, Width, Height, Stride, Radius);
110+ }
111+ double duration = (cv::getTickCount () - st) / cv::getTickFrequency () * 100 ;
112+ printf (" %.5f\n " , duration);
113+ MultiScaleSharpen (Src, Dest, Width, Height, Stride, Radius);
114+ Mat dst (Height, Width, CV_8UC3, Dest);
115+ imshow (" origin" , src);
116+ imshow (" result" , dst);
117+ imwrite (" F:\\ res.jpg" , dst);
118+ waitKey (0 );
119+ return 0 ;
120+ }
0 commit comments