00001
/* Token-pasting helpers. The two-level form forces the arguments to be
 * macro-expanded before they are concatenated, so JOIN(prefix_, SFX)
 * pastes the value of SFX rather than the literal token "SFX". */
#define JOIN_(a,b) a ## b
#define JOIN(a,b) JOIN_(a,b)
#define JOIN3_(a,b,c) a ## b ## c
#define JOIN3(a,b,c) JOIN3_(a,b,c)

/* Drop any earlier definitions of the per-type configuration macros
 * (presumably so this code can be instantiated once per FLOAT_TYPE). */
#undef FLT
#undef VSIZE
#undef SFX
#undef VTYPE
#undef ALIGNPTR
#undef ALIGNSTRIDE
#undef VL_IMCONVCOL
#undef VL_IMCONVCOLTRI

/* Per-type configuration:
 *   FLT    scalar element type
 *   VSIZE  number of FLT elements in one VTYPE vector
 *   SFX    suffix appended to the generated function names
 *   VTYPE  128-bit SSE vector type holding FLT elements
 */
#if (FLOAT_TYPE == FLOAT_TYPE_FLOAT)
# define FLT float
# define VSIZE 4
# define SFX vf
# define VTYPE __m128
#else
# define FLT double
/* An __m128d holds two doubles. VSIZE must be defined in this branch too,
 * otherwise ALIGNPTR and ALIGNSTRIDE expand to an undefined identifier. */
# define VSIZE 2
/* VSFX is not referenced in this part of the file; it is kept in case
 * code elsewhere depends on it. */
# define VSFX pd
# define SFX vd
# define VTYPE __m128d
#endif

/* Bit masks for testing alignment to a full vector: ALIGNPTR applies to
 * byte addresses, ALIGNSTRIDE to strides counted in FLT elements. */
#define ALIGNPTR (sizeof(FLT) * VSIZE - 1)
#define ALIGNSTRIDE (VSIZE - 1)

/* Names of the functions generated for the selected type, e.g.
 * vl_imconvcol_vf and vl_imconvcoltri_vf for float. */
#define VL_IMCONVCOL JOIN(vl_imconvcol_, SFX)
#define VL_IMCONVCOLTRI JOIN(vl_imconvcoltri_, SFX)
00036
00037
00038 VL_EXPORT
00039 void
00040 VL_IMCONVCOL (FLT* dst, int dst_stride,
00041 FLT const* src,
00042 int src_width, int src_height, int src_stride,
00043 FLT const* filt, int filt_begin, int filt_end,
00044 int step, unsigned int flags)
00045 {
00046 int x = 0 ;
00047 int y ;
00048 int dheight = (src_height - 1) / step + 1 ;
00049 vl_bool transp = flags & VL_TRANSPOSE ;
00050 vl_bool zeropad = (flags & VL_PAD_MASK) == VL_PAD_BY_ZERO ;
00051
00052
00053 #ifdef VL_SUPPORT_SSE2
00054 if (vl_cpu_has_sse2() && vl_get_simd_enabled()) {
00055 JOIN3(_,VL_IMCONVCOL,_sse2)
00056 (dst,dst_stride,
00057 src,src_width,src_height,src_stride,
00058 filt,filt_begin,filt_end,
00059 step,flags) ;
00060 return ;
00061 }
00062 #endif
00063
00064
00065 filt += filt_end - filt_begin ;
00066
00067 while (x < src_width) {
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077 FLT const *filti ;
00078 int stop ;
00079
00080 for (y = 0 ; y < src_height ; y += step) {
00081 FLT acc = 0 ;
00082 FLT v = 0, c ;
00083 FLT const* srci ;
00084
00085 filti = filt ;
00086 stop = filt_end - y ;
00087 srci = src + x - stop * src_stride ;
00088
00089 if (stop > 0) {
00090 if (zeropad) {
00091 v = 0 ;
00092 } else {
00093 v = *(src + x) ;
00094 }
00095 while (filti > filt - stop) {
00096 c = *filti-- ;
00097 acc += v * c ;
00098 srci += src_stride ;
00099 }
00100 }
00101
00102 stop = filt_end - VL_MAX(filt_begin, y - src_height + 1) + 1 ;
00103 while (filti > filt - stop) {
00104 v = *srci ;
00105 c = *filti-- ;
00106 acc += v * c ;
00107 srci += src_stride ;
00108 }
00109
00110 if (zeropad) v = 0 ;
00111
00112 stop = filt_end - filt_begin + 1 ;
00113 while (filti > filt - stop) {
00114 c = *filti-- ;
00115 acc += v * c ;
00116 }
00117
00118 if (transp) {
00119 *dst = acc ; dst += 1 ;
00120 } else {
00121 *dst = acc ; dst += dst_stride ;
00122 }
00123 }
00124 if (transp) {
00125 dst += 1 * dst_stride - dheight * 1 ;
00126 } else {
00127 dst += 1 * 1 - dheight * dst_stride ;
00128 }
00129 x += 1 ;
00130 }
00131 }
00132
00133
00134
00135 VL_EXPORT
00136 void
00137 VL_IMCONVCOLTRI (FLT* dst, int dst_stride,
00138 FLT const* src,
00139 int src_width, int src_height, int src_stride,
00140 int filt_size,
00141 int step, unsigned int flags)
00142 {
00143 int x = 0 ;
00144 int y ;
00145 int dheight = (src_height - 1) / step + 1 ;
00146 vl_bool transp = flags & VL_TRANSPOSE ;
00147 vl_bool zeropad = (flags & VL_PAD_MASK) == VL_PAD_BY_ZERO ;
00148 #define fa ((double)(filt_size))
00149 FLT scale = ((FLT) (1.0/(fa*fa))) ;
00150 FLT * buff = vl_malloc(sizeof(FLT) * (src_height + filt_size)) ;
00151 buff += filt_size ;
00152
00153 while (x < src_width) {
00154 FLT const *srci ;
00155 srci = src + x + src_stride * (src_height - 1) ;
00156
00157
00158 buff [src_height - 1] = *srci ;
00159 for (y = src_height-2 ; y >= 0 ; --y) {
00160 srci -= src_stride ;
00161 buff [y] = buff [y+1] + *srci ;
00162 }
00163 if (zeropad) {
00164 for ( ; y >= - filt_size ; --y) {
00165 buff [y] = buff [y+1] ;
00166 }
00167 } else {
00168 for ( ; y >= - filt_size ; --y) {
00169 buff [y] = buff[y+1] + *srci ;
00170 }
00171 }
00172
00173
00174 for (y = - filt_size ; y < src_height - filt_size ; ++y) {
00175 buff [y] = buff [y] - buff [y + filt_size] ;
00176 }
00177 if (! zeropad) {
00178 for (y = src_height - filt_size ; y < src_height ; ++y) {
00179 buff [y] = buff [y] - buff [src_height-1] *
00180 (src_height - filt_size - y) ;
00181 }
00182 }
00183
00184
00185 for (y = - filt_size + 1 ; y < src_height ; ++y) {
00186 buff [y] += buff [y - 1] ;
00187 }
00188
00189
00190 {
00191 int stride = transp ? 1 : dst_stride ;
00192 dst += dheight * stride ;
00193 for (y = step * (dheight - 1) ; y >= 0 ; y -= step) {
00194 dst -= stride ;
00195 *dst = scale * (buff [y] - buff [y - filt_size]) ;
00196 }
00197 }
00198 x += 1 ;
00199 dst += transp ? dst_stride : 1 ;
00200 }
00201 vl_free (buff - filt_size) ;
00202 }