VLFeat.org

API docs

  • Home
    • Download and Install
    • API docs
    • Matlab docs
    • About VLFeat
  • Tutorials
    • SIFT
    • MSER
    • IKM
    • HIKM
    • AIB
    • Utils
  • Main Page
  • Related Pages
  • Data Structures
  • Files
  • Examples

imopv_sse2.tc

/* Preprocessor template preamble.  This .tc file is designed to be
 * #included more than once (once with FLOAT_TYPE == FLOAT_TYPE_FLOAT,
 * once for double) to instantiate a float and a double variant of the
 * SSE2 convolution kernels from the same body of code. */
00001 
/* Token-pasting helpers.  The two-level indirection (JOIN_ vs JOIN)
 * forces the arguments to be macro-expanded BEFORE pasting, which is
 * required for VSFX/SFX below to expand to their values. */
00007 #define JOIN_(a,b) a ## b
00008 #define JOIN(a,b) JOIN_(a,b)
00009 #define JOIN3_(a,b,c) a ## b ## c
00010 #define JOIN3(a,b,c) JOIN3_(a,b,c)
00011 
/* Undefine every template macro so this file can be re-included for the
 * other floating-point type without redefinition warnings. */
00012 #undef  FLT
00013 #undef  VSIZE
00014 #undef  VSFX
00015 #undef  SFX
00016 #undef  VTYPE
00017 #undef  VMUL
00018 #undef  VADD
00019 #undef  VSTZ
00020 #undef  VLD1
00021 #undef  ALIGNPTR
00022 #undef  ALIGNSTRIDE
00023 #undef  VL_IMCONVCOL
00024 #undef  VL_IMCONVCOLTRI
00025 
/* Per-type configuration:
 *   FLT   = scalar type, VSIZE = scalars per 128-bit SSE2 register,
 *   VSFX  = intrinsic suffix (ps/pd), SFX = exported-symbol suffix,
 *   VTYPE = SSE2 vector type. */
00026 #if (FLOAT_TYPE == FLOAT_TYPE_FLOAT)
00027 #  define FLT    float
00028 #  define VSIZE  4
00029 #  define VSFX   ps
00030 #  define SFX    vf
00031 #  define VTYPE  __m128
00032 #else
00033 #  define FLT    double
00034 #  define VSIZE  2
00035 #  define VSFX   pd
00036 #  define SFX    vd
00037 #  define VTYPE  __m128d
00038 #endif
00039 
/* Type-generic aliases for the SSE2 intrinsics (e.g. VMUL expands to
 * _mm_mul_ps or _mm_mul_pd depending on VSFX). */
00040 #define VMUL            JOIN(_mm_mul_,     VSFX)
00041 #define VADD            JOIN(_mm_add_,     VSFX)
00042 #define VSTZ            JOIN(_mm_setzero_, VSFX)
00043 #define VLD1            JOIN(_mm_load1_,   VSFX)
/* ALIGNPTR masks the low bits of a byte address: zero iff the address is
 * 16-byte aligned (sizeof(FLT)*VSIZE == 16 for both instantiations).
 * ALIGNSTRIDE masks an element count: zero iff it is a multiple of VSIZE. */
00044 #define ALIGNPTR        (sizeof(FLT) * VSIZE - 1)
00045 #define ALIGNSTRIDE     (VSIZE - 1)
/* Names of the instantiated kernels, e.g. _vl_imconvcol_vf_sse2. */
00046 #define VL_IMCONVCOL    JOIN3(_vl_imconvcol_,    SFX, _sse2)
00047 #define VL_IMCONVCOLTRI JOIN3(_vl_imconvcoltri_, SFX, _sse2)
00048 
00049 /* ---------------------------------------------------------------- */
/** Convolve the columns of an image with a 1-D filter (SSE2 kernel).
 **
 ** @param dst        output buffer.
 ** @param dst_stride distance (in elements) between output columns.
 ** @param src        input image.
 ** @param src_width  number of columns of @a src.
 ** @param src_height number of rows of @a src.
 ** @param src_stride distance (in elements) between input columns.
 ** @param filt       1-D filter, supported on [filt_begin, filt_end].
 ** @param filt_begin first index of the filter support.
 ** @param filt_end   last index of the filter support.
 ** @param step       vertical subsampling step of the output.
 ** @param flags      VL_TRANSPOSE and/or a VL_PAD_* padding mode.
 **
 ** Each column of @a src is convolved with @a filt and subsampled by
 ** @a step; out-of-image samples are either zero (VL_PAD_BY_ZERO) or
 ** replicate the nearest row (continuity padding) otherwise.  Columns
 ** are processed VSIZE at a time with SSE2 when @a src_stride is a
 ** multiple of VSIZE and the current column address is 16-byte aligned;
 ** remaining/unaligned columns fall back to a scalar path with
 ** identical arithmetic.
 **/
00050 void
00051 VL_IMCONVCOL (FLT* dst, int dst_stride,
00052               FLT const* src,
00053               int src_width, int src_height, int src_stride,
00054               FLT const* filt, int filt_begin, int filt_end,
00055               int step, unsigned int flags)
00056 {
00057   int x = 0 ;
00058   int y ;
      /* number of output rows after subsampling by step */
00059   int dheight = (src_height - 1) / step + 1 ;
      /* SIMD is possible only if consecutive columns stay mutually aligned */
00060   vl_bool use_simd  = (src_stride & ALIGNSTRIDE) == 0 ;
00061   vl_bool transp    = flags & VL_TRANSPOSE ;
00062   vl_bool zeropad   = (flags & VL_PAD_MASK) == VL_PAD_BY_ZERO ;
      /* NOTE(review): totcol/simdcol count processed columns but are never
       * read in this listing — presumably leftover instrumentation. */
00063   double totcol = 0 ;
00064   double simdcol = 0 ;
00065   
00066   /* let filt point to the last sample of the filter */
00067   filt += filt_end - filt_begin ;
00068   
00069   while (x < src_width) {
00070     /* Calculate dest[x,y] = sum_p image[x,p] filt[y - p]
00071      * where supp(filt) = [filt_begin, filt_end] = [fb,fe].
00072      * 
00073      * CHUNK_A: y - fe <= p < 0
00074      *          completes VL_MAX(fe - y, 0) samples
00075      * CHUNK_B: VL_MAX(y - fe, 0) <= p < VL_MIN(y - fb, height - 1)
00076      *          completes fe - VL_MAX(fb, height - y) + 1 samples
00077      * CHUNK_C: completes all samples
00078      */
00079     
00080     FLT const *filti ;
00081     int stop ; 
00082     
      /* Take the SIMD path when VSIZE columns remain, the column base
       * address is 16-byte aligned, and the stride keeps it so.
       * (Bitwise & on 0/1 flags is used deliberately in place of &&.) */
00083     if ((x + VSIZE < src_width) & 
00084         (((vl_intptr)(src + x) & ALIGNPTR) == 0) & 
00085         use_simd) 
00086     {
00087       /* ----------------------------------------------  Vectorized */
00088       for (y = 0 ; y < src_height ; y += step)  {
            /* union lets the vector accumulator be read back per lane */
00089         union {VTYPE v ; FLT x [VSIZE] ; } acc ;
00090         VTYPE v, c ;
00091         FLT const *srci ;
00092         acc.v = VSTZ () ;
00093         
            /* filti scans the filter backward from its last sample */
00094         filti = filt ;
00095         stop = filt_end - y ;
00096         srci = src + x - stop * src_stride ;
00097         
            /* CHUNK_A: samples above the image top.  Pad with zero or
             * replicate the first image row (continuity padding). */
00098         if (stop > 0) {
00099           if (zeropad) {
00100             v = VSTZ () ;
00101           } else {
                /* NOTE(review): aligned load through a cast — relies on
                 * the alignment check above; technically a strict-aliasing
                 * cast, standard practice in SSE code of this vintage. */
00102             v = * (VTYPE*) (src + x) ;
00103           } 
00104           while (filti > filt - stop) {
00105             c = VLD1 (filti--) ;
00106             acc.v = VADD (acc.v,  VMUL (v, c)) ;
                /* advance srci so it lands on row 0 for CHUNK_B */
00107             srci += src_stride ;
00108           }
00109         }
00110         
            /* CHUNK_B: interior samples, fully inside the image */
00111         stop = filt_end - VL_MAX(filt_begin, y - src_height + 1) + 1 ;
00112         while (filti > filt - stop) {
00113           v = * (VTYPE*) srci ; 
00114           c = VLD1 (filti--) ;
00115           acc.v = VADD (acc.v, VMUL (v, c)) ;
00116           srci += src_stride ;
00117         }
00118         
            /* CHUNK_C: samples below the image bottom; v still holds the
             * last row read in CHUNK_B (continuity) unless zero padding */
00119         if (zeropad) v = VSTZ () ;
00120         
00121         stop = filt_end - filt_begin + 1;
00122         while (filti > filt - stop) {
00123           c = VLD1 (filti--) ;
00124           acc.v = VADD (acc.v, VMUL (v, c)) ;
00125         }
00126         
            /* Scatter the VSIZE lane results; transposed output swaps the
             * roles of the row and column strides. */
00127         if (transp) {
00128           *dst = acc.x[0] ; dst += dst_stride ;
00129           *dst = acc.x[1] ; dst += dst_stride ;
00130 #if(VSIZE == 4)
00131           *dst = acc.x[2] ; dst += dst_stride ;
00132           *dst = acc.x[3] ; dst += dst_stride ;          
00133 #endif
              /* rewind to this output row, advance one output column */
00134           dst += 1 * 1 - VSIZE * dst_stride ;
00135         } else {
00136           *dst = acc.x[0] ; dst += 1 ;
00137           *dst = acc.x[1] ; dst += 1 ;      
00138 #if(VSIZE == 4)
00139           *dst = acc.x[2] ; dst += 1 ;
00140           *dst = acc.x[3] ; dst += 1 ;
00141 #endif
00142           dst += 1 * dst_stride - VSIZE * 1 ;
00143         }        
00144       } /* next y */
          /* rewind dst over the dheight rows written, advance VSIZE columns */
00145       if (transp) {
00146         dst += VSIZE * dst_stride - dheight * 1 ;
00147       } else {
00148         dst += VSIZE * 1 - dheight * dst_stride ;
00149       }
00150       x       += VSIZE ;
00151       simdcol += VSIZE ;
00152       totcol  += VSIZE ;
00153     } else {
00154       /* -------------------------------------------------  Vanilla */
          /* Scalar fallback: identical chunk structure to the SIMD path,
           * one column at a time. */
00155       for (y = 0 ; y < src_height ; y += step) {
00156         FLT acc = 0 ;  
00157         FLT v, c ;
00158         FLT const* srci ;
00159         
00160         filti = filt ;
00161         stop = filt_end - y ;
00162         srci = src + x - stop * src_stride ;
00163         
            /* CHUNK_A: pad above the image top */
00164         if (stop > 0) {   
00165           if (zeropad) {
00166             v = 0 ;
00167           } else {
00168             v = *(src + x) ;
00169           }
00170           while (filti > filt - stop) { 
00171             c = *filti-- ;
00172             acc += v * c ;
00173             srci += src_stride ;
00174           }
00175         }
00176         
            /* CHUNK_B: interior samples */
00177         stop = filt_end - VL_MAX(filt_begin, y - src_height + 1) + 1 ;
00178         while (filti > filt - stop) {
00179           v = *srci ; 
00180           c = *filti-- ;
00181           acc += v * c ;
00182           srci += src_stride ;
00183         }
00184         
            /* CHUNK_C: pad below the image bottom */
00185         if (zeropad) v = 0 ;
00186         
00187         stop = filt_end - filt_begin + 1 ;
00188         while (filti > filt - stop) {
00189           c = *filti-- ;
00190           acc += v * c ;
00191         }
00192         
00193         if (transp) {
00194           *dst = acc ; dst += 1 ;          
00195         } else {
00196           *dst = acc ; dst += dst_stride ;
00197         }
00198       } /* next y */
          /* rewind dst over the written rows, advance one column */
00199       if (transp) {
00200         dst += 1 * dst_stride - dheight * 1 ;
00201       } else {
00202         dst += 1 * 1 - dheight * dst_stride ;
00203       }
00204       x      += 1 ;
00205       totcol += 1 ;
00206     } /* next x */
00207   }
00208 }
00209 
/* NOTE(review): the entire function below is compiled out (#if 0) — it is
 * an unfinished triangular-filter column convolution.  Known issues while
 * it stays disabled: the SIMD branch is empty and additionally forced off
 * by `use_simd = 0`; `vl_ptrint` looks like a typo for `vl_intptr` (used
 * above); and the `fa` macro is never #undef'd after the function. */
00210 #if 0
00211 /* ---------------------------------------------------------------- */
/** Convolve columns with a triangular filter of half-width @a filt_size.
 ** The triangular convolution is computed as two cascaded box filters,
 ** each realized as a running-sum (integrate) followed by a difference —
 ** O(height) per column regardless of filter size.  `scale` normalizes by
 ** (filt_size+1)^-4, i.e. the two box filters' combined gain. */
00212 VL_EXPORT
00213 void
00214 VL_IMCONVCOLTRI (FLT* dst, int dst_stride,
00215                  FLT const* src,
00216                  int src_width, int src_height, int src_stride,
00217                  int filt_size,
00218                  int step, unsigned int flags)
00219 {
00220   int x = 0 ;
00221   int y ;
00222   int dheight = (src_height - 1) / step + 1 ;
00223   vl_bool use_simd  = ((src_stride & ALIGNSTRIDE) == 0) && 
00224   (! (flags & VL_NO_SIMD)) ;
00225   vl_bool transp = flags & VL_TRANSPOSE ;
00226   vl_bool zeropad = (flags & VL_PAD_MASK) == VL_PAD_BY_ZERO ;
00227   
      /* scratch column with filt_size extra slots above the image top;
       * buff is offset so indices [-filt_size, src_height) are valid */
00228   FLT * buff = vl_malloc(sizeof(FLT) * (src_height + filt_size)) ;
00229 #define fa (1.0 / (double) (filt_size + 1))
00230   FLT scale = fa*fa*fa*fa ;  
00231   buff += filt_size ;
00232   
00233   while (x < src_width) {
00234     FLT const *srci ;
00235 
        /* NOTE(review): SIMD deliberately disabled — branch below is empty */
00236     use_simd = 0 ;
00237     if ((x + VSIZE < src_width) & 
00238         (((vl_ptrint)(src + x) & ALIGNPTR) == 0) & 
00239         use_simd) 
00240     {
00241       
00242     } else {
00243       int stridex = transp ? dst_stride : 1 ;
00244       int stridey = transp ? 1 : dst_stride ;
00245       srci = src + x + src_stride * (src_height - 1) ;
00246       
00247       /* integrate backward the column */
00248       buff [src_height - 1] = *srci ;
00249       for (y = src_height-2 ; y >=  0 ; --y) {
00250         srci -= src_stride ;
00251         buff [y] = buff [y+1] + *srci ;
00252       }
          /* extend the running sum above the top: constant for zero padding,
           * replicate the first pixel (*srci) for continuity padding */
00253       if (zeropad) {
00254         for ( ; y >= - filt_size ; --y) {
00255           buff [y] = buff [y+1] ;          
00256         }
00257       } else {
00258         for ( ; y >= - filt_size ; --y) {
00259           buff [y] = buff[y+1] + *srci ;
00260         }
00261       }
00262       
00263       /* compute the filter forward */
00264       for (y = - filt_size ; y < src_height - filt_size ; ++y) {
00265         buff [y] = buff [y] - buff [y + filt_size] ;
00266       }
          /* continuity padding below the bottom: compensate the tail where
           * buff[y + filt_size] runs past the last integrated sample */
00267       if (! zeropad) {
00268         for (y = src_height - filt_size ; y < src_height ; ++y) {
00269           buff [y] = buff [y] - buff [src_height-1]  * 
00270           (src_height - filt_size - y) ;
00271         } 
00272       }
00273       
00274       /* integrate forward the column */
00275       for (y = - filt_size + 1 ; y < src_height ; ++y) {
00276         buff [y] += buff [y - 1] ;
00277       } 
00278       
00279       /* compute the filter backward */
00280       for (y = src_height - 1 ; y >= 0 ; --y) {
00281         dst [x*stridex + y*stridey] 
00282         = scale * (buff [y] - buff [y - filt_size]) ;
00283       }
00284     } /* next y */
00285     x += 1 ;
00286   }
      /* free the original allocation base (buff was offset by filt_size) */
00287   vl_free (buff - filt_size) ;
00288 }
00289 #endif
Copyright © 2008 Andrea Vedaldi and Brian Fulkerson