Source: data-flow/include/vec.h
|
|
|
|
// Copyright (C) 2001 Jean-Marc Valin
//This file contains vector primitives and supports 3DNow! and SSE (incomplete) optimization
#ifndef VEC_H
#define VEC_H
#include <math.h>
#include "vec_sse.h"
#include "vec_3dnow.h"
#include "iextensions.h"
/* Prefetch routines, may be used on any type (not only float and SSE2 double) */
#ifdef _USE_3DNOW /*This means we're likely to have 64-byte cache lines (unless it's a K6-II)*/
#ifndef CACHE_LINES
#define CACHE_LINES 64
#endif /*CACHE_LINES */
#ifndef MAX_PREFETCH
#define MAX_PREFETCH 8192 /* The Athlon L1 size is 64 kB */
#endif /*MAX_PREFETCH*/
#ifndef CACHE_MASK
#define CACHE_MASK 0xffffffd0
#endif /*CACHE_MASK */
#else /*_USE_3DNOW*/
#ifndef CACHE_LINES
#define CACHE_LINES 32
#endif /*CACHE_LINES */
#ifndef CACHE_MASK
#define CACHE_MASK 0xffffffe0
#endif /*CACHE_MASK */
#ifndef MAX_PREFETCH
#define MAX_PREFETCH 4096 /* The PIII L1 size is 32 kB */
#endif /*MAX_PREFETCH*/
#endif
/*template <class T>
void vec_prefetchnta(T *a, int len)
{
int size=sizeof(T)*len;
if (size > MAX_PREFETCH)
return;
__asm__ __volatile__ (
"
push %%eax
push %%ecx
prefetch_loop%=:
prefetchnta (%%eax)
add %%edi, %%eax
sub %%edi, %%ecx
ja prefetch_loop%=
pop %%ecx
pop %%eax
"
: : "a" (a), "D" (CACHE_LINES), "c" (size)
:
);
}
*/
template <class T>
inline void vec_copy(const T *x, T *y, int len)
{
while (len > 3)
{
*y++ = *x++;
*y++ = *x++;
*y++ = *x++;
*y++ = *x++;
len -= 4;
}
while (len)
{
*y++ = *x++;
len--;
}
}
template <class T>
inline T vec_inner_prod(const T *a, const T *b, int len)
{
T sum1=0, sum2=0, sum3=0, sum4=0;
const T *end = a+len;
while (a<end-3)
{
sum1+=a[0]*b[0];
sum2+=a[1]*b[1];
sum3+=a[2]*b[2];
sum4+=a[3]*b[3];
a+=4;
b+=4;
}
while (a<end)
{
sum1+=a[0]*b[0];
a++; b++;
}
return (sum1+sum2)+(sum3+sum4);
}
template <class T>
inline void vec_add_vec(const T *a, const T *b, T *c, int len)
{
const T *end = a+len;
while (a<end-3)
{
c[0]=a[0]+b[0];
c[1]=a[1]+b[1];
c[2]=a[2]+b[2];
c[3]=a[3]+b[3];
a+=4;
b+=4;
c+=4;
}
while (a<end)
{
c[0]=a[0]+b[0];
a++; b++; c++;
}
}
template <class T>
inline void vec_sub_vec(const T *a, const T *b, T *c, int len)
{
const T *end = a+len;
while (a<end-3)
{
c[0]=a[0]-b[0];
c[1]=a[1]-b[1];
c[2]=a[2]-b[2];
c[3]=a[3]-b[3];
a+=4;
b+=4;
c+=4;
}
while (a<end)
{
c[0]=a[0]-b[0];
a++; b++; c++;
}
}
template <class T>
inline void vec_mul_vec(const T *a, const T *b, T *c, int len)
{
const T *end = a+len;
while (a<end-3)
{
c[0]=a[0]*b[0];
c[1]=a[1]*b[1];
c[2]=a[2]*b[2];
c[3]=a[3]*b[3];
a+=4;
b+=4;
c+=4;
}
while (a<end)
{
c[0]=a[0]*b[0];
a++; b++; c++;
}
}
template <class T>
inline void vec_mul_and_add(const T *a, const T *b, T *c, int len)
{
const T *end = a+len;
while (a<end-3)
{
c[0]+=a[0]*b[0];
c[1]+=a[1]*b[1];
c[2]+=a[2]*b[2];
c[3]+=a[3]*b[3];
a+=4;
b+=4;
c+=4;
}
while (a<end)
{
c[0]+=a[0]*b[0];
a++; b++; c++;
}
}
template <class T>
inline void vec_mul_and_add(const T a, const T *b, T *c, int len)
{
const T *end = b+len;
while (b<end-3)
{
c[0]+=a*b[0];
c[1]+=a*b[1];
c[2]+=a*b[2];
c[3]+=a*b[3];
b+=4;
c+=4;
}
while (b<end)
{
c[0]+=a*b[0];
b++; c++;
}
}
template <class T>
inline void vec_div_vec(const T *a, const T *b, T *c, int len)
{
const T *end = a+len;
while (a<end-3)
{
c[0]=a[0]/b[0];
c[1]=a[1]/b[1];
c[2]=a[2]/b[2];
c[3]=a[3]/b[3];
a+=4;
b+=4;
c+=4;
}
while (a<end)
{
c[0]=a[0]/b[0];
a++; b++; c++;
}
}
template <class T>
inline void vec_add_scal(const T a, const T *b, T *c, int len)
{
const T *end = b+len;
while (b<end-3)
{
c[0]=a+b[0];
c[1]=a+b[1];
c[2]=a+b[2];
c[3]=a+b[3];
b+=4;
c+=4;
}
while (b<end)
{
c[0]=a+b[0];
b++; c++;
}
}
template <class T>
inline void vec_mul_scal(const T a, const T *b, T *c, int len)
{
const T *end = b+len;
while (b<end-3)
{
c[0]=a*b[0];
c[1]=a*b[1];
c[2]=a*b[2];
c[3]=a*b[3];
b+=4;
c+=4;
}
while (b<end)
{
c[0]=a*b[0];
b++; c++;
}
}
template <class T>
inline T vec_dist2(const T *a, const T *b, int len)
{
T sum1=0, sum2=0, sum3=0, sum4=0;
const T *end = a+len;
while (a<end-3)
{
sum1+=(a[0]-b[0])*(a[0]-b[0]);
sum2+=(a[1]-b[1])*(a[1]-b[1]);
sum3+=(a[2]-b[2])*(a[2]-b[2]);
sum4+=(a[3]-b[3])*(a[3]-b[3]);
a+=4;
b+=4;
}
while (a<end)
{
sum1+=(a[0]-b[0])*(a[0]-b[0]);
a++; b++;
}
return (sum1+sum2)+(sum3+sum4);
}
template <class T>
inline T vec_mahalanobis2(const T *a, const T *b, const T *c, int len)
{
T sum1=0, sum2=0, sum3=0, sum4=0;
const T *end = a+len;
while (a<end-3)
{
sum1+=c[0]*(a[0]-b[0])*(a[0]-b[0]);
sum2+=c[1]*(a[1]-b[1])*(a[1]-b[1]);
sum3+=c[2]*(a[2]-b[2])*(a[2]-b[2]);
sum4+=c[3]*(a[3]-b[3])*(a[3]-b[3]);
a+=4;
b+=4;
c+=4;
}
while (a<end)
{
sum1+=c[0]*(a[0]-b[0])*(a[0]-b[0]);
a++; b++; c++;
}
return (sum1+sum2)+(sum3+sum4);
}
template <class T>
inline T vec_sum(const T *a, int len)
{
T sum1=0, sum2=0, sum3=0, sum4=0;
const T *end = a+len;
while (a<end-3)
{
sum1+=a[0];
sum2+=a[1];
sum3+=a[2];
sum4+=a[3];
a+=4;
}
while (a<end)
{
sum1+=a[0];
a++;
}
return (sum1+sum2)+(sum3+sum4);
}
template <class T>
inline T vec_norm2(const T *a, int len)
{
T sum1=0, sum2=0, sum3=0, sum4=0;
const T *end = a+len;
while (a<end-3)
{
sum1+=a[0]*a[0];
sum2+=a[1]*a[1];
sum3+=a[2]*a[2];
sum4+=a[3]*a[4];
a+=4;
}
while (a<end)
{
sum1+=a[0]*a[0];
a++;
}
return (sum1+sum2)+(sum3+sum4);
}
template <class T>
inline void vec_inv(const T *a, T *b, int len)
{
const T *end = a+len;
while (a<end-3)
{
b[0]=1/a[0];
b[1]=1/a[1];
b[2]=1/a[2];
b[3]=1/a[3];
a+=4; b+=4;
}
while (a<end)
{
b[0]=1/a[0];
a++; b++;
}
}
template <class T>
inline void vec_sqrt(const T *a, T *b, int len)
{
const T *end = a+len;
while (a<end-3)
{
b[0]=sqrt(a[0]);
b[1]=sqrt(a[1]);
b[2]=sqrt(a[2]);
b[3]=sqrt(a[3]);
a+=4; b+=4;
}
while (a<end)
{
b[0]=sqrt(a[0]);
a++; b++;
}
}
template <class T>
inline void vec_corr_cont(const T *a, T *filt, T *out, int len, int filtLen)
{
for (int i=0;i<len;i++)
out[i] = vec_inner_prod(a-filtLen+1, filt, filtLen);
}
template <class T>
inline void vec_conv_cont(const T *a, T *filt, T *out, int len, int filtLen)
{
T filt2[filtLen];
for (int i=0;i<filtLen;i++)
filt2[i] = filt[filtLen-i-1];
for (int i=0;i<len;i++)
out[i] = vec_inner_prod(a-filtLen+1, filt2, filtLen);
}
/*Here are float (REAL*4) specific versions of the routines */
inline float vec_inner_prod_float(const float *a, const float *b, int len)
{
float sum1=0, sum2=0, sum3=0, sum4=0;
const float *end = a+len;
while (a<end-3)
{
sum1+=a[0]*b[0];
sum2+=a[1]*b[1];
sum3+=a[2]*b[2];
sum4+=a[3]*b[3];
a+=4;
b+=4;
}
while (a<end)
{
sum1+=a[0]*b[0];
a++; b++;
}
return (sum1+sum2)+(sum3+sum4);
}
template <>
inline float vec_inner_prod<float>(const float *a, const float *b, int len)
{
#ifndef WIN32
#ifdef _ENABLE_3DNOW
if (len >= 8 && IExtensions::have3DNow())
return vec_inner_prod_3dnow(a,b,len);
else
#endif
#ifdef _ENABLE_SSE
if (len >=8 && IExtensions::haveSSE())
return vec_inner_prod_sse(a,b,len);
else
#endif
#endif
return vec_inner_prod_float(a,b,len);
}
#endif /* ifndef VEC_H*/
Generated by: jmvalin@usw-pr-shell2 on Mon Jun 24 00:06:36 2002, using kdoc 2.0a40. |