// Source: data-flow/include/vec_sse.h


// Copyright (C) 2001 Jean-Marc Valin
// This file contains SSE-optimized vector primitives.


#ifndef VEC_SSE_H
#define VEC_SSE_H


#ifdef _ENABLE_SSE


#define CLOBBER_SSE : "memory"   // clobber list shared by the asm blocks below


//template <>
inline void vec_mul_and_add_sse(const float a, const float *b, float *c, int len)
{
  __asm__ __volatile__ (
  "
  push %0
  push %1
  push %2
  push %3

  movss (%0), %%xmm0
  shufps $0, %%xmm0, %%xmm0

  sub $8, %2
  jb mul8_skip%=

mul8_loop%=:
  movups (%1), %%xmm1
  movups 16(%1), %%xmm2
  mulps %%xmm0, %%xmm1
  mulps %%xmm0, %%xmm2
  
  movups (%3), %%xmm3
  movups 16(%3), %%xmm4
  addps %%xmm1, %%xmm3
  addps %%xmm2, %%xmm4
  movups %%xmm3, (%3)
  movups %%xmm4, 16(%3)

  add $32, %1
  add $32, %3
  sub $8,  %2

  jae mul8_loop%=

mul8_skip%=:

  add $4, %2
  jl mul4_skip%=

  movups (%1), %%xmm1
  mulps %%xmm0, %%xmm1
  movups (%3), %%xmm3
  addps %%xmm1, %%xmm3
  movups %%xmm3, (%3)

  add $16, %1
  add $16, %3

  sub $4,  %2

mul4_skip%=:


  add $4, %2

  jmp cond1%=

mul1_loop%=:

  movss (%1), %%xmm1
  mulss %%xmm0, %%xmm1
  movss (%3), %%xmm3
  addss %%xmm1, %%xmm3
  movss %%xmm3, (%3)
  add $4, %1
  add $4, %3

cond1%=:
  sub $1, %2
  jae mul1_loop%=

  pop %3
  pop %2
  pop %1
  pop %0
  "
  : : "r" (&a), "r" (b), "q" (len), "r" (c)
  CLOBBER_SSE
  );
}
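
// For reference, a plain scalar version of the routine above: it computes
// c[i] += a*b[i] for i in [0,len). This sketch (and its name,
// vec_mul_and_add_scalar) is illustrative only and not part of the
// original API.
inline void vec_mul_and_add_scalar(const float a, const float *b, float *c, int len)
{
  for (int i=0;i<len;i++)
    c[i] += a*b[i];
}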

inline float vec_inner_prod_sse(const float *a, const float *b, int len)
{
  float sum;
  __asm__ __volatile__ (
  "
  push %%eax
  push %%edi
  push %%ecx
  xorps %%xmm3, %%xmm3
  xorps %%xmm4, %%xmm4

  sub $8, %%ecx
  jb mul8_skip%=

mul8_loop%=:
  movups (%%eax), %%xmm0
  movups (%%edi), %%xmm1
  movups 16(%%eax), %%xmm5
  movups 16(%%edi), %%xmm6
  add $32, %%eax
  add $32, %%edi
  mulps %%xmm0, %%xmm1
  mulps %%xmm5, %%xmm6
  addps %%xmm1, %%xmm3
  addps %%xmm6, %%xmm4

  sub $8,  %%ecx

  jae mul8_loop%=

mul8_skip%=:

  addps %%xmm4, %%xmm3

  add $4, %%ecx
  jl mul4_skip%=

  movups (%%eax), %%xmm0
  movups (%%edi), %%xmm1
  add $16, %%eax
  add $16, %%edi
  mulps %%xmm0, %%xmm1
  addps %%xmm1, %%xmm3

  sub $4,  %%ecx

mul4_skip%=:


  add $4, %%ecx

  jmp cond1%=

mul1_loop%=:
  movss (%%eax), %%xmm0
  movss (%%edi), %%xmm1
  add $4, %%eax
  add $4, %%edi
  mulss %%xmm0, %%xmm1
  addss %%xmm1, %%xmm3

cond1%=:
  sub $1, %%ecx
  jae mul1_loop%=

  movhlps %%xmm3, %%xmm4
  addps %%xmm4, %%xmm3
  movaps %%xmm3, %%xmm4
  //FIXME: which one?
  shufps $0x55, %%xmm4, %%xmm4
  //shufps $33, %%xmm4, %%xmm4
  addss %%xmm4, %%xmm3
  movss %%xmm3, (%%edx)
  
  pop %%ecx
  pop %%edi
  pop %%eax
  emms
  "
  : : "a" (a), "D" (b), "c" (len), "d" (&sum)
  CLOBBER_SSE
  );
  return sum;
}
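
// Scalar reference for the inner product above: returns the sum of
// a[i]*b[i]. Illustrative sketch only; the name vec_inner_prod_scalar is
// not part of the original API.
inline float vec_inner_prod_scalar(const float *a, const float *b, int len)
{
  float sum=0;
  for (int i=0;i<len;i++)
    sum += a[i]*b[i];
  return sum;
}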

// WARNING:
// FIXME: Does not work yet with lengths that are not a multiple of 4
// (the 8- and 4-wide loops below have no scalar tail).
inline float vec_mahalanobis2_mul4_sse(const float *a, const float *b, const float *c, int len)
{
   float sum=0;
   __asm__ __volatile__ (
   "
   push %%eax
   push %%esi
   push %%edi
   push %%ecx
   xorps %%xmm4, %%xmm4
   xorps %%xmm5, %%xmm5

   sub $8, %%ecx
   jb mul8_skip%=

mul8_loop%=:
   movups (%%eax), %%xmm0
   movups (%%edi), %%xmm1
   movups 16(%%eax), %%xmm2
   movups 16(%%edi), %%xmm3
   movups (%%esi), %%xmm6
   movups 16(%%esi), %%xmm7
   add $32, %%eax
   add $32, %%edi
   add $32, %%esi
   subps %%xmm0, %%xmm1
   subps %%xmm2, %%xmm3
   mulps %%xmm1, %%xmm1
   mulps %%xmm3, %%xmm3
   mulps %%xmm6, %%xmm1
   mulps %%xmm7, %%xmm3
   addps %%xmm1, %%xmm4
   addps %%xmm3, %%xmm5

   sub $8,  %%ecx
   jae mul8_loop%=

mul8_skip%=:
   addps %%xmm5, %%xmm4


   add $4, %%ecx
   jl mul4_skip%=

   movups (%%eax), %%xmm0
   movups (%%edi), %%xmm1
   movups (%%esi), %%xmm6
   add $16, %%eax
   add $16, %%edi
   add $16, %%esi

   subps %%xmm0, %%xmm1
   mulps %%xmm1, %%xmm1
   mulps %%xmm6, %%xmm1
   addps %%xmm1, %%xmm4

   sub $4,  %%ecx

mul4_skip%=:



   movaps %%xmm4, %%xmm3


   movhlps %%xmm3, %%xmm4
   addps %%xmm4, %%xmm3
   movaps %%xmm3, %%xmm4
   shufps $33, %%xmm4, %%xmm4
   addss %%xmm4, %%xmm3
   movss %%xmm3, (%%edx)


   pop %%ecx
   pop %%edi
   pop %%esi
   pop %%eax
   emms
   "
   : : "a" (a), "S" (c), "D" (b), "c" (len), "d" (&sum)
   CLOBBER_SSE
   );
    
   return sum;
}
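
// Scalar reference for the weighted squared distance above: the sum of
// c[i]*(a[i]-b[i])^2. Unlike the SSE version, this sketch has no
// multiple-of-4 restriction on len. Illustrative only; the name
// vec_mahalanobis2_scalar is not part of the original API.
inline float vec_mahalanobis2_scalar(const float *a, const float *b, const float *c, int len)
{
   float sum=0;
   for (int i=0;i<len;i++)
   {
      float d = a[i]-b[i];
      sum += c[i]*d*d;
   }
   return sum;
}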


// WARNING:
// FIXME: Does not work yet with lengths that are not a multiple of 4
// (the 8- and 4-wide loops below have no scalar tail).
inline float vec_dist2_mul4_sse(const float *a, const float *b, int len)
{
   float sum=0;
   __asm__ __volatile__ (
   "
   push %%eax
   push %%edi
   push %%ecx
   xorps %%xmm4, %%xmm4
   xorps %%xmm5, %%xmm5

   sub $8, %%ecx
   jb mul8_skip%=

mul8_loop%=:
   movups (%%eax), %%xmm0
   movups (%%edi), %%xmm1
   movups 16(%%eax), %%xmm2
   movups 16(%%edi), %%xmm3
   add $32, %%eax
   add $32, %%edi
   subps %%xmm0, %%xmm1
   subps %%xmm2, %%xmm3
   mulps %%xmm1, %%xmm1
   mulps %%xmm3, %%xmm3
   addps %%xmm1, %%xmm4
   addps %%xmm3, %%xmm5

   sub $8,  %%ecx
   jae mul8_loop%=

mul8_skip%=:
   addps %%xmm5, %%xmm4


   add $4, %%ecx
   jl mul4_skip%=

   movups (%%eax), %%xmm0
   movups (%%edi), %%xmm1
   add $16, %%eax
   add $16, %%edi

   subps %%xmm0, %%xmm1
   mulps %%xmm1, %%xmm1
   addps %%xmm1, %%xmm4

   sub $4,  %%ecx

mul4_skip%=:



   movaps %%xmm4, %%xmm3


   movhlps %%xmm3, %%xmm4
   addps %%xmm4, %%xmm3
   movaps %%xmm3, %%xmm4
   shufps $33, %%xmm4, %%xmm4
   addss %%xmm4, %%xmm3
   movss %%xmm3, (%%edx)


   pop %%ecx
   pop %%edi
   pop %%eax
   emms
   "
   : : "a" (a), "D" (b), "c" (len), "d" (&sum)
   CLOBBER_SSE
   );
    
   return sum;
}
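
// Scalar reference for the squared Euclidean distance above: the sum of
// (a[i]-b[i])^2, again without the multiple-of-4 restriction on len.
// Illustrative only; the name vec_dist2_scalar is not part of the
// original API.
inline float vec_dist2_scalar(const float *a, const float *b, int len)
{
   float sum=0;
   for (int i=0;i<len;i++)
   {
      float d = a[i]-b[i];
      sum += d*d;
   }
   return sum;
}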


#else /* _ENABLE_SSE */


#include "BaseException.h"

#define ERROR_SSE_NI {throw new GeneralException("Trying to use SSE, but Overflow was not compiled with _ENABLE_SSE. Bad, bad, this should never happen", __FILE__, __LINE__);}

inline void vec_mul_and_add_sse(const float a, const float *b, float *c, int len)
ERROR_SSE_NI

inline float vec_inner_prod_sse(const float *a, const float *b, int len)
ERROR_SSE_NI

inline float vec_mahalanobis2_mul4_sse(const float *a, const float *b, const float *c, int len)
ERROR_SSE_NI

inline float vec_dist2_mul4_sse(const float *a, const float *b, int len)
ERROR_SSE_NI

#endif /* !_ENABLE_SSE */


#endif /* VEC_SSE_H */
