Author: Michael R. Crusoe <michael.crusoe@gmail.com>
Description: Use the "SIMD Everywhere" (SIMDe) headers in place of the x86-specific
 SSE3/AVX intrinsic headers (<xmmintrin.h>, <pmmintrin.h>, <immintrin.h>), so the
 vectorized code paths also build on non-x86 architectures. The scalar fallback
 code previously guarded by __SIM_SSE3 / _OPTIMIZED_FUNCTIONS is removed, since
 SIMDe provides portable implementations of the intrinsics on all platforms.
--- examl.orig/examl/avxLikelihood.c
+++ examl/examl/avxLikelihood.c
@@ -10,9 +10,8 @@
 #include <limits.h>
 #include "axml.h"
 #include <stdint.h>
-#include <xmmintrin.h>
-#include <pmmintrin.h>
-#include <immintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/avx2.h"
 
 #ifdef _FMA
 #include <x86intrin.h>
--- examl.orig/examl/evaluateGenericSpecial.c
+++ examl/examl/evaluateGenericSpecial.c
@@ -43,11 +43,8 @@
 
 /* includes for using SSE3 intrinsics */
 
-#ifdef __SIM_SSE3
-#include <xmmintrin.h>
-#include <pmmintrin.h>
-/*#include <tmmintrin.h>*/
-#endif
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/sse3.h"
 
 #ifdef __MIC_NATIVE
 #include "mic_native.h"
@@ -1174,7 +1171,6 @@
     {               
       for (i = 0; i < n; i++) 
 	{
-#ifdef __SIM_SSE3  	  
 	  __m128d 
 	    tv = _mm_setzero_pd();
 	      	 	  	 
@@ -1203,20 +1199,6 @@
 	  tv = _mm_hadd_pd(tv, tv);
 	  _mm_storel_pd(&term, tv);
 	  
-#else	  	  	  	  
-	  for(j = 0, term = 0.0; j < 4; j++)
-	    {
-	      double 
-		t = 0.0;
-	      
-	      left = &(tipVector[j][20 * tipX1[i]]);
-	      right = &(x2[80 * i + 20 * j]);
-	      for(l = 0; l < 20; l++)
-		t += left[l] * right[l] * diagptable[j * 20 + l];	      
-
-	      term += weights[j] * t;
-	    }	  
-#endif
 	  
 	  if(fastScaling)
 	    term = LOG(FABS(term));
@@ -1230,7 +1212,6 @@
     {
       for (i = 0; i < n; i++) 
 	{	  	 	             
-#ifdef __SIM_SSE3        
 	  __m128d 
 	    tv = _mm_setzero_pd();	 	  	  
 	      
@@ -1258,22 +1239,6 @@
 	  tv = _mm_hadd_pd(tv, tv);
 	  _mm_storel_pd(&term, tv);	  
 	  
-#else
-	  for(j = 0, term = 0.0; j < 4; j++)
-	    {
-	      double
-		t = 0.0;
-	      
-	      left  = &(x1[80 * i + 20 * j]);
-	      right = &(x2[80 * i + 20 * j]);	    
-	      
-	      for(l = 0; l < 20; l++)
-		t += left[l] * right[l] * diagptable[j * 20 + l];	
-
-	      term += weights[j] * t;
-	    }
-#endif
-	  
 	  if(fastScaling)
 	    term = LOG(FABS(term));
 	  else
--- examl.orig/examl/evaluatePartialGenericSpecial.c
+++ examl/examl/evaluatePartialGenericSpecial.c
@@ -40,14 +40,10 @@
 #include <string.h>
 #include "axml.h"
 
-#ifdef __SIM_SSE3
-#include <xmmintrin.h>
-#include <pmmintrin.h>
-#endif
-
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/sse3.h"
 
-
-#if defined(_OPTIMIZED_FUNCTIONS) && !defined(__MIC_NATIVE)
+#if !defined(__MIC_NATIVE)
 static inline void computeVectorGTRCATPROT(double *lVector, int *eVector, double ki, int i, double qz, double rz,
 					   traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
 					   unsigned  char **yVector, int mxtips);
@@ -85,175 +81,6 @@
 					   int w, double *EIGN, double *EI, double *EV,
 					   double *tipVector, unsigned  char **yVector, 
 					   int branchReference, int mxtips);
-
-
-#else
-
-static inline void computeVectorCAT_FLEX(double *lVector, int *eVector, double ki, int i, double qz, double rz,
-					 traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
-					 unsigned char **yVector, int mxtips, const int states)
-{       
-  double  
-    *d1 =    (double *)malloc(sizeof(double) * states), 
-    *d2 =    (double *)malloc(sizeof(double) * states),  
-    *x1px2 = (double *)malloc(sizeof(double) * states), 
-    ump_x1, 
-    ump_x2,    
-    lz1, 
-    lz2,
-    *x1, 
-    *x2, 
-    *x3;
-  
-  int 
-    scale,
-    j, 
-    k,
-    pNumber = ti->pNumber,
-    rNumber = ti->rNumber,
-    qNumber = ti->qNumber;
- 
-  x3  = &lVector[states * (pNumber  - mxtips)];  
- 
-  switch(ti->tipCase)
-    {
-    case TIP_TIP:     
-      x1 = &(tipVector[states * yVector[qNumber][i]]);
-      x2 = &(tipVector[states * yVector[rNumber][i]]);    
-      break;
-    case TIP_INNER:     
-      x1 = &(tipVector[states * yVector[qNumber][i]]);
-      x2 = &(lVector[states * (rNumber - mxtips)]);           
-      break;
-    case INNER_INNER:            
-      x1 = &(lVector[states * (qNumber - mxtips)]);
-      x2 = &(lVector[states * (rNumber - mxtips)]);     
-      break;
-    default:
-      assert(0);
-    }
-     
-  lz1 = qz * ki;  
-  lz2 = rz * ki;
-  
-  d1[0] = x1[0];
-  d2[0] = x2[0];
-
-
-  for(j = 1; j < states; j++)
-    {
-      d1[j] = x1[j] * EXP(EIGN[j] * lz1);
-      d2[j] = x2[j] * EXP(EIGN[j] * lz2);	    
-    }
- 
- 
-  for(j = 0; j < states; j++)
-    {         
-      ump_x1 = 0.0;
-      ump_x2 = 0.0;
-
-      for(k = 0; k < states; k++)
-	{
-	  ump_x1 += d1[k] * EI[j * states + k];
-	  ump_x2 += d2[k] * EI[j * states + k];
-	}
-      
-      x1px2[j] = ump_x1 * ump_x2;
-    }
-  
-  for(j = 0; j < states; j++)
-    x3[j] = 0.0;
-
-  for(j = 0; j < states; j++)          
-    for(k = 0; k < states; k++)	
-      x3[k] +=  x1px2[j] *  EV[states * j + k];	   
-      
-  scale = 1;
-  for(j = 0; scale && (j < states); j++)
-    scale = ((x3[j] < minlikelihood) && (x3[j] > minusminlikelihood));
-  
-  if(scale)
-    {
-      for(j = 0; j < states; j++)
-	x3[j] *= twotothe256;       
-      *eVector = *eVector + 1;
-    }	              
-
-  free(d1);
-  free(d2);
-  free(x1px2);
-       
-  return;
-}
-
-
-static double evaluatePartialCAT_FLEX(int i, double ki, int counter,  traversalInfo *ti, double qz,
-				      int w, double *EIGN, double *EI, double *EV,
-				      double *tipVector, unsigned  char **yVector, 
-				      int branchReference, int mxtips, const int states)
-{
-  int 
-    scale = 0, 
-    k;
-  
-  double 
-    *lVector = (double *)malloc_aligned(sizeof(double) * states * mxtips),
-    *d = (double *)malloc_aligned(sizeof(double) * states),
-    lz, 
-    term, 
-    *x1, 
-    *x2; 
-
-  traversalInfo 
-    *trav = &ti[0];
- 
-  assert(isTip(trav->pNumber, mxtips));
-     
-  x1 = &(tipVector[states *  yVector[trav->pNumber][i]]);   
-
-  for(k = 1; k < counter; k++)    
-    {
-      double 
-	qz = ti[k].qz[branchReference],
-	rz = ti[k].rz[branchReference];
-      
-      qz = (qz > zmin) ? log(qz) : log(zmin);
-      rz = (rz > zmin) ? log(rz) : log(zmin);
-
-      computeVectorCAT_FLEX(lVector, &scale, ki, i, qz, rz, &ti[k], 
-			    EIGN, EI, EV, 
-			    tipVector, yVector, mxtips, states);       
-    }
-   
-  x2 = &lVector[states * (trav->qNumber - mxtips)]; 
-
-  assert(0 <=  (trav->qNumber - mxtips) && (trav->qNumber - mxtips) < mxtips);  
-       
-  if(qz < zmin) 
-    lz = zmin;
-  lz  = log(qz); 
-  lz *= ki;  
-  
-  d[0] = 1.0;
-
-  for(k = 1; k < states; k++)
-    d[k] = EXP (EIGN[k] * lz);
-  
-  term = 0.0;
-
-  for(k = 0; k < states; k++) 
-    term += x1[k] * x2[k] * d[k];       
-
-  term = LOG(FABS(term)) + (scale * LOG(minlikelihood));   
-
-  term = term * w;
-
-  free(lVector);  
-  free(d);
-
-  return  term;
-}
-
 #endif
 
 double evaluatePartialGeneric (tree *tr, int i, double ki, int _model)
@@ -274,25 +101,7 @@
   else
     branchReference = 0;
 
-#ifndef _OPTIMIZED_FUNCTIONS
-  if(tr->rateHetModel == CAT)
-    result = evaluatePartialCAT_FLEX(index, ki, tr->td[0].count, tr->td[0].ti, tr->td[0].ti[0].qz[branchReference], 
-				     tr->partitionData[_model].wgt[index],
-				     tr->partitionData[_model].EIGN, 
-				     tr->partitionData[_model].EI, 
-				     tr->partitionData[_model].EV,
-				     tr->partitionData[_model].tipVector,
-				     tr->partitionData[_model].yVector, branchReference, tr->mxtips, states);
-  else
-    /* 
-       the per-site site likelihood function should only be called for the CAT model
-       under the GAMMA model this is required only for estimating per-site protein models 
-       which has however been removed in this version of the code
-    */
-    assert(0); 
-  
- 
-#elif defined(__MIC_NATIVE)
+#if defined(__MIC_NATIVE)
 if (tr->rateHetModel == CAT)
     result = evaluatePartialCAT_FLEX(index, ki, tr->td[0].count, tr->td[0].ti, tr->td[0].ti[0].qz[branchReference],
                      tr->partitionData[_model].wgt[index],
@@ -357,13 +166,9 @@
       assert(0);
     }
   #endif
- 
-
   return result;
 }
 
-#ifdef _OPTIMIZED_FUNCTIONS
-
 
 static inline void computeVectorGTRCAT_BINARY(double *lVector, int *eVector, double ki, int i, double qz, double rz,
 					      traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, 
@@ -1055,4 +860,3 @@
 
 
 
-#endif
--- examl.orig/examl/makenewzGenericSpecial.c
+++ examl/examl/makenewzGenericSpecial.c
@@ -43,11 +43,8 @@
 #include <string.h>
 #include "axml.h"
 
-#ifdef __SIM_SSE3
-#include <xmmintrin.h>
-#include <pmmintrin.h>
-/*#include <tmmintrin.h>*/
-#endif
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/sse3.h"
 
 /* includes MIC-optimized functions */
 
@@ -164,167 +161,10 @@
    So if we want to do a Newton-Rpahson optimization we only execute this function once in the beginning for each new branch we are considering !
 */
 
-#ifndef _OPTIMIZED_FUNCTIONS
-
-static void sumCAT_FLEX(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
-			unsigned char *tipX1, unsigned char *tipX2, int n, const int states)
-{
-  int 
-    i, 
-    l;
-  
-  double 
-    *sum, 
-    *left, 
-    *right;
-
-  switch(tipCase)
-    {
-      
-      /* switch over possible configurations of the nodes p and q defining the branch */
-
-    case TIP_TIP:
-      for (i = 0; i < n; i++)
-	{
-	  left  = &(tipVector[states * tipX1[i]]);
-	  right = &(tipVector[states * tipX2[i]]);
-	  sum = &sumtable[states * i];
-
-	  /* just multiply the values with each other for each site, note the similarity with evaluate() 
-	     we precompute the product which will remain constant and then just multiply this pre-computed 
-	     product with the changing P matrix exponentaions that depend on the branch lengths */
-
-	  for(l = 0; l < states; l++)
-	    sum[l] = left[l] * right[l];
-	}
-      break;
-    case TIP_INNER:
-
-      /* same as for TIP_TIP only that 
-	 we now access on tip vector and one 
-	 inner vector. 
-
-	 You may also observe that we do not consider using scaling vectors anywhere here.
-
-	 This is because we are interested in the first and second derivatives of the likelihood and 
-	 hence the addition of the log() of the scaling factor times the number of scaling events
-	 becomes obsolete through the derivative */
-
-      for (i = 0; i < n; i++)
-	{
-	  left = &(tipVector[states * tipX1[i]]);
-	  right = &x2[states * i];
-	  sum = &sumtable[states * i];
-
-	  for(l = 0; l < states; l++)
-	    sum[l] = left[l] * right[l];
-	}
-      break;
-    case INNER_INNER:
-      for (i = 0; i < n; i++)
-	{
-	  left  = &x1[states * i];
-	  right = &x2[states * i];
-	  sum = &sumtable[states * i];
-
-	  for(l = 0; l < states; l++)
-	    sum[l] = left[l] * right[l];
-	}
-      break;
-    default:
-      assert(0);
-    }
-}
-
-
-/* same thing for GAMMA models. The only noteworthy thing here is that we have an additional inner loop over the 
-   number of discrete gamma rates. The data access pattern is also different since for tip vector accesses through our 
-   lookup table, we do not distnguish between rates 
-
-   Note the different access pattern in TIP_INNER:
-
-   left = &(tipVector[states * tipX1[i]]);	  
-   right = &(x2[span * i + l * states]);
-
-*/
-
-static void sumGAMMA_FLEX(int tipCase, double *sumtable, double *x1, double *x2, double *tipVector,
-			  unsigned char *tipX1, unsigned char *tipX2, int n, const int states)
-{
-  int 
-    i, 
-    l, 
-    k;
-  
-  const int 
-    span = 4 * states;
-
-  double 
-    *left, 
-    *right, 
-    *sum;
-
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      for(i = 0; i < n; i++)
-	{
-	  left  = &(tipVector[states * tipX1[i]]);
-	  right = &(tipVector[states * tipX2[i]]);
-
-	  for(l = 0; l < 4; l++)
-	    {
-	      sum = &sumtable[i * span + l * states];
-
-	      for(k = 0; k < states; k++)
-		sum[k] = left[k] * right[k];
-
-	    }
-	}
-      break;
-    case TIP_INNER:
-      for(i = 0; i < n; i++)
-	{
-	  left = &(tipVector[states * tipX1[i]]);
-
-	  for(l = 0; l < 4; l++)
-	    {
-	      right = &(x2[span * i + l * states]);
-	      sum = &sumtable[i * span + l * states];
-
-	      for(k = 0; k < states; k++)
-		sum[k] = left[k] * right[k];
-
-	    }
-	}
-      break;
-    case INNER_INNER:
-      for(i = 0; i < n; i++)
-	{
-	  for(l = 0; l < 4; l++)
-	    {
-	      left  = &(x1[span * i + l * states]);
-	      right = &(x2[span * i + l * states]);
-	      sum   = &(sumtable[i * span + l * states]);
-
-
-	      for(k = 0; k < states; k++)
-		sum[k] = left[k] * right[k];
-	    }
-	}
-      break;
-    default:
-      assert(0);
-    }
-}
-
-#endif
 
 /* optimized functions for branch length optimization */
 
 
-#ifdef _OPTIMIZED_FUNCTIONS
-
 static void sumGAMMA_BINARY(int tipCase, double *sumtable, double *x1_start, double *x2_start, double *tipVector,
                             unsigned char *tipX1, unsigned char *tipX2, int n);
 static void coreGTRGAMMA_BINARY(const int upper, double *sumtable,
@@ -382,243 +222,6 @@
 static void coreGTRCATPROT(double *EIGN, double lz, int numberOfCategories, double *rptr, int *cptr, int upper,
 			   int *wgt,  volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, double *sumtable);
 
-#endif
-
-
-#ifndef _OPTIMIZED_FUNCTIONS
-
-/* now this is the core function of the newton-Raphson based branch length optimization that actually computes 
-   the first and second derivative of the likelihood given a new proposed branch length lz */
-
-
-static void coreCAT_FLEX(int upper, int numberOfCategories, double *sum,
-			 volatile double *d1, volatile double *d2, int *wgt,
-			 double *rptr, double *EIGN, int *cptr, double lz, const int states)
-{
-  int 
-    i, 
-    l;
-  
-  double 
-    *d, 
-    
-    /* arrays to store stuff we can pre-compute */
-
-    *d_start = (double *)malloc_aligned(numberOfCategories * states * sizeof(double)),
-    *e =(double *)malloc_aligned(states * sizeof(double)),
-    *s = (double *)malloc_aligned(states * sizeof(double)),
-    *dd = (double *)malloc_aligned(states * sizeof(double)),
-    inv_Li, 
-    dlnLidlz, 
-    d2lnLidlz2,
-    dlnLdlz = 0.0,
-    d2lnLdlz2 = 0.0;
-
-  d = d_start;
-  
-  e[0] = 0.0;
-  s[0] = 0.0; 
-  dd[0] = 0.0;
-
-
-  /* we are pre-computing values for computing the first and second derivative of P(lz)
-     since this requires an exponetial that the only thing we really have to derive here */
-
-  for(l = 1; l < states; l++)
-    { 
-      s[l]  = EIGN[l];
-      e[l]  = EIGN[l] * EIGN[l];     
-      dd[l] = s[l] * lz;
-    }
-
-  /* compute the P matrices and their derivatives for 
-     all per-site rate categories */
-
-  for(i = 0; i < numberOfCategories; i++)
-    {      
-      d[states * i] = 1.0;
-      for(l = 1; l < states; l++)
-	d[states * i + l] = EXP(dd[l] * rptr[i]);
-    }
-
-
-  /* now loop over the sites in this partition to obtain the per-site 1st and 2nd derivatives */
-
-  for (i = 0; i < upper; i++)
-    {    
-      double 
-	r = rptr[cptr[i]],
-	wr1 = r * wgt[i],
-	wr2 = r * r * wgt[i];
-
-      /* get the correct p matrix for the rate at the current site i */
-      
-      d = &d_start[states * cptr[i]];      
-          
-      /* this is the likelihood at site i, NOT the log likelihood, we don't need the log 
-	 likelihood to compute derivatives ! */
-
-      inv_Li     = sum[states * i]; 
-      
-      /* those are for storing the first and second derivative of the Likelihood at site i */
-
-      dlnLidlz   = 0.0;
-      d2lnLidlz2 = 0.0;
-
-      /* now multiply the likelihood and the first and second derivative with the 
-	 appropriate derivatives of P(lz) */
-
-      for(l = 1; l < states; l++)
-	{
-	  double
-	    tmpv = d[l] * sum[states * i + l];
-	  
-	  inv_Li     += tmpv;	 	  
-	  dlnLidlz   += tmpv * s[l];       
-	  d2lnLidlz2 += tmpv * e[l];
-	}     
-      
-      /* below we are implementing the other mathematical operations that are required 
-	 to obtain the deirivatives */
-
-      inv_Li = 1.0/ FABS(inv_Li);
-
-      dlnLidlz   *= inv_Li;
-      d2lnLidlz2 *= inv_Li;
-
-      /* under the CAT model, wrptr[] and wr2ptr[] are pre-computed extension sof the weight pointer:
-	 wrptr[i]  = wgt[i] * rptr[cptr[i]].
-	 and 
-	 wr2ptr[i]  = wgt[i] * rptr[cptr[i]] * rptr[cptr[i]] 
-
-	 this is also something that is required for the derivatives because when computing the 
-	 derivative of the exponential() the rate must be multiplied with the 
-	 exponential 
-
-	 wgt is just the pattern site wieght 
-      */
-
-      /* compute the accumulated first and second derivatives of this site */
-
-      dlnLdlz  += wr1 * dlnLidlz;
-      d2lnLdlz2 += wr2 * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
-    }
-
-  /* 
-     set the result values, i.e., the sum of the per-site first and second derivatives of the likelihood function 
-     for this partition. 
-   */
-
-  *d1  = dlnLdlz;
-  *d2 = d2lnLdlz2;
-
-  /* free the temporary arrays */
-
-  free(d_start);
-  free(e);
-  free(s);
-  free(dd);
-}
-
-static void coreGAMMA_FLEX(int upper, double *sumtable, volatile double *ext_dlnLdlz,  volatile double *ext_d2lnLdlz2, 
-			   double *EIGN, double *gammaRates, double lz, int *wgt, const int states)
-{
-   double  
-    *sum, 
-     diagptable[1024], /* TODO make this dynamic */
-    dlnLdlz = 0.0,
-    d2lnLdlz2 = 0.0,
-    ki, 
-    kisqr,
-    tmp,
-    inv_Li, 
-    dlnLidlz, 
-    d2lnLidlz2;
-
-  int     
-    i, 
-    j, 
-    l;  
-
-  const int 
-    gammaStates = 4 * states;
-
-  /* pre-compute the derivatives of the P matrix for all discrete GAMMA rates */
-
-  for(i = 0; i < 4; i++)
-    {
-      ki = gammaRates[i];
-      kisqr = ki * ki;
-
-      for(l = 1; l < states; l++)
-	{
-	  diagptable[i * gammaStates + l * 4]     = EXP(EIGN[l] * ki * lz);
-	  diagptable[i * gammaStates + l * 4 + 1] = EIGN[l] * ki;
-	  diagptable[i * gammaStates + l * 4 + 2] = EIGN[l] * EIGN[l] * kisqr;
-	}
-    }
-
-  /* loop over sites in this partition */
-
-  for (i = 0; i < upper; i++)
-    {
-      double 
-	r = rptr[cptr[i]],
-	wr1 = r * wgt[i],
-	wr2 = r * r * wgt[i];
-
-      /* access the array with pre-computed values */
-      sum = &sumtable[i * gammaStates];
-
-      /* initial per-site likelihood and 1st and 2nd derivatives */
-
-      inv_Li   = 0.0;
-      dlnLidlz = 0.0;
-      d2lnLidlz2 = 0.0;
-
-      /* loop over discrete GAMMA rates */
-
-      for(j = 0; j < 4; j++)
-	{
-	  inv_Li += sum[j * states];
-
-	  for(l = 1; l < states; l++)
-	    {
-	      inv_Li     += (tmp = diagptable[j * gammaStates + l * 4] * sum[j * states + l]);
-	      dlnLidlz   +=  tmp * diagptable[j * gammaStates + l * 4 + 1];
-	      d2lnLidlz2 +=  tmp * diagptable[j * gammaStates + l * 4 + 2];
-	    }
-	}
-
-      /* finalize derivative computation */
-      /* note that wrptr[] here unlike in CAT above is the 
-	 integer weight vector of the current site 
-
-	 The operations:
-
-	 EIGN[l] * ki;
-	 EIGN[l] * EIGN[l] * kisqr;
-
-	 that are hidden in CAT in wrptr (at least the * ki and * ki *ki part of them 
-	 are done explicitely here 
-
-      */
-
-      inv_Li = 1.0 / FABS(inv_Li);
-
-      dlnLidlz   *= inv_Li;
-      d2lnLidlz2 *= inv_Li;
-
-      dlnLdlz   += wr1 * dlnLidlz;
-      d2lnLdlz2 += wr2 * (d2lnLidlz2 - dlnLidlz * dlnLidlz);
-    }
-
-  *ext_dlnLdlz   = dlnLdlz;
-  *ext_d2lnLdlz2 = d2lnLdlz2;
-  
-}
-
-#endif
 
 /* the function below is called only once at the very beginning of each Newton-Raphson procedure for optimizing barnch lengths.
    It initially invokes an iterative newview call to get a consistent pair of vectors at the left and the right end of the 
@@ -721,15 +324,6 @@
 	  double
 	    *sumBuffer = tr->partitionData[model].sumBuffer + x_offset;
 	 
-#ifndef _OPTIMIZED_FUNCTIONS
-	  assert(!tr->saveMemory);
-	  if(tr->rateHetModel == CAT)
-	    sumCAT_FLEX(tipCase, sumBuffer, x1_start, x2_start, tr->partitionData[model].tipVector, tipX1, tipX2,
-			width, states);
-	  else
-	    sumGAMMA_FLEX(tipCase, sumBuffer, x1_start, x2_start, tr->partitionData[model].tipVector, tipX1, tipX2,
-			  width, states);
-#else
 	  switch(states)
 	    {
 	    case 2:
@@ -834,7 +428,6 @@
 	    default:
 	      assert(0);
 	    }
-#endif
 	}
     }  // for model
   }  // omp parallel region
@@ -977,19 +570,6 @@
 	    dlnLdlz   = 0.0,
 	    d2lnLdlz2 = 0.0;
 
-  #ifndef _OPTIMIZED_FUNCTIONS
-
-	    /* compute first and second derivatives with the slow generic functions */
-
-	    if(tr->rateHetModel == CAT)
-	      coreCAT_FLEX(width, tr->partitionData[model].numberOfCategories, sumBuffer,
-			   &dlnLdlz, &d2lnLdlz2, wgt,
-			   tr->partitionData[model].perSiteRates, tr->partitionData[model].EIGN, rateCategory, lz, states);
-	    else
-	      coreGAMMA_FLEX(width, sumBuffer,
-			     &dlnLdlz, &d2lnLdlz2, tr->partitionData[model].EIGN, tr->partitionData[model].gammaRates, lz,
-			     wgt, states);
-  #else
 	    switch(states)
 	      {
 	      case 2:
@@ -1068,7 +648,6 @@
 	      default:
 		assert(0);
 	      }
-  #endif
 
 	    /* store first and second derivative */
 
@@ -1402,8 +981,6 @@
 
 /* below are, once again the optimized functions */
 
-#ifdef _OPTIMIZED_FUNCTIONS
-
 /**** binary ***/
 static void coreGTRCAT_BINARY(int upper, int numberOfCategories, double *sum,
                               volatile double *d1, volatile double *d2, 
@@ -2013,17 +1590,12 @@
 	      right = &(tipVector[l][20 * tipX2[i]]);
 
 	      sum = &sumtable[i * 80 + l * 20];
-#ifdef __SIM_SSE3
 	      for(k = 0; k < 20; k+=2)
 		{
 		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
 		  
 		  _mm_store_pd(&sum[k], sumv);		 
 		}
-#else
-	      for(k = 0; k < 20; k++)
-		sum[k] = left[k] * right[k];
-#endif
 	    }
 	}
       break;
@@ -2037,17 +1609,12 @@
 	      left = &(tipVector[l][20 * tipX1[i]]);
 	      right = &(x2[80 * i + l * 20]);
 	      sum = &sumtable[i * 80 + l * 20];
-#ifdef __SIM_SSE3
 	      for(k = 0; k < 20; k+=2)
 		{
 		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
 		  
 		  _mm_store_pd(&sum[k], sumv);		 
 		}
-#else
-	      for(k = 0; k < 20; k++)
-		sum[k] = left[k] * right[k];
-#endif
 	    }
 	}
       break;
@@ -2060,17 +1627,12 @@
 	      right = &(x2[80 * i + l * 20]);
 	      sum   = &(sumtable[i * 80 + l * 20]);
 
-#ifdef __SIM_SSE3
 	      for(k = 0; k < 20; k+=2)
 		{
 		  __m128d sumv = _mm_mul_pd(_mm_load_pd(&left[k]), _mm_load_pd(&right[k]));
 		  
 		  _mm_store_pd(&sum[k], sumv);		 
 		}
-#else
-	      for(k = 0; k < 20; k++)
-		sum[k] = left[k] * right[k];
-#endif
 	    }
 	}
       break;
@@ -2737,11 +2299,3 @@
 
   free(d_start);
 }
-
-
-
-
-#endif
-
-
-
--- examl.orig/examl/mic_native_aa.c
+++ examl/examl/mic_native_aa.c
@@ -1,4 +1,5 @@
-#include <immintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/avx2.h"
 #include <string.h>
 #include <math.h>
 
--- examl.orig/examl/newviewGenericSpecial.c
+++ examl/examl/newviewGenericSpecial.c
@@ -41,11 +41,9 @@
 #include <limits.h>
 #include "axml.h"
 
-#ifdef __SIM_SSE3
-
 #include <stdint.h>
-#include <xmmintrin.h>
-#include <pmmintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/avx2.h"
 
 /* required to compute the absoliute values of double precision numbers with SSE3 */
 
@@ -57,7 +55,6 @@
 
 
 
-#endif
 
 /* includes MIC-optimized functions */
 
@@ -212,478 +209,6 @@
    conditional likelihood arrays at p, given child nodes q and r. Once again we need 
    two generic function implementations, one for CAT and one for GAMMA */
 
-#ifndef _OPTIMIZED_FUNCTIONS
-
-static void newviewCAT_FLEX(int tipCase, double *extEV,
-			    int *cptr,
-			    double *x1, double *x2, double *x3, double *tipVector,
-			    unsigned char *tipX1, unsigned char *tipX2,
-			    int n, double *left, double *right, int *wgt, int *scalerIncrement, const int states)
-{
-  double
-    *le, 
-    *ri, 
-    *v, 
-    *vl, 
-    *vr,
-    ump_x1, 
-    ump_x2, 
-    x1px2;
-
-  int 
-    i, 
-    l, 
-    j, 
-    scale, 
-    addScale = 0;
-
-  const int 
-    statesSquare = states * states;
-
-
-  /* here we switch over the different cases for efficiency, but also because 
-     each case accesses different data types.
-
-     We consider three cases: either q and r are both tips, q or r are tips, and q and r are inner 
-     nodes.
-  */
-     
-
-  switch(tipCase)
-    {
-      
-      /* both child nodes of p weher we want to update the conditional likelihood are tips */
-    case TIP_TIP:     
-      /* loop over sites */
-      for (i = 0; i < n; i++)
-	{
-	  /* set a pointer to the P-Matrices for the rate category of this site */
-	  le = &left[cptr[i] * statesSquare];
-	  ri = &right[cptr[i] * statesSquare];
-	  
-	  /* pointers to the likelihood entries of the tips q (vl) and r (vr) 
-	     We will do reading accesses to these values only.
-	   */
-	  vl = &(tipVector[states * tipX1[i]]);
-	  vr = &(tipVector[states * tipX2[i]]);
-	  
-	  /* address of the conditional likelihood array entres at site i. This is 
-	     a writing access to v */
-	  v  = &x3[states * i];
-	  
-	  /* initialize v */
-	  for(l = 0; l < states; l++)
-	    v[l] = 0.0;
-	  	  
-	  /* loop over states to compute the cond likelihoods at p (v) */
-
-	  for(l = 0; l < states; l++)
-	    {	      
-	      ump_x1 = 0.0;
-	      ump_x2 = 0.0;
-	      
-	      /* le and ri are the P-matrices */
-
-	      for(j = 0; j < states; j++)
-		{
-		  ump_x1 += vl[j] * le[l * states + j];
-		  ump_x2 += vr[j] * ri[l * states + j];
-		}
-	      
-	      x1px2 = ump_x1 * ump_x2;
-	      
-	      /* multiply with matrix of eigenvectors extEV */
-
-	      for(j = 0; j < states; j++)
-		v[j] += x1px2 * extEV[l * states + j];
-	    }	   
-	}    
-      break;
-    case TIP_INNER:      
-
-      /* same as above, only that now vl is a tip and vr is the conditional probability vector 
-	 at an inner node. Note that, if we have the case that either q or r is a tip, the 
-	 nodes will be flipped to ensure that tipX1 always points to the sequence at the tip.
-      */
-
-      for (i = 0; i < n; i++)
-	{
-	  le = &left[cptr[i] * statesSquare];
-	  ri = &right[cptr[i] * statesSquare];
-	  
-	  /* access tip vector lookup table */
-	  vl = &(tipVector[states * tipX1[i]]);
-
-	  /* access conditional likelihoo arrays */
-	  /* again, vl and vr are reading accesses, while v is a writing access */
-	  vr = &x2[states * i];
-	  v  = &x3[states * i];
-	  
-	  /* same as in the loop above */
-
-	  for(l = 0; l < states; l++)
-	    v[l] = 0.0;
-	  
-	  for(l = 0; l < states; l++)
-	    {
-	      ump_x1 = 0.0;
-	      ump_x2 = 0.0;
-	      
-	      for(j = 0; j < states; j++)
-		{
-		  ump_x1 += vl[j] * le[l * states + j];
-		  ump_x2 += vr[j] * ri[l * states + j];
-		}
-	      
-	      x1px2 = ump_x1 * ump_x2;
-	      
-	      for(j = 0; j < states; j++)
-		v[j] += x1px2 * extEV[l * states + j];
-	    }
-	  
-	  /* now let's check for numerical scaling. 
-	     The maths in RAxML are a bit non-standard to avoid/economize on arithmetic operations 
-	     at the virtual root and for branch length optimization and hence values stored 
-	     in the conditional likelihood vectors can become negative.
-	     Below we check if all absolute values stored at position i of v are smaller 
-	     than a pre-defined value in axml.h. If they are all smaller we can then safely 
-	     multiply them by a large, constant number twotothe256 (without numerical overflow) 
-	     that is also speced in axml.h */
-
-	  scale = 1;
-	  for(l = 0; scale && (l < states); l++)
-	    scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood));	   
-	  
-	  if(scale)
-	    {
-	      for(l = 0; l < states; l++)
-		v[l] *= twotothe256;
-	      
-	      /* if we have scaled the entries to prevent underflow, we need to keep track of how many scaling 
-		 multiplications we did per node such as to undo them at the virtual root, e.g., in 
-		 evaluateGeneric() 
-		 Note here, that, if we scaled the site we need to increment the scaling counter by the wieght, i.e., 
-		 the number of sites this potentially compressed pattern represents ! */ 
-
-	      addScale += wgt[i];	  
-	    }
-	}   
-      break;
-    case INNER_INNER:
-      
-      /* same as above, only that the two child nodes q and r are now inner nodes */
-
-      for(i = 0; i < n; i++)
-	{
-	  le = &left[cptr[i] * statesSquare];
-	  ri = &right[cptr[i] * statesSquare];
-
-	  /* index conditional likelihood vectors of inner nodes */
-
-	  vl = &x1[states * i];
-	  vr = &x2[states * i];
-	  v = &x3[states * i];
-
-	  for(l = 0; l < states; l++)
-	    v[l] = 0.0;
-	 
-	  for(l = 0; l < states; l++)
-	    {
-	      ump_x1 = 0.0;
-	      ump_x2 = 0.0;
-
-	      for(j = 0; j < states; j++)
-		{
-		  ump_x1 += vl[j] * le[l * states + j];
-		  ump_x2 += vr[j] * ri[l * states + j];
-		}
-
-	      x1px2 =  ump_x1 * ump_x2;
-
-	      for(j = 0; j < states; j++)
-		v[j] += x1px2 * extEV[l * states + j];	      
-	    }
-
-	   scale = 1;
-	   for(l = 0; scale && (l < states); l++)
-	     scale = ((v[l] < minlikelihood) && (v[l] > minusminlikelihood));
-  
-	   if(scale)
-	     {
-	       for(l = 0; l < states; l++)
-		 v[l] *= twotothe256;
-
-	       addScale += wgt[i];	   
-	     }
-	}
-      break;
-    default:
-      assert(0);
-    }
-   
-  /* increment the scaling counter by the additional scalings done at node p */
-
-  *scalerIncrement = addScale;
-}
-
-
-static void newviewGAMMA_FLEX(int tipCase,
-			      double *x1, double *x2, double *x3, double *extEV, double *tipVector,
-			      unsigned char *tipX1, unsigned char *tipX2,
-			      int n, double *left, double *right, int *wgt, int *scalerIncrement, const int states, const int maxStateValue)
-{
-  double  
-    *uX1, 
-    *uX2, 
-    *v, 
-    x1px2, 
-    *vl, 
-    *vr, 
-    al, 
-    ar;
-  
-  int  
-    i, 
-    j, 
-    l, 
-    k, 
-    scale, 
-    addScale = 0;
-
-  const int     
-    statesSquare = states * states,
-    span = states * 4,
-    /* this is required for doing some pre-computations that help to save 
-       numerical operations. What we are actually computing here are additional lookup tables 
-       for each possible state a certain data-type can assume.
-       for DNA with ambuguity coding this is 15, for proteins this is 22 or 23, since there 
-       also exist one or two amibguity codes for protein data.
-       Essentially this is very similar to the tip vectors which we also use as lookup tables */
-    precomputeLength = maxStateValue * span;
-
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      {
-	/* allocate pre-compute memory space */
-
-	double 
-	  *umpX1 = (double*)malloc(sizeof(double) * precomputeLength),
-	  *umpX2 = (double*)malloc(sizeof(double) * precomputeLength);
-
-	/* multiply all possible tip state vectors with the respective P-matrices 
-	 */
-
-	for(i = 0; i < maxStateValue; i++)
-	  {
-	    v = &(tipVector[states * i]);
-
-	    for(k = 0; k < span; k++)
-	      {
-
-		umpX1[span * i + k] = 0.0;
-		umpX2[span * i + k] = 0.0;
-
-		for(l = 0; l < states; l++)
-		  {
-		    umpX1[span * i + k] +=  v[l] *  left[k * states + l];
-		    umpX2[span * i + k] +=  v[l] * right[k * states + l];
-		  }
-
-	      }
-	  }
-
-	for(i = 0; i < n; i++)
-	  {
-	    /* access the precomputed arrays (pre-computed multiplication of conditional with the tip state) 
-	     */
-
-	    uX1 = &umpX1[span * tipX1[i]];
-	    uX2 = &umpX2[span * tipX2[i]];
-
-	    /* loop over discrete GAMMA rates */
-
-	    for(j = 0; j < 4; j++)
-	      {
-		/* the rest is the same as for CAT */
-		v = &x3[i * span + j * states];
-
-		for(k = 0; k < states; k++)
-		  v[k] = 0.0;
-
-		for(k = 0; k < states; k++)
-		  {		   
-		    x1px2 = uX1[j * states + k] * uX2[j * states + k];
-		   
-		    for(l = 0; l < states; l++)		      					
-		      v[l] += x1px2 * extEV[states * k + l];		     
-		  }
-
-	      }	   
-	  }
-	
-	/* free precomputed vectors */
-
-	free(umpX1);
-	free(umpX2);
-      }
-      break;
-    case TIP_INNER:
-      {
-	/* we do analogous pre-computations as above, with the only difference that we now do them 
-	   only for one tip vector */
-
-	double 
-	  *umpX1 = (double*)malloc(sizeof(double) * precomputeLength),
-	  *ump_x2 = (double*)malloc(sizeof(double) * states);
-
-	/* precompute P and left tip vector product */
-
-	for(i = 0; i < maxStateValue; i++)
-	  {
-	    v = &(tipVector[states * i]);
-
-	    for(k = 0; k < span; k++)
-	      {
-  
-		umpX1[span * i + k] = 0.0;
-
-		for(l = 0; l < states; l++)
-		  umpX1[span * i + k] +=  v[l] * left[k * states + l];
-
-
-	      }
-	  }
-
-	for (i = 0; i < n; i++)
-	  {
-	    /* access pre-computed value based on the raw sequence data tipX1 that is used as an index */
-
-	    uX1 = &umpX1[span * tipX1[i]];
-
-	    /* loop over discrete GAMMA rates */
-
-	    for(k = 0; k < 4; k++)
-	      {
-		v = &(x2[span * i + k * states]);
-
-		for(l = 0; l < states; l++)
-		  {
-		    ump_x2[l] = 0.0;
-
-		    for(j = 0; j < states; j++)
-		      ump_x2[l] += v[j] * right[k * statesSquare + l * states + j];
-		  }
-
-		v = &(x3[span * i + states * k]);
-
-		for(l = 0; l < states; l++)
-		  v[l] = 0;
-
-		for(l = 0; l < states; l++)
-		  {
-		    x1px2 = uX1[k * states + l]  * ump_x2[l];
-		    for(j = 0; j < states; j++)
-		      v[j] += x1px2 * extEV[l * states  + j];
-		  }
-	      }
-	   
-	    /* also do numerical scaling as above. Note that here we need to scale 
-	       4 * 4 values for DNA or 4 * 20 values for protein data.
-	       If they are ALL smaller than our threshold, we scale. Note that,
-	       this can cause numerical problems with GAMMA, if the values generated 
-	       by the four discrete GAMMA rates are too different.
-
-	       For details, see: 
-	       
-	       F. Izquierdo-Carrasco, S.A. Smith, A. Stamatakis: "Algorithms, Data Structures, and Numerics for Likelihood-based Phylogenetic Inference of Huge Trees"
-
-	    */
-	    
-
-	    v = &x3[span * i];
-	    scale = 1;
-	    for(l = 0; scale && (l < span); l++)
-	      scale = (ABS(v[l]) <  minlikelihood);
-
-
-	    if (scale)
-	      {
-		for(l = 0; l < span; l++)
-		  v[l] *= twotothe256;
-	
-		addScale += wgt[i];		    
-	      }
-	  }
-
-	free(umpX1);
-	free(ump_x2);
-      }
-      break;
-    case INNER_INNER:
-
-      /* same as above, without pre-computations */
-
-      for (i = 0; i < n; i++)
-       {
-	 for(k = 0; k < 4; k++)
-	   {
-	     vl = &(x1[span * i + states * k]);
-	     vr = &(x2[span * i + states * k]);
-	     v =  &(x3[span * i + states * k]);
-
-
-	     for(l = 0; l < states; l++)
-	       v[l] = 0;
-
-
-	     for(l = 0; l < states; l++)
-	       {		 
-
-		 al = 0.0;
-		 ar = 0.0;
-
-		 for(j = 0; j < states; j++)
-		   {
-		     al += vl[j] * left[k * statesSquare + l * states + j];
-		     ar += vr[j] * right[k * statesSquare + l * states + j];
-		   }
-
-		 x1px2 = al * ar;
-
-		 for(j = 0; j < states; j++)
-		   v[j] += x1px2 * extEV[states * l + j];
-
-	       }
-	   }
-	 
-	 v = &(x3[span * i]);
-	 scale = 1;
-	 for(l = 0; scale && (l < span); l++)
-	   scale = ((ABS(v[l]) <  minlikelihood));
-
-	 if(scale)
-	   {  
-	     for(l = 0; l < span; l++)
-	       v[l] *= twotothe256;
-	     
-	     addScale += wgt[i];	    	  
-	   }
-       }
-      break;
-    default:
-      assert(0);
-    }
-
-  /* as above, increment the global counter that counts scaling multiplications by the scaling multiplications 
-     carried out for computing the likelihood array at node p */
-
-  *scalerIncrement = addScale;
-}
-
-#endif
-
-
     
 /* The function below computes partial traversals only down to the point/node in the tree where the 
    conditional likelihhod vector summarizing a subtree is already oriented in the correct direction */
@@ -817,71 +342,6 @@
    file.
 */
 
-#if (defined(_OPTIMIZED_FUNCTIONS) && !defined(__AVX))
-
-static void newviewGTRGAMMAPROT_LG4(int tipCase,
-				    double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
-				    int *ex3, unsigned char *tipX1, unsigned char *tipX2,
-				    int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling);
-
-static void newviewGTRGAMMA_GAPPED_SAVE(int tipCase,
-					double *x1_start, double *x2_start, double *x3_start,
-					double *EV, double *tipVector,
-					unsigned char *tipX1, unsigned char *tipX2,
-					const int n, double *left, double *right, int *wgt, int *scalerIncrement, 
-					unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, 
-					double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn);
-
-static void newviewGTRGAMMA(int tipCase,
-			    double *x1_start, double *x2_start, double *x3_start,
-			    double *EV, double *tipVector,
-			    unsigned char *tipX1, unsigned char *tipX2,
-			    const int n, double *left, double *right, int *wgt, int *scalerIncrement
-			    );
-
-static void newviewGTRCAT( int tipCase,  double *EV,  int *cptr,
-			   double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
-			   unsigned char *tipX1, unsigned char *tipX2,
-			   int n,  double *left, double *right, int *wgt, int *scalerIncrement);
-
-
-static void newviewGTRCAT_SAVE( int tipCase,  double *EV,  int *cptr,
-				double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
-				unsigned char *tipX1, unsigned char *tipX2,
-				int n,  double *left, double *right, int *wgt, int *scalerIncrement,
-				unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
-				double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats);
-
-static void newviewGTRGAMMAPROT_GAPPED_SAVE(int tipCase,
-					    double *x1, double *x2, double *x3, double *extEV, double *tipVector,
-					    unsigned char *tipX1, unsigned char *tipX2,
-					    int n, double *left, double *right, int *wgt, int *scalerIncrement, 
-					    unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,  
-					    double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn
-					    );
-
-static void newviewGTRGAMMAPROT(int tipCase,
-				double *x1, double *x2, double *x3, double *extEV, double *tipVector,
-				unsigned char *tipX1, unsigned char *tipX2,
-				int n, double *left, double *right, int *wgt, int *scalerIncrement);
-static void newviewGTRCATPROT(int tipCase, double *extEV,
-			      int *cptr,
-			      double *x1, double *x2, double *x3, double *tipVector,
-			      unsigned char *tipX1, unsigned char *tipX2,
-			      int n, double *left, double *right, int *wgt, int *scalerIncrement );
-
-static void newviewGTRCATPROT_SAVE(int tipCase, double *extEV,
-				   int *cptr,
-				   double *x1, double *x2, double *x3, double *tipVector,
-				   unsigned char *tipX1, unsigned char *tipX2,
-				   int n, double *left, double *right, int *wgt, int *scalerIncrement,
-				   unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
-				   double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats);
-
-#endif
-
-#ifdef _OPTIMIZED_FUNCTIONS
-
 static void newviewGTRCAT_BINARY( int tipCase,  double *EV,  int *cptr,
                                   double *x1_start,  double *x2_start,  double *x3_start,  double *tipVector,
                                   int *ex3, unsigned char *tipX1, unsigned char *tipX2,
@@ -894,8 +354,6 @@
 				   const int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling
 				   );
 
-#endif
-
 boolean isGap(unsigned int *x, int pos)
 {
   return (x[pos / 32] & mask32[pos % 32]);
@@ -1260,26 +718,6 @@
 		    assert(0);
 		  }
 		
-#ifndef _OPTIMIZED_FUNCTIONS
-
-	      /* memory saving not implemented */
-
-	      assert(!tr->saveMemory);
-
-	      /* figure out if we need to compute the CAT or GAMMA model of rate heterogeneity */
-
-	      if(tr->rateHetModel == CAT)
-		newviewCAT_FLEX(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
-				x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
-				tipX1, tipX2,
-				width, left, right, wgt, &scalerIncrement, states);
-	      else
-		newviewGAMMA_FLEX(tInfo->tipCase,
-				  x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
-				  tipX1, tipX2,
-				  width, left, right, wgt, &scalerIncrement, states, getUndetermined(tr->partitionData[model].dataType) + 1);
-
-#else
 	      /* dedicated highly optimized functions. Analogously to the functions in evaluateGeneric() 
 		 we also siwtch over the state number */
 
@@ -1309,32 +747,20 @@
 		      if(tr->saveMemory)
 #ifdef __MIC_NATIVE
 		     assert(0 && "Neither CAT model of rate heterogeneity nor memory saving are implemented on Intel MIC");
-#elif __AVX
+#else
 			newviewGTRCAT_AVX_GAPPED_SAVE(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
 						      x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
 						      (int*)NULL, tipX1, tipX2,
 						      width, left, right, wgt, &scalerIncrement, TRUE, x1_gap, x2_gap, x3_gap,
 						      x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
-#else
-			newviewGTRCAT_SAVE(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
-					   x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
-					   tipX1, tipX2,
-					   width, left, right, wgt, &scalerIncrement, x1_gap, x2_gap, x3_gap,
-					   x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
-#endif
 		      else
 #ifdef __MIC_NATIVE
 		     assert(0 && "CAT model of rate heterogeneity is not implemented on Intel MIC");
-#elif __AVX
+#else
 			newviewGTRCAT_AVX(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
 					  x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
 					  tipX1, tipX2,
 					  width, left, right, wgt, &scalerIncrement);
-#else
-			newviewGTRCAT(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
-				      x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
-				      tipX1, tipX2,
-				      width, left, right, wgt, &scalerIncrement);
 #endif
 		    }
 		  else
@@ -1344,20 +770,13 @@
 		       if(tr->saveMemory)
 #ifdef __MIC_NATIVE
 		     assert(0 && "Memory saving is not implemented on Intel MIC");
-#elif __AVX
+#else
 			 newviewGTRGAMMA_AVX_GAPPED_SAVE(tInfo->tipCase,
 							 x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector, (int*)NULL,
 							 tipX1, tipX2,
 							 width, left, right, wgt, &scalerIncrement, TRUE,
 							 x1_gap, x2_gap, x3_gap, 
 							 x1_gapColumn, x2_gapColumn, x3_gapColumn);
-#else
-		       newviewGTRGAMMA_GAPPED_SAVE(tInfo->tipCase,
-						   x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
-						   tipX1, tipX2,
-						   width, left, right, wgt, &scalerIncrement, 
-						   x1_gap, x2_gap, x3_gap, 
-						   x1_gapColumn, x2_gapColumn, x3_gapColumn);
 #endif
 		       else
 #ifdef __MIC_NATIVE
@@ -1366,16 +785,11 @@
 				  tipX1, tipX2,
 				  width, left, right, wgt, &scalerIncrement,
 				  tr->partitionData[model].mic_umpLeft, tr->partitionData[model].mic_umpRight);
-#elif __AVX
+#else
 			 newviewGTRGAMMA_AVX(tInfo->tipCase,
 					     x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
 					     tipX1, tipX2,
 					     width, left, right, wgt, &scalerIncrement);
-#else
-		       newviewGTRGAMMA(tInfo->tipCase,
-					 x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
-					 tipX1, tipX2,
-					 width, left, right, wgt, &scalerIncrement);
 #endif
 		    }
 		
@@ -1388,30 +802,21 @@
 			{
 #ifdef __MIC_NATIVE
 		     assert(0 && "Neither CAT model of rate heterogeneity nor memory saving are implemented on Intel MIC");
-#elif __AVX
+#else
 			  newviewGTRCATPROT_AVX_GAPPED_SAVE(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
 							    x1_start, x2_start, x3_start, tr->partitionData[model].tipVector, (int*)NULL,
 							    tipX1, tipX2, width, left, right, wgt, &scalerIncrement, TRUE, x1_gap, x2_gap, x3_gap,
 							    x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
-#else
-			  newviewGTRCATPROT_SAVE(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
-						 x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
-						 tipX1, tipX2, width, left, right, wgt, &scalerIncrement, x1_gap, x2_gap, x3_gap,
-						 x1_gapColumn, x2_gapColumn, x3_gapColumn, tr->maxCategories);
 #endif
 			}
 		      else
 			{			 			
 #ifdef __MIC_NATIVE
 		     assert(0 && "CAT model of rate heterogeneity is not implemented on Intel MIC");
-#elif __AVX
+#else
 			  newviewGTRCATPROT_AVX(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
 						x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
 						tipX1, tipX2, width, left, right, wgt, &scalerIncrement);
-#else
-			  newviewGTRCATPROT(tInfo->tipCase,  tr->partitionData[model].EV, rateCategory,
-					    x1_start, x2_start, x3_start, tr->partitionData[model].tipVector,
-					    tipX1, tipX2, width, left, right, wgt, &scalerIncrement);			
 #endif
 			}
 		    }
@@ -1421,7 +826,7 @@
 			{
 #ifdef __MIC_NATIVE
 		     assert(0 && "Memory saving is not implemented on Intel MIC");
-#elif __AVX
+#else
 			  newviewGTRGAMMAPROT_AVX_GAPPED_SAVE(tInfo->tipCase,
 							      x1_start, x2_start, x3_start,
 							      tr->partitionData[model].EV,
@@ -1430,15 +835,6 @@
 							      width, left, right, wgt, &scalerIncrement, TRUE,
 							      x1_gap, x2_gap, x3_gap,
 							      x1_gapColumn, x2_gapColumn, x3_gapColumn);
-#else
-			  newviewGTRGAMMAPROT_GAPPED_SAVE(tInfo->tipCase,
-							  x1_start, x2_start, x3_start,
-							  tr->partitionData[model].EV,
-							  tr->partitionData[model].tipVector,
-							  tipX1, tipX2,
-							  width, left, right, wgt, &scalerIncrement,
-							  x1_gap, x2_gap, x3_gap,
-							  x1_gapColumn, x2_gapColumn, x3_gapColumn);
 #endif
 			}
 		      else
@@ -1451,21 +847,13 @@
 							tipX1, tipX2,
 							width, left, right, wgt, &scalerIncrement,
 							tr->partitionData[model].mic_umpLeft, tr->partitionData[model].mic_umpRight);
-#elif __AVX
+#else
 			      newviewGTRGAMMAPROT_AVX_LG4(tInfo->tipCase,
 							  x1_start, x2_start, x3_start,
 							  tr->partitionData[model].EV_LG4,
 							  tr->partitionData[model].tipVector_LG4,
 							  (int*)NULL, tipX1, tipX2,
 							  width, left, right, wgt, &scalerIncrement, TRUE);
-#else
-			      newviewGTRGAMMAPROT_LG4(tInfo->tipCase,
-						      x1_start, x2_start, x3_start,
-						      tr->partitionData[model].EV_LG4,
-						      tr->partitionData[model].tipVector_LG4,
-						      (int*)NULL, tipX1, tipX2,
-						      width, left, right, 
-						      wgt, &scalerIncrement, TRUE);
 #endif			    
 			    }
 			  else
@@ -1476,16 +864,11 @@
 							tipX1, tipX2,
 							width, left, right, wgt, &scalerIncrement,
 							tr->partitionData[model].mic_umpLeft, tr->partitionData[model].mic_umpRight);
-#elif __AVX
+#else
 			      newviewGTRGAMMAPROT_AVX(tInfo->tipCase,
 						      x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
 						      tipX1, tipX2,
 						      width, left, right, wgt, &scalerIncrement);
-#else
-			      newviewGTRGAMMAPROT(tInfo->tipCase,
-						  x1_start, x2_start, x3_start, tr->partitionData[model].EV, tr->partitionData[model].tipVector,
-						  tipX1, tipX2,
-						  width, left, right, wgt, &scalerIncrement);
 #endif
 			    }
 			}
@@ -1595,4277 +978,6 @@
 
 /* optimized function implementations */
 
-#if (defined(_OPTIMIZED_FUNCTIONS) && !defined(__AVX))
-
-static void newviewGTRGAMMA_GAPPED_SAVE(int tipCase,
-					double *x1_start, double *x2_start, double *x3_start,
-					double *EV, double *tipVector,
-					unsigned char *tipX1, unsigned char *tipX2,
-					const int n, double *left, double *right, int *wgt, int *scalerIncrement, 
-					unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, 
-					double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn)
-{
-  int     
-    i, 
-    j, 
-    k, 
-    l,
-    addScale = 0, 
-    scaleGap = 0;
-  
-  double
-    *x1,
-    *x2,
-    *x3,
-    *x1_ptr = x1_start,
-    *x2_ptr = x2_start,       
-    max,
-    maxima[2] __attribute__ ((aligned (BYTE_ALIGNMENT))),        
-    EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT)));      
-    
-  __m128d 
-    values[8],
-    EVV[8];  
-
-  for(k = 0; k < 4; k++)
-    for (l=0; l < 4; l++)
-      EV_t[4 * l + k] = EV[4 * k + l];
-
-  for(k = 0; k < 8; k++)
-    EVV[k] = _mm_load_pd(&EV_t[k * 2]);      
- 
-  
-
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      {
-	double *uX1, umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT))), *uX2, umpX2[256] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-
-
-	for (i = 1; i < 16; i++)
-	  {	    
-	    __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
-	    __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));	   
-
-	    for (j = 0; j < 4; j++)
-	      for (k = 0; k < 4; k++)
-		{			 	 
-		  __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
-		  __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
-		  
-		  __m128d acc = _mm_setzero_pd();
-
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
-		  		  
-		  acc = _mm_hadd_pd(acc, acc);
-		  _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);
-		}
-	  
-	    for (j = 0; j < 4; j++)
-	      for (k = 0; k < 4; k++)
-		{
-		  __m128d left1 = _mm_load_pd(&right[j*16 + k*4]);
-		  __m128d left2 = _mm_load_pd(&right[j*16 + k*4 + 2]);
-		  
-		  __m128d acc = _mm_setzero_pd();
-
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
-		  		  
-		  acc = _mm_hadd_pd(acc, acc);
-		  _mm_storel_pd(&umpX2[i*16 + j*4 + k], acc);
-		 
-		}
-	  }   		  
-	
-	uX1 = &umpX1[240];
-	uX2 = &umpX2[240];	   	    	    
-	
-	for (j = 0; j < 4; j++)
-	  {				 		  		  		   
-	    __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-	    __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
-	    	    
-	    __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
-	    __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
-	    
-	    __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
-	    __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );		    		    		   
-	    
-	    __m128d EV_t_l0_k0 = EVV[0];
-	    __m128d EV_t_l0_k2 = EVV[1];
-	    __m128d EV_t_l1_k0 = EVV[2];
-	    __m128d EV_t_l1_k2 = EVV[3];
-	    __m128d EV_t_l2_k0 = EVV[4];
-	    __m128d EV_t_l2_k2 = EVV[5];
-	    __m128d EV_t_l3_k0 = EVV[6]; 
-	    __m128d EV_t_l3_k2 = EVV[7];
-	    
-	    EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	    EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	    
-	    EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	    EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	    
-	    EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	    
-	    EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	    EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	    
-	    EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	    EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	    EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	    
-	    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-	    	  
-	    _mm_store_pd( &x3_gapColumn[j * 4 + 0], EV_t_l0_k0 );
-	    _mm_store_pd( &x3_gapColumn[j * 4 + 2], EV_t_l2_k0 );	   
-	  }  
-	
-       
-	x3 = x3_start;
-	
-	for (i = 0; i < n; i++)
-	  {	    
-	    if(!(x3_gap[i / 32] & mask32[i % 32]))	     
-	      {
-		uX1 = &umpX1[16 * tipX1[i]];
-		uX2 = &umpX2[16 * tipX2[i]];	   	    	    		
-		
-		for (j = 0; j < 4; j++)
-		  {				 		  		  		   
-		    __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-		    __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
-		    
-		    
-		    __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
-		    __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
-		    
-		    
-		    //
-		    // multiply left * right
-		    //
-		    
-		    __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
-		    __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );
-		    
-		    
-		    //
-		    // multiply with EV matrix (!?)
-		    //
-		    
-		    __m128d EV_t_l0_k0 = EVV[0];
-		    __m128d EV_t_l0_k2 = EVV[1];
-		    __m128d EV_t_l1_k0 = EVV[2];
-		    __m128d EV_t_l1_k2 = EVV[3];
-		    __m128d EV_t_l2_k0 = EVV[4];
-		    __m128d EV_t_l2_k2 = EVV[5];
-		    __m128d EV_t_l3_k0 = EVV[6]; 
-		    __m128d EV_t_l3_k2 = EVV[7];
-		    
-		    EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-		    EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-		    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-		    
-		    EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-		    EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-		    
-		    EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-		    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-		    
-		    EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-		    EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-		    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-		    
-		    EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-		    EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-		    EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-		    
-		    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-		    
-		    _mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 );
-		    _mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 );
-		  }
-		
-		x3 += 16;
-	      }
-	  }
-      }
-      break;
-    case TIP_INNER:
-      {	
-	double 
-	  *uX1, 
-	  umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT)));		 
-
-	for (i = 1; i < 16; i++)
-	  {
-	    __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
-	    __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));	   
-
-	    for (j = 0; j < 4; j++)
-	      for (k = 0; k < 4; k++)
-		{		 
-		  __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
-		  __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
-		  
-		  __m128d acc = _mm_setzero_pd();
-
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
-		  		  
-		  acc = _mm_hadd_pd(acc, acc);
-		  _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);		 
-		}
-	  }
-
-	{
-	  __m128d maxv =_mm_setzero_pd();
-	  
-	  scaleGap = 0;
-	  
-	  x2 = x2_gapColumn;			 
-	  x3 = x3_gapColumn;
-	  
-	  uX1 = &umpX1[240];	     
-	  
-	  for (j = 0; j < 4; j++)
-	    {		     		   
-	      double *x2_p = &x2[j*4];
-	      double *right_k0_p = &right[j*16];
-	      double *right_k1_p = &right[j*16 + 1*4];
-	      double *right_k2_p = &right[j*16 + 2*4];
-	      double *right_k3_p = &right[j*16 + 3*4];
-	      __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-	      __m128d x2_2 = _mm_load_pd( &x2_p[2] );
-	      
-	      __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-	      __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-	      __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-	      __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-	      __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-	      __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-	      __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-	      __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
-	      	      
-	      right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	      right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	      
-	      right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	      right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	      
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	      right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	      	       
-	      right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	      right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	      	       
-	      right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	      right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	      
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	      right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
-	      
-	      __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-	      __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
-	      
-	      __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
-	      __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
-	      
-	      __m128d EV_t_l0_k0 = EVV[0];
-	      __m128d EV_t_l0_k2 = EVV[1];
-	      __m128d EV_t_l1_k0 = EVV[2];
-	      __m128d EV_t_l1_k2 = EVV[3];
-	      __m128d EV_t_l2_k0 = EVV[4];
-	      __m128d EV_t_l2_k2 = EVV[5];
-	      __m128d EV_t_l3_k0 = EVV[6]; 
-	      __m128d EV_t_l3_k2 = EVV[7];
-	      
-	      EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	      EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	      
-	      EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	      EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	      
-	      EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	      
-	      EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	      EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	      
-	      EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	      EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	      EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	      
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-	      
-	      values[j * 2]     = EV_t_l0_k0;
-	      values[j * 2 + 1] = EV_t_l2_k0;		   		   
-	      
-	      maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-	      maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));		   	     		   
-	    }
-
-	  
-	  _mm_store_pd(maxima, maxv);
-		 
-	  max = MAX(maxima[0], maxima[1]);
-	  
-	  if(max < minlikelihood)
-	    {
-	      scaleGap = 1;
-	      
-	      __m128d sv = _mm_set1_pd(twotothe256);
-	      
-	      _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-	      _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-	      _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-	      _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-	      _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-	      _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-	      _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-	      _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     	      	     
-	    }
-	  else
-	    {
-	      _mm_store_pd(&x3[0], values[0]);	   
-	      _mm_store_pd(&x3[2], values[1]);
-	      _mm_store_pd(&x3[4], values[2]);
-	      _mm_store_pd(&x3[6], values[3]);
-	      _mm_store_pd(&x3[8], values[4]);	   
-	      _mm_store_pd(&x3[10], values[5]);
-	      _mm_store_pd(&x3[12], values[6]);
-	      _mm_store_pd(&x3[14], values[7]);
-	    }
-	}		       	
-      	
-	x3 = x3_start;
-
-	for (i = 0; i < n; i++)
-	   {
-	     if((x3_gap[i / 32] & mask32[i % 32]))
-	       {	       
-		 if(scaleGap)
-		   {		    
-		       addScale += wgt[i];		     
-		   }
-	       }
-	     else
-	       {				 
-		 __m128d maxv =_mm_setzero_pd();		 
-		 
-		 if(x2_gap[i / 32] & mask32[i % 32])
-		   x2 = x2_gapColumn;
-		 else
-		   {
-		     x2 = x2_ptr;
-		     x2_ptr += 16;
-		   }
-		 		 		 
-		 uX1 = &umpX1[16 * tipX1[i]];	     
-		 
-		 
-		 for (j = 0; j < 4; j++)
-		   {		     		   
-		     double *x2_p = &x2[j*4];
-		     double *right_k0_p = &right[j*16];
-		     double *right_k1_p = &right[j*16 + 1*4];
-		     double *right_k2_p = &right[j*16 + 2*4];
-		     double *right_k3_p = &right[j*16 + 3*4];
-		     __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-		     __m128d x2_2 = _mm_load_pd( &x2_p[2] );
-		     
-		     __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-		     __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-		     __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-		     __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-		     __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-		     __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-		     __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-		     __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
-		     
-		     		     
-		     right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-		     right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-		     
-		     right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-		     right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-		     
-		     right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-		     right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-		     right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-		     
-		     
-		     right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-		     right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-		     
-		     
-		     right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-		     right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-		     
-		     right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-		     right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-		     right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
-		     
-		     {
-		       //
-		       // load left side from tip vector
-		       //
-		       
-		       __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-		       __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
-		       
-		       
-		       //
-		       // multiply left * right
-			   //
-		       
-		       __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
-		       __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
-		       
-		       
-		       //
-		       // multiply with EV matrix (!?)
-		       //		   		  
-		       
-		       __m128d EV_t_l0_k0 = EVV[0];
-		       __m128d EV_t_l0_k2 = EVV[1];
-		       __m128d EV_t_l1_k0 = EVV[2];
-		       __m128d EV_t_l1_k2 = EVV[3];
-		       __m128d EV_t_l2_k0 = EVV[4];
-		       __m128d EV_t_l2_k2 = EVV[5];
-		       __m128d EV_t_l3_k0 = EVV[6]; 
-		       __m128d EV_t_l3_k2 = EVV[7];
-		       
-		       
-		       EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-		       EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-		       EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-		       
-		       EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-		       EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-		       
-		       EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-		       EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-		       
-		       EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-		       EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-		       EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-		       
-		       EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-		       EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-		       EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-		       
-		       EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-		       
-		       values[j * 2]     = EV_t_l0_k0;
-		       values[j * 2 + 1] = EV_t_l2_k0;		   		   
-			   
-		       maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-		       maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));		   
-		     }		   
-		   }
-
-	     
-		 _mm_store_pd(maxima, maxv);
-		 
-		 max = MAX(maxima[0], maxima[1]);
-		 
-		 if(max < minlikelihood)
-		   {
-		     __m128d sv = _mm_set1_pd(twotothe256);
-		     
-		     _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-		     _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-		     _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-		     _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-		     _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-		     _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-		     _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-		     _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     
-		     
-		     
-		     addScale += wgt[i];
-		    
-		   }
-		 else
-		   {
-		     _mm_store_pd(&x3[0], values[0]);	   
-		     _mm_store_pd(&x3[2], values[1]);
-		     _mm_store_pd(&x3[4], values[2]);
-		     _mm_store_pd(&x3[6], values[3]);
-		     _mm_store_pd(&x3[8], values[4]);	   
-		     _mm_store_pd(&x3[10], values[5]);
-		     _mm_store_pd(&x3[12], values[6]);
-		     _mm_store_pd(&x3[14], values[7]);
-		   }		 
-		 
-		 x3 += 16;
-	       }
-	   }
-      }
-      break;
-    case INNER_INNER:         
-      {
-	__m128d maxv =_mm_setzero_pd();
-	
-	scaleGap = 0;
-	
-	x1 = x1_gapColumn;	     	    
-	x2 = x2_gapColumn;	    
-	x3 = x3_gapColumn;
-	
-	for (j = 0; j < 4; j++)
-	  {
-	    
-	    double *x1_p = &x1[j*4];
-	    double *left_k0_p = &left[j*16];
-	    double *left_k1_p = &left[j*16 + 1*4];
-	    double *left_k2_p = &left[j*16 + 2*4];
-	    double *left_k3_p = &left[j*16 + 3*4];
-	    
-	    __m128d x1_0 = _mm_load_pd( &x1_p[0] );
-	    __m128d x1_2 = _mm_load_pd( &x1_p[2] );
-	    
-	    __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
-	    __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
-	    __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
-	    __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
-	    __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
-	    __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
-	    __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
-	    __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
-	    
-	    left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	    left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	    
-	    left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	    left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	    
-	    left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	    left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	    left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	    
-	    left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	    left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	    
-	    left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	    left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	    
-	    left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	    left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	    left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	    
-	    
-	    double *x2_p = &x2[j*4];
-	    double *right_k0_p = &right[j*16];
-	    double *right_k1_p = &right[j*16 + 1*4];
-	    double *right_k2_p = &right[j*16 + 2*4];
-	    double *right_k3_p = &right[j*16 + 3*4];
-	    __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-	    __m128d x2_2 = _mm_load_pd( &x2_p[2] );
-	    
-	    __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-	    __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-	    __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-	    __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-	    __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-	    __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-	    __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-	    __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
-	    
-	    right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	    right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	    
-	    right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	    right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	    
-	    right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	    right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	    right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	    
-	    right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	    right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	    	    
-	    right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	    right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	    
-	    right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	    right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	    right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   		 		
-	    
-	    __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	    __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );		 		 	   
-	    
-	    __m128d EV_t_l0_k0 = EVV[0];
-	    __m128d EV_t_l0_k2 = EVV[1];
-	    __m128d EV_t_l1_k0 = EVV[2];
-	    __m128d EV_t_l1_k2 = EVV[3];
-	    __m128d EV_t_l2_k0 = EVV[4];
-	    __m128d EV_t_l2_k2 = EVV[5];
-	    __m128d EV_t_l3_k0 = EVV[6]; 
-	    __m128d EV_t_l3_k2 = EVV[7];
-	    
-	    EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	    EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	    
-	    EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	    EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	    
-	    EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	    
-	    EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	    EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	    
-	    EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	    EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	    EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	    
-	    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-	    
-	    
-	    values[j * 2] = EV_t_l0_k0;
-	    values[j * 2 + 1] = EV_t_l2_k0;            	   	    
-	    
-	    maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-	    maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
-	  }
-		     
-	_mm_store_pd(maxima, maxv);
-	
-	max = MAX(maxima[0], maxima[1]);
-	
-	if(max < minlikelihood)
-	  {
-	    __m128d sv = _mm_set1_pd(twotothe256);
-	    
-	    scaleGap = 1;
-	    
-	    _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-	    _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-	    _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-	    _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-	    _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-	    _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-	    _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-	    _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     	    	 
-	  }
-	else
-	  {
-	    _mm_store_pd(&x3[0], values[0]);	   
-	    _mm_store_pd(&x3[2], values[1]);
-	    _mm_store_pd(&x3[4], values[2]);
-	    _mm_store_pd(&x3[6], values[3]);
-	    _mm_store_pd(&x3[8], values[4]);	   
-	    _mm_store_pd(&x3[10], values[5]);
-	    _mm_store_pd(&x3[12], values[6]);
-	    _mm_store_pd(&x3[14], values[7]);
-	  }
-      }
-
-     
-      x3 = x3_start;
-
-     for (i = 0; i < n; i++)
-       { 
-	 if(x3_gap[i / 32] & mask32[i % 32])
-	   {	     
-	     if(scaleGap)
-	       {		 
-		 addScale += wgt[i];		 	       
-	       }
-	   }
-	 else
-	   {
-	     __m128d maxv =_mm_setzero_pd();	     	    
-	     
-	     if(x1_gap[i / 32] & mask32[i % 32])
-	       x1 = x1_gapColumn;
-	     else
-	       {
-		 x1 = x1_ptr;
-		 x1_ptr += 16;
-	       }
-	     
-	     if(x2_gap[i / 32] & mask32[i % 32])
-	       x2 = x2_gapColumn;
-	     else
-	       {
-		 x2 = x2_ptr;
-		 x2_ptr += 16;
-	       }
-	     
-	     
-	     for (j = 0; j < 4; j++)
-	       {
-		 
-		 double *x1_p = &x1[j*4];
-		 double *left_k0_p = &left[j*16];
-		 double *left_k1_p = &left[j*16 + 1*4];
-		 double *left_k2_p = &left[j*16 + 2*4];
-		 double *left_k3_p = &left[j*16 + 3*4];
-		 
-		 __m128d x1_0 = _mm_load_pd( &x1_p[0] );
-		 __m128d x1_2 = _mm_load_pd( &x1_p[2] );
-		 
-		 __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
-		 __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
-		 __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
-		 __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
-		 __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
-		 __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
-		 __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
-		 __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
-		 
-		 left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-		 left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-		 
-		 left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-		 left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-		 
-		 left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-		 left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-		 left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-		 
-		 left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-		 left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-		 
-		 left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-		 left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-		 
-		 left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-		 left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-		 left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-		 
-		 
-		 //
-		 // multiply/add right side
-		 //
-		 double *x2_p = &x2[j*4];
-		 double *right_k0_p = &right[j*16];
-		 double *right_k1_p = &right[j*16 + 1*4];
-		 double *right_k2_p = &right[j*16 + 2*4];
-		 double *right_k3_p = &right[j*16 + 3*4];
-		 __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-		 __m128d x2_2 = _mm_load_pd( &x2_p[2] );
-		 
-		 __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-		 __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-		 __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-		 __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-		 __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-		 __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-		 __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-		 __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
-		 
-		 right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-		 right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-		 
-		 right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-		 right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-		 
-		 right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-		 right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-		 right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-		 
-		 right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-		 right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-		 
-		 
-		 right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-		 right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-		 
-		 right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-		 right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-		 right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-		 
-		 //
-		 // multiply left * right
-		 //
-		 
-		 __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-		 __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-		 
-		 
-		 //
-		 // multiply with EV matrix (!?)
-		 //	     
-		 
-		 __m128d EV_t_l0_k0 = EVV[0];
-		 __m128d EV_t_l0_k2 = EVV[1];
-		 __m128d EV_t_l1_k0 = EVV[2];
-		 __m128d EV_t_l1_k2 = EVV[3];
-		 __m128d EV_t_l2_k0 = EVV[4];
-		 __m128d EV_t_l2_k2 = EVV[5];
-		 __m128d EV_t_l3_k0 = EVV[6]; 
-		 __m128d EV_t_l3_k2 = EVV[7];
-		 
-		 
-		 EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-		 EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-		 EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-		 
-		 EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-		 EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-		 
-		 EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-		 EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-		 
-		 EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-		 EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-		 EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-		 
-		 
-		 EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-		 EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-		 EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-		 
-		 EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-		 
-		 
-		 values[j * 2] = EV_t_l0_k0;
-		 values[j * 2 + 1] = EV_t_l2_k0;            	   	    
-		 
-		 maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-		 maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
-	       }
-	     
-	     
-	     _mm_store_pd(maxima, maxv);
-	     
-	     max = MAX(maxima[0], maxima[1]);
-	     
-	     if(max < minlikelihood)
-	       {
-		 __m128d sv = _mm_set1_pd(twotothe256);
-		 
-		 _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-		 _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-		 _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-		 _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-		 _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-		 _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-		 _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-		 _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     
-		 
-		 
-		 addScale += wgt[i];
-		
-	       }
-	     else
-	       {
-		 _mm_store_pd(&x3[0], values[0]);	   
-		 _mm_store_pd(&x3[2], values[1]);
-		 _mm_store_pd(&x3[4], values[2]);
-		 _mm_store_pd(&x3[6], values[3]);
-		 _mm_store_pd(&x3[8], values[4]);	   
-		 _mm_store_pd(&x3[10], values[5]);
-		 _mm_store_pd(&x3[12], values[6]);
-		 _mm_store_pd(&x3[14], values[7]);
-	       }	 
-
-	    
-		 
-	     x3 += 16;
-
-	   }
-       }
-     break;
-    default:
-      assert(0);
-    }
-  
- 
-  *scalerIncrement = addScale;
-}
-
-
-static void newviewGTRGAMMA(int tipCase,
-			    double *x1_start, double *x2_start, double *x3_start,
-			    double *EV, double *tipVector,
-			    unsigned char *tipX1, unsigned char *tipX2,
-			    const int n, double *left, double *right, int *wgt, int *scalerIncrement
-			    )
-{
-  int 
-    i, 
-    j, 
-    k, 
-    l,
-    addScale = 0;
-  
-  double
-    *x1,
-    *x2,
-    *x3,
-    max,
-    maxima[2] __attribute__ ((aligned (BYTE_ALIGNMENT))),       
-    EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT)));      
-    
-  __m128d 
-    values[8],
-    EVV[8];  
-
-  for(k = 0; k < 4; k++)
-    for (l=0; l < 4; l++)
-      EV_t[4 * l + k] = EV[4 * k + l];
-
-  for(k = 0; k < 8; k++)
-    EVV[k] = _mm_load_pd(&EV_t[k * 2]);
-   
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      {
-	double *uX1, umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT))), *uX2, umpX2[256] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-
-
-	for (i = 1; i < 16; i++)
-	  {
-	    __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
-	    __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));	   
-
-	    for (j = 0; j < 4; j++)
-	      for (k = 0; k < 4; k++)
-		{		 
-		  __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
-		  __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
-		  
-		  __m128d acc = _mm_setzero_pd();
-
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
-		  		  
-		  acc = _mm_hadd_pd(acc, acc);
-		  _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);
-		}
-	  
-	    for (j = 0; j < 4; j++)
-	      for (k = 0; k < 4; k++)
-		{
-		  __m128d left1 = _mm_load_pd(&right[j*16 + k*4]);
-		  __m128d left2 = _mm_load_pd(&right[j*16 + k*4 + 2]);
-		  
-		  __m128d acc = _mm_setzero_pd();
-
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
-		  		  
-		  acc = _mm_hadd_pd(acc, acc);
-		  _mm_storel_pd(&umpX2[i*16 + j*4 + k], acc);
-		 
-		}
-	  }   	
-	  
-	for (i = 0; i < n; i++)
-	  {
-	    x3 = &x3_start[i * 16];
-
-	    
-	    uX1 = &umpX1[16 * tipX1[i]];
-	    uX2 = &umpX2[16 * tipX2[i]];	   	    	    
-	    
-	    for (j = 0; j < 4; j++)
-	       {				 		  		  		   
-		 __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-		 __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
-		 				  
-		   
-		 __m128d uX2_k0_sse = _mm_load_pd( &uX2[j * 4] );
-		 __m128d uX2_k2_sse = _mm_load_pd( &uX2[j * 4 + 2] );
- 		 
-
-		 //
-		 // multiply left * right
-		 //
-		 
-		 __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, uX2_k0_sse );
-		 __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, uX2_k2_sse );
-		 
-		 
-		 //
-		 // multiply with EV matrix (!?)
-		 //
-		 
-		 __m128d EV_t_l0_k0 = EVV[0];
-		 __m128d EV_t_l0_k2 = EVV[1];
-		 __m128d EV_t_l1_k0 = EVV[2];
-		 __m128d EV_t_l1_k2 = EVV[3];
-		 __m128d EV_t_l2_k0 = EVV[4];
-		 __m128d EV_t_l2_k2 = EVV[5];
-		 __m128d EV_t_l3_k0 = EVV[6]; 
-		 __m128d EV_t_l3_k2 = EVV[7];
-		 
-		 EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-		 EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-		 EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-		 
-		 EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-		 EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-		 
-		 EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-		 EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-		 
-		 EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-		 EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-		 EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-		 
-		 EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-		 EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-		 EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-		 
-		 EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-		 
-		 _mm_store_pd( &x3[j * 4 + 0], EV_t_l0_k0 );
-		 _mm_store_pd( &x3[j * 4 + 2], EV_t_l2_k0 );
-	       }
-	  }
-      }
-      break;
-    case TIP_INNER:
-      {	
-	double *uX1, umpX1[256] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-
-
-	for (i = 1; i < 16; i++)
-	  {
-	    __m128d x1_1 = _mm_load_pd(&(tipVector[i*4]));
-	    __m128d x1_2 = _mm_load_pd(&(tipVector[i*4 + 2]));	   
-
-	    for (j = 0; j < 4; j++)
-	      for (k = 0; k < 4; k++)
-		{		 
-		  __m128d left1 = _mm_load_pd(&left[j*16 + k*4]);
-		  __m128d left2 = _mm_load_pd(&left[j*16 + k*4 + 2]);
-		  
-		  __m128d acc = _mm_setzero_pd();
-
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left1, x1_1));
-		  acc = _mm_add_pd(acc, _mm_mul_pd(left2, x1_2));
-		  		  
-		  acc = _mm_hadd_pd(acc, acc);
-		  _mm_storel_pd(&umpX1[i*16 + j*4 + k], acc);		 
-		}
-	  }
-
-	 for (i = 0; i < n; i++)
-	   {
-	     __m128d maxv =_mm_setzero_pd();
-	     
-	     x2 = &x2_start[i * 16];
-	     x3 = &x3_start[i * 16];
-
-	     uX1 = &umpX1[16 * tipX1[i]];	     
-
-	     for (j = 0; j < 4; j++)
-	       {
-
-		 //
-		 // multiply/add right side
-		 //
-		 double *x2_p = &x2[j*4];
-		 double *right_k0_p = &right[j*16];
-		 double *right_k1_p = &right[j*16 + 1*4];
-		 double *right_k2_p = &right[j*16 + 2*4];
-		 double *right_k3_p = &right[j*16 + 3*4];
-		 __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-		 __m128d x2_2 = _mm_load_pd( &x2_p[2] );
-
-		 __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-		 __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-		 __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-		 __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-		 __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-		 __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-		 __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-		 __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
-
-
-
-		 right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-		 right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-
-		 right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-		 right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-
-		 right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-		 right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-		 right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-
-
-		 right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-		 right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-
-
-		 right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-		 right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-
-		 right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-		 right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-		 right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);
-
-		 {
-		   //
-		   // load left side from tip vector
-		   //
-		   
-		   __m128d uX1_k0_sse = _mm_load_pd( &uX1[j * 4] );
-		   __m128d uX1_k2_sse = _mm_load_pd( &uX1[j * 4 + 2] );
-		 
-		 
-		   //
-		   // multiply left * right
-		   //
-		   
-		   __m128d x1px2_k0 = _mm_mul_pd( uX1_k0_sse, right_k0_0 );
-		   __m128d x1px2_k2 = _mm_mul_pd( uX1_k2_sse, right_k2_0 );
-		   
-		   
-		   //
-		   // multiply with EV matrix (!?)
-		   //		   		  
-
-		   __m128d EV_t_l0_k0 = EVV[0];
-		   __m128d EV_t_l0_k2 = EVV[1];
-		   __m128d EV_t_l1_k0 = EVV[2];
-		   __m128d EV_t_l1_k2 = EVV[3];
-		   __m128d EV_t_l2_k0 = EVV[4];
-		   __m128d EV_t_l2_k2 = EVV[5];
-		   __m128d EV_t_l3_k0 = EVV[6]; 
-		   __m128d EV_t_l3_k2 = EVV[7];
-
-		   
-		   EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-		   EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-		   EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-		   
-		   EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-		   EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-		   
-		   EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-		   EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-		   
-		   EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-		   EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-		   EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-		   		   
-		   EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-		   EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-		   EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-		   
-		   EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-		   
-		   values[j * 2]     = EV_t_l0_k0;
-		   values[j * 2 + 1] = EV_t_l2_k0;		   		   
-		   
-		   maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-		   maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));		   
-		 }
-	       }
-
-	     
-	     _mm_store_pd(maxima, maxv);
-
-	     max = MAX(maxima[0], maxima[1]);
-
-	     if(max < minlikelihood)
-	       {
-		 __m128d sv = _mm_set1_pd(twotothe256);
-	       		       	   	 	     
-		 _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-		 _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-		 _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-		 _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-		 _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-		 _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-		 _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-		 _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     
-		 
-		 
-		 addScale += wgt[i];
-		 
-	       }
-	     else
-	       {
-		 _mm_store_pd(&x3[0], values[0]);	   
-		 _mm_store_pd(&x3[2], values[1]);
-		 _mm_store_pd(&x3[4], values[2]);
-		 _mm_store_pd(&x3[6], values[3]);
-		 _mm_store_pd(&x3[8], values[4]);	   
-		 _mm_store_pd(&x3[10], values[5]);
-		 _mm_store_pd(&x3[12], values[6]);
-		 _mm_store_pd(&x3[14], values[7]);
-	       }
-	   }
-      }
-      break;
-    case INNER_INNER:     
-     for (i = 0; i < n; i++)
-       {
-	 __m128d maxv =_mm_setzero_pd();
-	 
-
-	 x1 = &x1_start[i * 16];
-	 x2 = &x2_start[i * 16];
-	 x3 = &x3_start[i * 16];
-	 
-	 for (j = 0; j < 4; j++)
-	   {
-	     
-	     double *x1_p = &x1[j*4];
-	     double *left_k0_p = &left[j*16];
-	     double *left_k1_p = &left[j*16 + 1*4];
-	     double *left_k2_p = &left[j*16 + 2*4];
-	     double *left_k3_p = &left[j*16 + 3*4];
-	     
-	     __m128d x1_0 = _mm_load_pd( &x1_p[0] );
-	     __m128d x1_2 = _mm_load_pd( &x1_p[2] );
-	     
-	     __m128d left_k0_0 = _mm_load_pd( &left_k0_p[0] );
-	     __m128d left_k0_2 = _mm_load_pd( &left_k0_p[2] );
-	     __m128d left_k1_0 = _mm_load_pd( &left_k1_p[0] );
-	     __m128d left_k1_2 = _mm_load_pd( &left_k1_p[2] );
-	     __m128d left_k2_0 = _mm_load_pd( &left_k2_p[0] );
-	     __m128d left_k2_2 = _mm_load_pd( &left_k2_p[2] );
-	     __m128d left_k3_0 = _mm_load_pd( &left_k3_p[0] );
-	     __m128d left_k3_2 = _mm_load_pd( &left_k3_p[2] );
-	     
-	     left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	     left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	     
-	     left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	     left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	     
-	     left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	     left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	     left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	     
-	     left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	     left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	     
-	     left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	     left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	     
-	     left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	     left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	     left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	     
-	     
-	     //
-	     // multiply/add right side
-	     //
-	     double *x2_p = &x2[j*4];
-	     double *right_k0_p = &right[j*16];
-	     double *right_k1_p = &right[j*16 + 1*4];
-	     double *right_k2_p = &right[j*16 + 2*4];
-	     double *right_k3_p = &right[j*16 + 3*4];
-	     __m128d x2_0 = _mm_load_pd( &x2_p[0] );
-	     __m128d x2_2 = _mm_load_pd( &x2_p[2] );
-	     
-	     __m128d right_k0_0 = _mm_load_pd( &right_k0_p[0] );
-	     __m128d right_k0_2 = _mm_load_pd( &right_k0_p[2] );
-	     __m128d right_k1_0 = _mm_load_pd( &right_k1_p[0] );
-	     __m128d right_k1_2 = _mm_load_pd( &right_k1_p[2] );
-	     __m128d right_k2_0 = _mm_load_pd( &right_k2_p[0] );
-	     __m128d right_k2_2 = _mm_load_pd( &right_k2_p[2] );
-	     __m128d right_k3_0 = _mm_load_pd( &right_k3_p[0] );
-	     __m128d right_k3_2 = _mm_load_pd( &right_k3_p[2] );
-	     
-	     right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	     right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	     
-	     right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	     right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	     
-	     right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	     right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	     right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	     
-	     right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	     right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	     
-	     
-	     right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	     right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	     
-	     right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	     right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	     right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-
-             //
-             // multiply left * right
-             //
-
-	     __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	     __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-
-
-             //
-             // multiply with EV matrix (!?)
-             //	     
-
-	     __m128d EV_t_l0_k0 = EVV[0];
-	     __m128d EV_t_l0_k2 = EVV[1];
-	     __m128d EV_t_l1_k0 = EVV[2];
-	     __m128d EV_t_l1_k2 = EVV[3];
-	     __m128d EV_t_l2_k0 = EVV[4];
-	     __m128d EV_t_l2_k2 = EVV[5];
-	     __m128d EV_t_l3_k0 = EVV[6]; 
-	     __m128d EV_t_l3_k2 = EVV[7];
-
-
-	    EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	    EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-
-	    EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	    EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-
-	    EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-
-	    EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	    EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-
-
-	    EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-            EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-            EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-
-            EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );
-
-	    
-	    values[j * 2] = EV_t_l0_k0;
-	    values[j * 2 + 1] = EV_t_l2_k0;            	   	    
-
-	    maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l0_k0, absMask.m));
-	    maxv = _mm_max_pd(maxv, _mm_and_pd(EV_t_l2_k0, absMask.m));
-           }
-	 	 
-	 
-	 _mm_store_pd(maxima, maxv);
-	 
-	 max = MAX(maxima[0], maxima[1]);
-	 
-	 if(max < minlikelihood)
-	   {
-	     __m128d sv = _mm_set1_pd(twotothe256);
-	       		       	   	 	     
-	     _mm_store_pd(&x3[0], _mm_mul_pd(values[0], sv));	   
-	     _mm_store_pd(&x3[2], _mm_mul_pd(values[1], sv));
-	     _mm_store_pd(&x3[4], _mm_mul_pd(values[2], sv));
-	     _mm_store_pd(&x3[6], _mm_mul_pd(values[3], sv));
-	     _mm_store_pd(&x3[8], _mm_mul_pd(values[4], sv));	   
-	     _mm_store_pd(&x3[10], _mm_mul_pd(values[5], sv));
-	     _mm_store_pd(&x3[12], _mm_mul_pd(values[6], sv));
-	     _mm_store_pd(&x3[14], _mm_mul_pd(values[7], sv));	     
-	     
-	    
-	     addScale += wgt[i];
-	    
-	   }
-	 else
-	   {
-	     _mm_store_pd(&x3[0], values[0]);	   
-	     _mm_store_pd(&x3[2], values[1]);
-	     _mm_store_pd(&x3[4], values[2]);
-	     _mm_store_pd(&x3[6], values[3]);
-	     _mm_store_pd(&x3[8], values[4]);	   
-	     _mm_store_pd(&x3[10], values[5]);
-	     _mm_store_pd(&x3[12], values[6]);
-	     _mm_store_pd(&x3[14], values[7]);
-	   }	 
-       }
-   
-     break;
-    default:
-      assert(0);
-    }
-  
- 
-  *scalerIncrement = addScale;
-
-}
-static void newviewGTRCAT( int tipCase,  double *EV,  int *cptr,
-			   double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
-			   unsigned char *tipX1, unsigned char *tipX2,
-			   int n,  double *left, double *right, int *wgt, int *scalerIncrement)
-{
-  double
-    *le,
-    *ri,
-    *x1,
-    *x2, 
-    *x3, 
-    EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-    
-  int 
-    i, 
-    j, 
-    scale, 
-    addScale = 0;
-   
-  __m128d
-    minlikelihood_sse = _mm_set1_pd( minlikelihood ),
-    sc = _mm_set1_pd(twotothe256),
-    EVV[8];  
-  
-  for(i = 0; i < 4; i++)
-    for (j=0; j < 4; j++)
-      EV_t[4 * j + i] = EV[4 * i + j];
-  
-  for(i = 0; i < 8; i++)
-    EVV[i] = _mm_load_pd(&EV_t[i * 2]);
-  
-  switch(tipCase)
-    {
-    case TIP_TIP:      
-      for (i = 0; i < n; i++)
-	{	 
-	  x1 = &(tipVector[4 * tipX1[i]]);
-	  x2 = &(tipVector[4 * tipX2[i]]);
-	  
-	  x3 = &x3_start[i * 4];
-	  
-	  le =  &left[cptr[i] * 16];
-	  ri =  &right[cptr[i] * 16];
-	  
-	  __m128d x1_0 = _mm_load_pd( &x1[0] );
-	  __m128d x1_2 = _mm_load_pd( &x1[2] );
-	  
-	  __m128d left_k0_0 = _mm_load_pd( &le[0] );
-	  __m128d left_k0_2 = _mm_load_pd( &le[2] );
-	  __m128d left_k1_0 = _mm_load_pd( &le[4] );
-	  __m128d left_k1_2 = _mm_load_pd( &le[6] );
-	  __m128d left_k2_0 = _mm_load_pd( &le[8] );
-	  __m128d left_k2_2 = _mm_load_pd( &le[10] );
-	  __m128d left_k3_0 = _mm_load_pd( &le[12] );
-	  __m128d left_k3_2 = _mm_load_pd( &le[14] );
-	  
-	  left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	  left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	  
-	  left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	  left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	  
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	  left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	  
-	  left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	  left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	  
-	  left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	  left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	  
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	  left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	  
-	  __m128d x2_0 = _mm_load_pd( &x2[0] );
-	  __m128d x2_2 = _mm_load_pd( &x2[2] );
-	  
-	  __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-	  __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-	  __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-	  __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-	  __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-	  __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-	  __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-	  __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-	  
-	  right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	  right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	  
-	  right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	  right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	  
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	  right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	  
-	  right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	  right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	  
-	  right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	  right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	  
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	  right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-	  
-	  __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	  __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );	  	  
-
-	  __m128d EV_t_l0_k0 = EVV[0];
-	  __m128d EV_t_l0_k2 = EVV[1];
-	  __m128d EV_t_l1_k0 = EVV[2];
-	  __m128d EV_t_l1_k2 = EVV[3];
-	  __m128d EV_t_l2_k0 = EVV[4];
-	  __m128d EV_t_l2_k2 = EVV[5];
-	  __m128d EV_t_l3_k0 = EVV[6];
-	  __m128d EV_t_l3_k2 = EVV[7];
-	  
-	  EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	  EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	  
-	  EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	  EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	  
-	  EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	  
-	  EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	  EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	  	  
-	  EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	  EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	  EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	  
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	 
-	  	  
-	  _mm_store_pd(x3, EV_t_l0_k0);
-	  _mm_store_pd(&x3[2], EV_t_l2_k0);	  	 	   	    
-	}
-      break;
-    case TIP_INNER:      
-      for (i = 0; i < n; i++)
-	{
-	  x1 = &(tipVector[4 * tipX1[i]]);
-	  x2 = &x2_start[4 * i];
-	  x3 = &x3_start[4 * i];
-	  
-	  le =  &left[cptr[i] * 16];
-	  ri =  &right[cptr[i] * 16];
-
-	  __m128d x1_0 = _mm_load_pd( &x1[0] );
-	  __m128d x1_2 = _mm_load_pd( &x1[2] );
-	  
-	  __m128d left_k0_0 = _mm_load_pd( &le[0] );
-	  __m128d left_k0_2 = _mm_load_pd( &le[2] );
-	  __m128d left_k1_0 = _mm_load_pd( &le[4] );
-	  __m128d left_k1_2 = _mm_load_pd( &le[6] );
-	  __m128d left_k2_0 = _mm_load_pd( &le[8] );
-	  __m128d left_k2_2 = _mm_load_pd( &le[10] );
-	  __m128d left_k3_0 = _mm_load_pd( &le[12] );
-	  __m128d left_k3_2 = _mm_load_pd( &le[14] );
-	  
-	  left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	  left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	  
-	  left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	  left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	  
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	  left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	  
-	  left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	  left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	  
-	  left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	  left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	  
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	  left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	  
-	  __m128d x2_0 = _mm_load_pd( &x2[0] );
-	  __m128d x2_2 = _mm_load_pd( &x2[2] );
-	  
-	  __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-	  __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-	  __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-	  __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-	  __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-	  __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-	  __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-	  __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-	  
-	  right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	  right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	  
-	  right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	  right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	  
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	  right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	  
-	  right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	  right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	  
-	  right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	  right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	  
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	  right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-	  
-	  __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	  __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-	  
-	  __m128d EV_t_l0_k0 = EVV[0];
-	  __m128d EV_t_l0_k2 = EVV[1];
-	  __m128d EV_t_l1_k0 = EVV[2];
-	  __m128d EV_t_l1_k2 = EVV[3];
-	  __m128d EV_t_l2_k0 = EVV[4];
-	  __m128d EV_t_l2_k2 = EVV[5];
-	  __m128d EV_t_l3_k0 = EVV[6];
-	  __m128d EV_t_l3_k2 = EVV[7];
-	 
-	  
-	  EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	  EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	  
-	  EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	  EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	  
-	  EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	  
-	  EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	  EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	  	  
-	  EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	  EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	  EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	  
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  
-	 
-	  scale = 1;
-	  	  	  	    
-	  __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
-	  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	  if(_mm_movemask_pd( v1 ) != 3)
-	    scale = 0;
-	  else
-	    {
-	      v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
-	      v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	      if(_mm_movemask_pd( v1 ) != 3)
-		scale = 0;
-	    }
-	  	  
-	  if(scale)
-	    {		      
-	      _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
-	      _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));	      	      
-	      
-	      
-	      addScale += wgt[i];	  
-	    }	
-	  else
-	    {
-	      _mm_store_pd(x3, EV_t_l0_k0);
-	      _mm_store_pd(&x3[2], EV_t_l2_k0);
-	    }
-	 
-	  	  
-	}
-      break;
-    case INNER_INNER:
-      for (i = 0; i < n; i++)
-	{
-	  x1 = &x1_start[4 * i];
-	  x2 = &x2_start[4 * i];
-	  x3 = &x3_start[4 * i];
-	  
-	  le =  &left[cptr[i] * 16];
-	  ri =  &right[cptr[i] * 16];
-
-	  __m128d x1_0 = _mm_load_pd( &x1[0] );
-	  __m128d x1_2 = _mm_load_pd( &x1[2] );
-	  
-	  __m128d left_k0_0 = _mm_load_pd( &le[0] );
-	  __m128d left_k0_2 = _mm_load_pd( &le[2] );
-	  __m128d left_k1_0 = _mm_load_pd( &le[4] );
-	  __m128d left_k1_2 = _mm_load_pd( &le[6] );
-	  __m128d left_k2_0 = _mm_load_pd( &le[8] );
-	  __m128d left_k2_2 = _mm_load_pd( &le[10] );
-	  __m128d left_k3_0 = _mm_load_pd( &le[12] );
-	  __m128d left_k3_2 = _mm_load_pd( &le[14] );
-	  
-	  left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	  left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	  
-	  left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	  left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	  
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	  left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	  left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	  
-	  left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	  left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	  
-	  left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	  left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	  
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	  left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	  left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	  
-	  __m128d x2_0 = _mm_load_pd( &x2[0] );
-	  __m128d x2_2 = _mm_load_pd( &x2[2] );
-	  
-	  __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-	  __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-	  __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-	  __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-	  __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-	  __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-	  __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-	  __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-	  
-	  right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	  right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	  
-	  right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	  right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	  
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	  right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	  right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	  
-	  right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	  right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	  
-	  right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	  right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	  
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	  right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	  right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-	  
-	  __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	  __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-	  
-	  __m128d EV_t_l0_k0 = EVV[0];
-	  __m128d EV_t_l0_k2 = EVV[1];
-	  __m128d EV_t_l1_k0 = EVV[2];
-	  __m128d EV_t_l1_k2 = EVV[3];
-	  __m128d EV_t_l2_k0 = EVV[4];
-	  __m128d EV_t_l2_k2 = EVV[5];
-	  __m128d EV_t_l3_k0 = EVV[6];
-	  __m128d EV_t_l3_k2 = EVV[7];
-	 
-	  
-	  EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	  EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	  
-	  EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	  EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	  
-	  EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	  EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	  
-	  EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	  EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	  	  
-	  EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	  EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	  EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	  
-	  EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  	 
-
-	  scale = 1;
-	  	  
-	  __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
-	  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	  if(_mm_movemask_pd( v1 ) != 3)
-	    scale = 0;
-	  else
-	    {
-	      v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
-	      v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	      if(_mm_movemask_pd( v1 ) != 3)
-		scale = 0;
-	    }
-	  	  
-	  if(scale)
-	    {		      
-	      _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
-	      _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));	      	      
-	      
-	      
-	      addScale += wgt[i];	  
-	    }	
-	  else
-	    {
-	      _mm_store_pd(x3, EV_t_l0_k0);
-	      _mm_store_pd(&x3[2], EV_t_l2_k0);
-	    }
-	  	  
-	}
-      break;
-    default:
-      assert(0);
-    }
-
-  
-  *scalerIncrement = addScale;
-}
-
-
-
-static void newviewGTRCAT_SAVE( int tipCase,  double *EV,  int *cptr,
-				double *x1_start, double *x2_start,  double *x3_start, double *tipVector,
-				unsigned char *tipX1, unsigned char *tipX2,
-				int n,  double *left, double *right, int *wgt, int *scalerIncrement,
-				unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
-				double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats)
-{
-  double
-    *le,
-    *ri,
-    *x1,
-    *x2,
-    *x3,
-    *x1_ptr = x1_start,
-    *x2_ptr = x2_start, 
-    *x3_ptr = x3_start, 
-    EV_t[16] __attribute__ ((aligned (BYTE_ALIGNMENT)));
-    
-  int 
-    i, 
-    j, 
-    scale, 
-    scaleGap = 0,
-    addScale = 0;
-   
-  __m128d
-    minlikelihood_sse = _mm_set1_pd( minlikelihood ),
-    sc = _mm_set1_pd(twotothe256),
-    EVV[8];  
-  
-  for(i = 0; i < 4; i++)
-    for (j=0; j < 4; j++)
-      EV_t[4 * j + i] = EV[4 * i + j];
-  
-  for(i = 0; i < 8; i++)
-    EVV[i] = _mm_load_pd(&EV_t[i * 2]);
-  
-  {
-    x1 = x1_gapColumn;	      
-    x2 = x2_gapColumn;
-    x3 = x3_gapColumn;
-    
-    le =  &left[maxCats * 16];	     	 
-    ri =  &right[maxCats * 16];		   	  	  	  	         
-
-    __m128d x1_0 = _mm_load_pd( &x1[0] );
-    __m128d x1_2 = _mm_load_pd( &x1[2] );
-    
-    __m128d left_k0_0 = _mm_load_pd( &le[0] );
-    __m128d left_k0_2 = _mm_load_pd( &le[2] );
-    __m128d left_k1_0 = _mm_load_pd( &le[4] );
-    __m128d left_k1_2 = _mm_load_pd( &le[6] );
-    __m128d left_k2_0 = _mm_load_pd( &le[8] );
-    __m128d left_k2_2 = _mm_load_pd( &le[10] );
-    __m128d left_k3_0 = _mm_load_pd( &le[12] );
-    __m128d left_k3_2 = _mm_load_pd( &le[14] );
-    
-    left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-    left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-    
-    left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-    left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-    
-    left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-    left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-    left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-    
-    left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-    left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-    
-    left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-    left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-    
-    left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-    left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-    left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-    
-    __m128d x2_0 = _mm_load_pd( &x2[0] );
-    __m128d x2_2 = _mm_load_pd( &x2[2] );
-    
-    __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-    __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-    __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-    __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-    __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-    __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-    __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-    __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-    
-    right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-    right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-    
-    right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-    right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-    
-    right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-    right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-    right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-    
-    right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-    right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-    
-    right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-    right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-    
-    right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-    right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-    right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-    
-    __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-    __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-    
-    __m128d EV_t_l0_k0 = EVV[0];
-    __m128d EV_t_l0_k2 = EVV[1];
-    __m128d EV_t_l1_k0 = EVV[2];
-    __m128d EV_t_l1_k2 = EVV[3];
-    __m128d EV_t_l2_k0 = EVV[4];
-    __m128d EV_t_l2_k2 = EVV[5];
-    __m128d EV_t_l3_k0 = EVV[6];
-    __m128d EV_t_l3_k2 = EVV[7];
-        
-    EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-    EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-    
-    EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-    EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-    
-    EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-    EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-    
-    EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-    EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-    
-    EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-    EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-    EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-    
-    EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  
-	
-    if(tipCase != TIP_TIP)
-      {    
-	scale = 1;
-	      
-	__m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
-	v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	if(_mm_movemask_pd( v1 ) != 3)
-	  scale = 0;
-	else
-	  {
-	    v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
-	    v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	    if(_mm_movemask_pd( v1 ) != 3)
-	      scale = 0;
-	  }
-	
-	if(scale)
-	  {		      
-	    _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
-	    _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));	      	      
-	    
-	    scaleGap = TRUE;	   
-	  }	
-	else
-	  {
-	    _mm_store_pd(x3, EV_t_l0_k0);
-	    _mm_store_pd(&x3[2], EV_t_l2_k0);
-	  }
-      }
-    else
-      {
-	_mm_store_pd(x3, EV_t_l0_k0);
-	_mm_store_pd(&x3[2], EV_t_l2_k0);
-      }
-  }
-  
-
-  switch(tipCase)
-    {
-    case TIP_TIP:      
-      for (i = 0; i < n; i++)
-	{
-	  if(noGap(x3_gap, i))
-	    {
-	      x1 = &(tipVector[4 * tipX1[i]]);
-	      x2 = &(tipVector[4 * tipX2[i]]);
-	  
-	      x3 = x3_ptr;
-	  
-	      if(isGap(x1_gap, i))
-		le =  &left[maxCats * 16];
-	      else	  	  
-		le =  &left[cptr[i] * 16];	  
-	  
-	      if(isGap(x2_gap, i))
-		ri =  &right[maxCats * 16];
-	      else	 	  
-		ri =  &right[cptr[i] * 16];
-	  
-	      __m128d x1_0 = _mm_load_pd( &x1[0] );
-	      __m128d x1_2 = _mm_load_pd( &x1[2] );
-	      
-	      __m128d left_k0_0 = _mm_load_pd( &le[0] );
-	      __m128d left_k0_2 = _mm_load_pd( &le[2] );
-	      __m128d left_k1_0 = _mm_load_pd( &le[4] );
-	      __m128d left_k1_2 = _mm_load_pd( &le[6] );
-	      __m128d left_k2_0 = _mm_load_pd( &le[8] );
-	      __m128d left_k2_2 = _mm_load_pd( &le[10] );
-	      __m128d left_k3_0 = _mm_load_pd( &le[12] );
-	      __m128d left_k3_2 = _mm_load_pd( &le[14] );
-	  
-	      left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	      left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	      
-	      left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	      left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	      
-	      left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	      left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	      left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	      
-	      left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	      left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	      
-	      left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	      left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	      
-	      left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	      left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	      left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	      
-	      __m128d x2_0 = _mm_load_pd( &x2[0] );
-	      __m128d x2_2 = _mm_load_pd( &x2[2] );
-	      
-	      __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-	      __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-	      __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-	      __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-	      __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-	      __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-	      __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-	      __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-	      
-	      right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	      right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	      
-	      right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	      right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	      
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	      right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	      
-	      right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	      right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	      
-	      right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	      right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	      
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	      right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-	      
-	      __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	      __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );	  	  
-	      
-	      __m128d EV_t_l0_k0 = EVV[0];
-	      __m128d EV_t_l0_k2 = EVV[1];
-	      __m128d EV_t_l1_k0 = EVV[2];
-	      __m128d EV_t_l1_k2 = EVV[3];
-	      __m128d EV_t_l2_k0 = EVV[4];
-	      __m128d EV_t_l2_k2 = EVV[5];
-	      __m128d EV_t_l3_k0 = EVV[6];
-	      __m128d EV_t_l3_k2 = EVV[7];
-	      
-	      EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	      EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	      
-	      EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	      EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	      
-	      EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	      
-	      EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	      EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	      
-	      EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	      EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	      EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	      
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	 
-	  	  
-	      _mm_store_pd(x3, EV_t_l0_k0);
-	      _mm_store_pd(&x3[2], EV_t_l2_k0);	  	 	   	    
-
-	      x3_ptr += 4;
-	    }
-	}
-      break;
-    case TIP_INNER:      
-      for (i = 0; i < n; i++)
-	{ 
-	  if(isGap(x3_gap, i))
-	    {
-	      if(scaleGap)		   		    
-		addScale += wgt[i];
-	    }
-	  else
-	    {	      
-	      x1 = &(tipVector[4 * tipX1[i]]);
-	      
-	      x2 = x2_ptr;
-	      x3 = x3_ptr;
-
-	      if(isGap(x1_gap, i))
-		le =  &left[maxCats * 16];
-	      else
-		le =  &left[cptr[i] * 16];
-
-	      if(isGap(x2_gap, i))
-		{		 
-		  ri =  &right[maxCats * 16];
-		  x2 = x2_gapColumn;
-		}
-	      else
-		{
-		  ri =  &right[cptr[i] * 16];
-		  x2 = x2_ptr;
-		  x2_ptr += 4;
-		}	  	  	  	  
-
-	      __m128d x1_0 = _mm_load_pd( &x1[0] );
-	      __m128d x1_2 = _mm_load_pd( &x1[2] );
-	      
-	      __m128d left_k0_0 = _mm_load_pd( &le[0] );
-	      __m128d left_k0_2 = _mm_load_pd( &le[2] );
-	      __m128d left_k1_0 = _mm_load_pd( &le[4] );
-	      __m128d left_k1_2 = _mm_load_pd( &le[6] );
-	      __m128d left_k2_0 = _mm_load_pd( &le[8] );
-	      __m128d left_k2_2 = _mm_load_pd( &le[10] );
-	      __m128d left_k3_0 = _mm_load_pd( &le[12] );
-	      __m128d left_k3_2 = _mm_load_pd( &le[14] );
-	      
-	      left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	      left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	      
-	      left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	      left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	      
-	      left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	      left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	      left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	      
-	      left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	      left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	      
-	      left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	      left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	      
-	      left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	      left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	      left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	      
-	      __m128d x2_0 = _mm_load_pd( &x2[0] );
-	      __m128d x2_2 = _mm_load_pd( &x2[2] );
-	      
-	      __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-	      __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-	      __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-	      __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-	      __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-	      __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-	      __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-	      __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-	      
-	      right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	      right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	  
-	      right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	      right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	      
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	      right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	      
-	      right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	      right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	      
-	      right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	      right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	      
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	      right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-	      
-	      __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	      __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-	      
-	      __m128d EV_t_l0_k0 = EVV[0];
-	      __m128d EV_t_l0_k2 = EVV[1];
-	      __m128d EV_t_l1_k0 = EVV[2];
-	      __m128d EV_t_l1_k2 = EVV[3];
-	      __m128d EV_t_l2_k0 = EVV[4];
-	      __m128d EV_t_l2_k2 = EVV[5];
-	      __m128d EV_t_l3_k0 = EVV[6];
-	      __m128d EV_t_l3_k2 = EVV[7];
-	      
-	      
-	      EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	      EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	      
-	      EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	      EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	      
-	      EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	      
-	      EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	      EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	      
-	      EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	      EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	      EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	      
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  
-	      
-	      scale = 1;
-	      
-	      __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
-	      v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	      if(_mm_movemask_pd( v1 ) != 3)
-		scale = 0;
-	      else
-		{
-		  v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
-		  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		  if(_mm_movemask_pd( v1 ) != 3)
-		    scale = 0;
-		}
-	  	  
-	      if(scale)
-		{		      
-		  _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
-		  _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));	      	      
-		  		  
-		  addScale += wgt[i];	  
-		}	
-	      else
-		{
-		  _mm_store_pd(x3, EV_t_l0_k0);
-		  _mm_store_pd(&x3[2], EV_t_l2_k0);
-		}
-
-	      x3_ptr += 4;
-	    }
-	  	  
-	}
-      break;
-    case INNER_INNER:
-      for (i = 0; i < n; i++)
-	{ 
-	  if(isGap(x3_gap, i))
-	    {
-	      if(scaleGap)		   		    
-		addScale += wgt[i];
-	    }
-	  else
-	    {	     
-	      x3 = x3_ptr;
-	  	  
-	      if(isGap(x1_gap, i))
-		{
-		  x1 = x1_gapColumn;
-		  le =  &left[maxCats * 16];
-		}
-	      else
-		{
-		  le =  &left[cptr[i] * 16];
-		  x1 = x1_ptr;
-		  x1_ptr += 4;
-		}
-
-	      if(isGap(x2_gap, i))	
-		{
-		  x2 = x2_gapColumn;
-		  ri =  &right[maxCats * 16];	    
-		}
-	      else
-		{
-		  ri =  &right[cptr[i] * 16];
-		  x2 = x2_ptr;
-		  x2_ptr += 4;
-		}	 	  	  	  
-
-	      __m128d x1_0 = _mm_load_pd( &x1[0] );
-	      __m128d x1_2 = _mm_load_pd( &x1[2] );
-	      
-	      __m128d left_k0_0 = _mm_load_pd( &le[0] );
-	      __m128d left_k0_2 = _mm_load_pd( &le[2] );
-	      __m128d left_k1_0 = _mm_load_pd( &le[4] );
-	      __m128d left_k1_2 = _mm_load_pd( &le[6] );
-	      __m128d left_k2_0 = _mm_load_pd( &le[8] );
-	      __m128d left_k2_2 = _mm_load_pd( &le[10] );
-	      __m128d left_k3_0 = _mm_load_pd( &le[12] );
-	      __m128d left_k3_2 = _mm_load_pd( &le[14] );
-	      
-	      left_k0_0 = _mm_mul_pd(x1_0, left_k0_0);
-	      left_k0_2 = _mm_mul_pd(x1_2, left_k0_2);
-	      
-	      left_k1_0 = _mm_mul_pd(x1_0, left_k1_0);
-	      left_k1_2 = _mm_mul_pd(x1_2, left_k1_2);
-	      
-	      left_k0_0 = _mm_hadd_pd( left_k0_0, left_k0_2 );
-	      left_k1_0 = _mm_hadd_pd( left_k1_0, left_k1_2);
-	      left_k0_0 = _mm_hadd_pd( left_k0_0, left_k1_0);
-	      
-	      left_k2_0 = _mm_mul_pd(x1_0, left_k2_0);
-	      left_k2_2 = _mm_mul_pd(x1_2, left_k2_2);
-	      
-	      left_k3_0 = _mm_mul_pd(x1_0, left_k3_0);
-	      left_k3_2 = _mm_mul_pd(x1_2, left_k3_2);
-	      
-	      left_k2_0 = _mm_hadd_pd( left_k2_0, left_k2_2);
-	      left_k3_0 = _mm_hadd_pd( left_k3_0, left_k3_2);
-	      left_k2_0 = _mm_hadd_pd( left_k2_0, left_k3_0);
-	      
-	      __m128d x2_0 = _mm_load_pd( &x2[0] );
-	      __m128d x2_2 = _mm_load_pd( &x2[2] );
-	      
-	      __m128d right_k0_0 = _mm_load_pd( &ri[0] );
-	      __m128d right_k0_2 = _mm_load_pd( &ri[2] );
-	      __m128d right_k1_0 = _mm_load_pd( &ri[4] );
-	      __m128d right_k1_2 = _mm_load_pd( &ri[6] );
-	      __m128d right_k2_0 = _mm_load_pd( &ri[8] );
-	      __m128d right_k2_2 = _mm_load_pd( &ri[10] );
-	      __m128d right_k3_0 = _mm_load_pd( &ri[12] );
-	      __m128d right_k3_2 = _mm_load_pd( &ri[14] );
-	      
-	      right_k0_0 = _mm_mul_pd( x2_0, right_k0_0);
-	      right_k0_2 = _mm_mul_pd( x2_2, right_k0_2);
-	      
-	      right_k1_0 = _mm_mul_pd( x2_0, right_k1_0);
-	      right_k1_2 = _mm_mul_pd( x2_2, right_k1_2);
-	      
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k0_2);
-	      right_k1_0 = _mm_hadd_pd( right_k1_0, right_k1_2);
-	      right_k0_0 = _mm_hadd_pd( right_k0_0, right_k1_0);
-	      
-	      right_k2_0 = _mm_mul_pd( x2_0, right_k2_0);
-	      right_k2_2 = _mm_mul_pd( x2_2, right_k2_2);
-	      
-	      right_k3_0 = _mm_mul_pd( x2_0, right_k3_0);
-	      right_k3_2 = _mm_mul_pd( x2_2, right_k3_2);
-	      
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k2_2);
-	      right_k3_0 = _mm_hadd_pd( right_k3_0, right_k3_2);
-	      right_k2_0 = _mm_hadd_pd( right_k2_0, right_k3_0);	   
-	      
-	      __m128d x1px2_k0 = _mm_mul_pd( left_k0_0, right_k0_0 );
-	      __m128d x1px2_k2 = _mm_mul_pd( left_k2_0, right_k2_0 );
-	      
-	      __m128d EV_t_l0_k0 = EVV[0];
-	      __m128d EV_t_l0_k2 = EVV[1];
-	      __m128d EV_t_l1_k0 = EVV[2];
-	      __m128d EV_t_l1_k2 = EVV[3];
-	      __m128d EV_t_l2_k0 = EVV[4];
-	      __m128d EV_t_l2_k2 = EVV[5];
-	      __m128d EV_t_l3_k0 = EVV[6];
-	      __m128d EV_t_l3_k2 = EVV[7];
-	      
-	      
-	      EV_t_l0_k0 = _mm_mul_pd( x1px2_k0, EV_t_l0_k0 );
-	      EV_t_l0_k2 = _mm_mul_pd( x1px2_k2, EV_t_l0_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l0_k2 );
-	      
-	      EV_t_l1_k0 = _mm_mul_pd( x1px2_k0, EV_t_l1_k0 );
-	      EV_t_l1_k2 = _mm_mul_pd( x1px2_k2, EV_t_l1_k2 );
-	      
-	      EV_t_l1_k0 = _mm_hadd_pd( EV_t_l1_k0, EV_t_l1_k2 );
-	      EV_t_l0_k0 = _mm_hadd_pd( EV_t_l0_k0, EV_t_l1_k0 );
-	      
-	      EV_t_l2_k0 = _mm_mul_pd( x1px2_k0, EV_t_l2_k0 );
-	      EV_t_l2_k2 = _mm_mul_pd( x1px2_k2, EV_t_l2_k2 );
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l2_k2 );
-	      
-	      EV_t_l3_k0 = _mm_mul_pd( x1px2_k0, EV_t_l3_k0 );
-	      EV_t_l3_k2 = _mm_mul_pd( x1px2_k2, EV_t_l3_k2 );
-	      EV_t_l3_k0 = _mm_hadd_pd( EV_t_l3_k0, EV_t_l3_k2 );
-	      
-	      EV_t_l2_k0 = _mm_hadd_pd( EV_t_l2_k0, EV_t_l3_k0 );	  	 	    		  	 
-	      
-	      scale = 1;
-	      
-	      __m128d v1 = _mm_and_pd(EV_t_l0_k0, absMask.m);
-	      v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	      if(_mm_movemask_pd( v1 ) != 3)
-		scale = 0;
-	      else
-		{
-		  v1 = _mm_and_pd(EV_t_l2_k0, absMask.m);
-		  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		  if(_mm_movemask_pd( v1 ) != 3)
-		    scale = 0;
-		}
-	  	  
-	      if(scale)
-		{		      
-		  _mm_store_pd(&x3[0], _mm_mul_pd(EV_t_l0_k0, sc));
-		  _mm_store_pd(&x3[2], _mm_mul_pd(EV_t_l2_k0, sc));	      	      
-		  	      
-		  addScale += wgt[i];	  
-		}	
-	      else
-		{
-		  _mm_store_pd(x3, EV_t_l0_k0);
-		  _mm_store_pd(&x3[2], EV_t_l2_k0);
-		}
-	     
-	      x3_ptr += 4;
-	    }
-	}
-      break;
-    default:
-      assert(0);
-    }
-
-  
-  *scalerIncrement = addScale;
-}
-
-static void newviewGTRGAMMAPROT_GAPPED_SAVE(int tipCase,
-					    double *x1, double *x2, double *x3, double *extEV, double *tipVector,
-					    unsigned char *tipX1, unsigned char *tipX2,
-					    int n, double *left, double *right, int *wgt, int *scalerIncrement, 
-					    unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,  
-					    double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn
-					    )
-{
-  double  *uX1, *uX2, *v;
-  double x1px2;
-  int  i, j, l, k, scale, addScale = 0,   
-    gapScaling = 0;
-  double 
-    *vl, *vr, *x1v, *x2v,
-    *x1_ptr = x1,
-    *x2_ptr = x2,
-    *x3_ptr = x3;
-
-  
-
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      {
-	double umpX1[1840], umpX2[1840];
-
-	for(i = 0; i < 23; i++)
-	  {
-	    v = &(tipVector[20 * i]);
-
-	    for(k = 0; k < 80; k++)
-	      {
-		double *ll =  &left[k * 20];
-		double *rr =  &right[k * 20];
-		
-		__m128d umpX1v = _mm_setzero_pd();
-		__m128d umpX2v = _mm_setzero_pd();
-
-		for(l = 0; l < 20; l+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
-		    umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l])));					
-		  }
-		
-		umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
-		umpX2v = _mm_hadd_pd(umpX2v, umpX2v);
-		
-		_mm_storel_pd(&umpX1[80 * i + k], umpX1v);
-		_mm_storel_pd(&umpX2[80 * i + k], umpX2v);
-	      }
-	  }
-
-	{
-	  uX1 = &umpX1[1760];
-	  uX2 = &umpX2[1760];
-
-	  for(j = 0; j < 4; j++)
-	    {
-	      v = &x3_gapColumn[j * 20];
-
-	      __m128d zero =  _mm_setzero_pd();
-	      for(k = 0; k < 20; k+=2)		  		    
-		_mm_store_pd(&v[k], zero);
-
-	      for(k = 0; k < 20; k++)
-		{ 
-		  double *eev = &extEV[k * 20];
-		  x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
-		  __m128d x1px2v = _mm_set1_pd(x1px2);
-		  
-		  for(l = 0; l < 20; l+=2)
-		    {
-		      __m128d vv = _mm_load_pd(&v[l]);
-		      __m128d ee = _mm_load_pd(&eev[l]);
-		      
-		      vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
-		      
-		      _mm_store_pd(&v[l], vv);
-		    }
-		}
-	    }	   
-	}	
-
-	for(i = 0; i < n; i++)
-	  {
-	    if(!(x3_gap[i / 32] & mask32[i % 32]))
-	      {
-		uX1 = &umpX1[80 * tipX1[i]];
-		uX2 = &umpX2[80 * tipX2[i]];
-		
-		for(j = 0; j < 4; j++)
-		  {
-		    v = &x3_ptr[j * 20];
-		    
-		    
-		    __m128d zero =  _mm_setzero_pd();
-		    for(k = 0; k < 20; k+=2)		  		    
-		      _mm_store_pd(&v[k], zero);
-		    
-		    for(k = 0; k < 20; k++)
-		      { 
-			double *eev = &extEV[k * 20];
-			x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
-			__m128d x1px2v = _mm_set1_pd(x1px2);
-			
-			for(l = 0; l < 20; l+=2)
-			  {
-			    __m128d vv = _mm_load_pd(&v[l]);
-			    __m128d ee = _mm_load_pd(&eev[l]);
-			    
-			    vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
-			    
-			    _mm_store_pd(&v[l], vv);
-			  }
-		      }
-		  }	   
-		x3_ptr += 80;
-	      }
-	  }
-      }
-      break;
-    case TIP_INNER:
-      {
-	double umpX1[1840], ump_x2[20];
-
-
-	for(i = 0; i < 23; i++)
-	  {
-	    v = &(tipVector[20 * i]);
-
-	    for(k = 0; k < 80; k++)
-	      {
-		double *ll =  &left[k * 20];
-				
-		__m128d umpX1v = _mm_setzero_pd();
-		
-		for(l = 0; l < 20; l+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));		    					
-		  }
-		
-		umpX1v = _mm_hadd_pd(umpX1v, umpX1v);				
-		_mm_storel_pd(&umpX1[80 * i + k], umpX1v);		
-
-	      }
-	  }
-
-	{
-	  uX1 = &umpX1[1760];
-
-	  for(k = 0; k < 4; k++)
-	    {
-	      v = &(x2_gapColumn[k * 20]);
-	       
-	      for(l = 0; l < 20; l++)
-		{		   
-		  double *r =  &right[k * 400 + l * 20];
-		  __m128d ump_x2v = _mm_setzero_pd();	    
-		  
-		  for(j = 0; j < 20; j+= 2)
-		    {
-		      __m128d vv = _mm_load_pd(&v[j]);
-		      __m128d rr = _mm_load_pd(&r[j]);
-		      ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
-		    }
-		  
-		  ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
-		  
-		  _mm_storel_pd(&ump_x2[l], ump_x2v);		   		     
-		}
-
-	      v = &(x3_gapColumn[20 * k]);
-
-	      __m128d zero =  _mm_setzero_pd();
-	      for(l = 0; l < 20; l+=2)		  		    
-		_mm_store_pd(&v[l], zero);
-		  
-	      for(l = 0; l < 20; l++)
-		{
-		  double *eev = &extEV[l * 20];
-		  x1px2 = uX1[k * 20 + l]  * ump_x2[l];
-		  __m128d x1px2v = _mm_set1_pd(x1px2);
-		  
-		  for(j = 0; j < 20; j+=2)
-		    {
-		      __m128d vv = _mm_load_pd(&v[j]);
-		      __m128d ee = _mm_load_pd(&eev[j]);
-		      
-		      vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
-		      
-		      _mm_store_pd(&v[j], vv);
-		    }		     		    
-		}			
-	      
-	    }
-	  
-	  { 
-	    v = x3_gapColumn;
-	    __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	    
-	    scale = 1;
-	    for(l = 0; scale && (l < 80); l += 2)
-	      {
-		__m128d vv = _mm_load_pd(&v[l]);
-		__m128d v1 = _mm_and_pd(vv, absMask.m);
-		v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		if(_mm_movemask_pd( v1 ) != 3)
-		  scale = 0;
-	      }	    	  
-	  }
-
-
-	  if (scale)
-	    {
-	      gapScaling = 1;
-	      __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-	      
-	      for(l = 0; l < 80; l+=2)
-		{
-		  __m128d ex3v = _mm_load_pd(&v[l]);		  
-		  _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		}		   		  	      	    	       
-	    }
-	}
-
-	for (i = 0; i < n; i++)
-	  {	    
-	    if((x3_gap[i / 32] & mask32[i % 32]))
-	       {	       
-		 if(gapScaling)
-		   {		     
-		     addScale += wgt[i];		     
-		   }
-	       }
-	     else
-	       {
-		 uX1 = &umpX1[80 * tipX1[i]];
-
-		  if(x2_gap[i / 32] & mask32[i % 32])
-		   x2v = x2_gapColumn;
-		  else
-		    {
-		      x2v = x2_ptr;
-		      x2_ptr += 80;
-		    }
-		 
-		 for(k = 0; k < 4; k++)
-		   {
-		     v = &(x2v[k * 20]);
-		     
-		     for(l = 0; l < 20; l++)
-		       {		   
-			 double *r =  &right[k * 400 + l * 20];
-			 __m128d ump_x2v = _mm_setzero_pd();	    
-			 
-			 for(j = 0; j < 20; j+= 2)
-			   {
-			     __m128d vv = _mm_load_pd(&v[j]);
-			     __m128d rr = _mm_load_pd(&r[j]);
-			     ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
-			   }
-			 
-			 ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
-			 
-			 _mm_storel_pd(&ump_x2[l], ump_x2v);		   		     
-		       }
-		     
-		     v = &x3_ptr[20 * k];
-		     
-		     __m128d zero =  _mm_setzero_pd();
-		     for(l = 0; l < 20; l+=2)		  		    
-		       _mm_store_pd(&v[l], zero);
-		     
-		     for(l = 0; l < 20; l++)
-		       {
-			 double *eev = &extEV[l * 20];
-			 x1px2 = uX1[k * 20 + l]  * ump_x2[l];
-			 __m128d x1px2v = _mm_set1_pd(x1px2);
-			 
-			 for(j = 0; j < 20; j+=2)
-			   {
-			     __m128d vv = _mm_load_pd(&v[j]);
-			     __m128d ee = _mm_load_pd(&eev[j]);
-			     
-			     vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
-			     
-			     _mm_store_pd(&v[j], vv);
-			   }		     		    
-		       }			
-		     
-		   }
-		 
-		 
-		 { 
-		   v = x3_ptr;
-		   __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-		   
-		   scale = 1;
-		   for(l = 0; scale && (l < 80); l += 2)
-		     {
-		       __m128d vv = _mm_load_pd(&v[l]);
-		       __m128d v1 = _mm_and_pd(vv, absMask.m);
-		       v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		       if(_mm_movemask_pd( v1 ) != 3)
-			 scale = 0;
-		     }	    	  
-		 }
-		 
-		 
-		 if (scale)
-		   {
-		     __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-		     
-		     for(l = 0; l < 80; l+=2)
-		       {
-			 __m128d ex3v = _mm_load_pd(&v[l]);		  
-			 _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		       }		   		  
-		     		    
-		     addScale += wgt[i];		      
-		   }
-		 
-		 x3_ptr += 80;
-	       }
-	  }
-      }
-      break;
-    case INNER_INNER:
-      {
-	for(k = 0; k < 4; k++)
-	   {
-	     vl = &(x1_gapColumn[20 * k]);
-	     vr = &(x2_gapColumn[20 * k]);
-	     v =  &(x3_gapColumn[20 * k]);
-
-	     __m128d zero =  _mm_setzero_pd();
-	     for(l = 0; l < 20; l+=2)		  		    
-	       _mm_store_pd(&v[l], zero);
-	     
-	     for(l = 0; l < 20; l++)
-	       {		 
-		 {
-		   __m128d al = _mm_setzero_pd();
-		   __m128d ar = _mm_setzero_pd();
-
-		   double *ll   = &left[k * 400 + l * 20];
-		   double *rr   = &right[k * 400 + l * 20];
-		   double *EVEV = &extEV[20 * l];
-		   
-		   for(j = 0; j < 20; j+=2)
-		     {
-		       __m128d lv  = _mm_load_pd(&ll[j]);
-		       __m128d rv  = _mm_load_pd(&rr[j]);
-		       __m128d vll = _mm_load_pd(&vl[j]);
-		       __m128d vrr = _mm_load_pd(&vr[j]);
-		       
-		       al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
-		       ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
-		     }  		 
-		       
-		   al = _mm_hadd_pd(al, al);
-		   ar = _mm_hadd_pd(ar, ar);
-		   
-		   al = _mm_mul_pd(al, ar);
-
-		   for(j = 0; j < 20; j+=2)
-		     {
-		       __m128d vv  = _mm_load_pd(&v[j]);
-		       __m128d EVV = _mm_load_pd(&EVEV[j]);
-
-		       vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
-
-		       _mm_store_pd(&v[j], vv);
-		     }		  		   		  
-		 }		 
-
-	       }
-	   }
-	 
-
-	{ 
-	   v = x3_gapColumn;
-	   __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	   
-	   scale = 1;
-	   for(l = 0; scale && (l < 80); l += 2)
-	     {
-	       __m128d vv = _mm_load_pd(&v[l]);
-	       __m128d v1 = _mm_and_pd(vv, absMask.m);
-	       v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	       if(_mm_movemask_pd( v1 ) != 3)
-		 scale = 0;
-	     }	    	  
-	 }
-
-	 if (scale)
-	   {
-	     gapScaling = 1;
-	     __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-	     
-	     for(l = 0; l < 80; l+=2)
-	       {
-		 __m128d ex3v = _mm_load_pd(&v[l]);		  
-		 _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-	       }		   		  
-	     
-	    	  
-	   }
-      }
-
-      for (i = 0; i < n; i++)
-       {
-	  if(x3_gap[i / 32] & mask32[i % 32])
-	   {	     
-	     if(gapScaling)
-	       {		
-		 addScale += wgt[i];			       
-	       }
-	   }
-	 else
-	   {
-	     if(x1_gap[i / 32] & mask32[i % 32])
-	       x1v = x1_gapColumn;
-	     else
-	       {
-		 x1v = x1_ptr;
-		 x1_ptr += 80;
-	       }
-
-	     if(x2_gap[i / 32] & mask32[i % 32])
-	       x2v = x2_gapColumn;
-	     else
-	       {
-		 x2v = x2_ptr;
-		 x2_ptr += 80;
-	       }
-
-	     for(k = 0; k < 4; k++)
-	       {
-		 vl = &(x1v[20 * k]);
-		 vr = &(x2v[20 * k]);
-		 v =  &x3_ptr[20 * k];
-		 		 
-		 __m128d zero =  _mm_setzero_pd();
-		 for(l = 0; l < 20; l+=2)		  		    
-		   _mm_store_pd(&v[l], zero);
-		 		 
-		 for(l = 0; l < 20; l++)
-		   {		 
-		     {
-		       __m128d al = _mm_setzero_pd();
-		       __m128d ar = _mm_setzero_pd();
-		       
-		       double *ll   = &left[k * 400 + l * 20];
-		       double *rr   = &right[k * 400 + l * 20];
-		       double *EVEV = &extEV[20 * l];
-		       
-		       for(j = 0; j < 20; j+=2)
-			 {
-			   __m128d lv  = _mm_load_pd(&ll[j]);
-			   __m128d rv  = _mm_load_pd(&rr[j]);
-			   __m128d vll = _mm_load_pd(&vl[j]);
-			   __m128d vrr = _mm_load_pd(&vr[j]);
-			   
-			   al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
-			   ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
-			 }  		 
-		       
-		       al = _mm_hadd_pd(al, al);
-		       ar = _mm_hadd_pd(ar, ar);
-		       
-		       al = _mm_mul_pd(al, ar);
-		       
-		       for(j = 0; j < 20; j+=2)
-			 {
-			   __m128d vv  = _mm_load_pd(&v[j]);
-			   __m128d EVV = _mm_load_pd(&EVEV[j]);
-			   
-			   vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
-			   
-			   _mm_store_pd(&v[j], vv);
-			 }		  		   		  
-		     }		 
-		     
-		   }
-	       }
-	     
-
-	     
-	     { 
-	       v = x3_ptr;
-	       __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	       
-	       scale = 1;
-	       for(l = 0; scale && (l < 80); l += 2)
-		 {
-		   __m128d vv = _mm_load_pd(&v[l]);
-		   __m128d v1 = _mm_and_pd(vv, absMask.m);
-		   v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		   if(_mm_movemask_pd( v1 ) != 3)
-		     scale = 0;
-		 }	    	  
-	     }
-	     
-	     
-	     if (scale)
-	       {
-		 __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-		 
-		 for(l = 0; l < 80; l+=2)
-		   {
-		     __m128d ex3v = _mm_load_pd(&v[l]);		  
-		     _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		   }		   		  
-		 		
-		 addScale += wgt[i];		 	  
-	       }
-	     x3_ptr += 80;
-	   }
-       }
-      break;
-    default:
-      assert(0);
-    }
-
- 
-  *scalerIncrement = addScale;  
-}
-
-
-
-static void newviewGTRGAMMAPROT(int tipCase,
-				double *x1, double *x2, double *x3, double *extEV, double *tipVector,
-				unsigned char *tipX1, unsigned char *tipX2,
-				int n, double *left, double *right, int *wgt, int *scalerIncrement)
-{
-  double  *uX1, *uX2, *v;
-  double x1px2;
-  int  i, j, l, k, scale, addScale = 0;
-  double *vl, *vr;
-
-
-
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      {
-	double umpX1[1840], umpX2[1840];
-
-	for(i = 0; i < 23; i++)
-	  {
-	    v = &(tipVector[20 * i]);
-
-	    for(k = 0; k < 80; k++)
-	      {
-		double *ll =  &left[k * 20];
-		double *rr =  &right[k * 20];
-		
-		__m128d umpX1v = _mm_setzero_pd();
-		__m128d umpX2v = _mm_setzero_pd();
-
-		for(l = 0; l < 20; l+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
-		    umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l])));					
-		  }
-		
-		umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
-		umpX2v = _mm_hadd_pd(umpX2v, umpX2v);
-		
-		_mm_storel_pd(&umpX1[80 * i + k], umpX1v);
-		_mm_storel_pd(&umpX2[80 * i + k], umpX2v);
-
-	      }
-	  }
-
-	for(i = 0; i < n; i++)
-	  {
-	    uX1 = &umpX1[80 * tipX1[i]];
-	    uX2 = &umpX2[80 * tipX2[i]];
-
-	    for(j = 0; j < 4; j++)
-	      {
-		v = &x3[i * 80 + j * 20];
-
-
-		__m128d zero =  _mm_setzero_pd();
-		for(k = 0; k < 20; k+=2)		  		    
-		  _mm_store_pd(&v[k], zero);
-
-		for(k = 0; k < 20; k++)
-		  { 
-		    double *eev = &extEV[k * 20];
-		    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
-		    __m128d x1px2v = _mm_set1_pd(x1px2);
-
-		    for(l = 0; l < 20; l+=2)
-		      {
-		      	__m128d vv = _mm_load_pd(&v[l]);
-			__m128d ee = _mm_load_pd(&eev[l]);
-
-			vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
-			
-			_mm_store_pd(&v[l], vv);
-		      }
-		  }
-
-
-	      }	   
-	  }
-      }
-      break;
-    case TIP_INNER:
-      {
-	double umpX1[1840], ump_x2[20];
-
-
-	for(i = 0; i < 23; i++)
-	  {
-	    v = &(tipVector[20 * i]);
-
-	    for(k = 0; k < 80; k++)
-	      {
-		double *ll =  &left[k * 20];
-				
-		__m128d umpX1v = _mm_setzero_pd();
-		
-		for(l = 0; l < 20; l+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));		    					
-		  }
-		
-		umpX1v = _mm_hadd_pd(umpX1v, umpX1v);				
-		_mm_storel_pd(&umpX1[80 * i + k], umpX1v);		
-
-
-	      }
-	  }
-
-	for (i = 0; i < n; i++)
-	  {
-	    uX1 = &umpX1[80 * tipX1[i]];
-
-	    for(k = 0; k < 4; k++)
-	      {
-		v = &(x2[80 * i + k * 20]);
-	       
-		for(l = 0; l < 20; l++)
-		  {		   
-		    double *r =  &right[k * 400 + l * 20];
-		    __m128d ump_x2v = _mm_setzero_pd();	    
-		    
-		    for(j = 0; j < 20; j+= 2)
-		      {
-			__m128d vv = _mm_load_pd(&v[j]);
-			__m128d rr = _mm_load_pd(&r[j]);
-			ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
-		      }
-		     
-		    ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
-		    
-		    _mm_storel_pd(&ump_x2[l], ump_x2v);		   		     
-		  }
-
-		v = &(x3[80 * i + 20 * k]);
-
-		__m128d zero =  _mm_setzero_pd();
-		for(l = 0; l < 20; l+=2)		  		    
-		  _mm_store_pd(&v[l], zero);
-		  
-		for(l = 0; l < 20; l++)
-		  {
-		    double *eev = &extEV[l * 20];
-		    x1px2 = uX1[k * 20 + l]  * ump_x2[l];
-		    __m128d x1px2v = _mm_set1_pd(x1px2);
-		  
-		    for(j = 0; j < 20; j+=2)
-		      {
-			__m128d vv = _mm_load_pd(&v[j]);
-			__m128d ee = _mm_load_pd(&eev[j]);
-			
-			vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
-			
-			_mm_store_pd(&v[j], vv);
-		      }		     		    
-		  }			
-
-	      }
-	   
-
-	    { 
-	      v = &(x3[80 * i]);
-	      __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	      
-	      scale = 1;
-	      for(l = 0; scale && (l < 80); l += 2)
-		{
-		  __m128d vv = _mm_load_pd(&v[l]);
-		  __m128d v1 = _mm_and_pd(vv, absMask.m);
-		  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		  if(_mm_movemask_pd( v1 ) != 3)
-		    scale = 0;
-		}	    	  
-	    }
-
-
-	    if (scale)
-	      {
-
-	       __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-	       
-	       for(l = 0; l < 80; l+=2)
-		 {
-		   __m128d ex3v = _mm_load_pd(&v[l]);		  
-		   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		 }		   		  
-
-
-	
-		addScale += wgt[i];
-		       
-	      }
-	  }
-      }
-      break;
-    case INNER_INNER:
-      for (i = 0; i < n; i++)
-       {
-	 for(k = 0; k < 4; k++)
-	   {
-	     vl = &(x1[80 * i + 20 * k]);
-	     vr = &(x2[80 * i + 20 * k]);
-	     v =  &(x3[80 * i + 20 * k]);
-
-
-	     __m128d zero =  _mm_setzero_pd();
-	     for(l = 0; l < 20; l+=2)		  		    
-	       _mm_store_pd(&v[l], zero);
-
-
-	     for(l = 0; l < 20; l++)
-	       {		 
-
-		 {
-		   __m128d al = _mm_setzero_pd();
-		   __m128d ar = _mm_setzero_pd();
-
-		   double *ll   = &left[k * 400 + l * 20];
-		   double *rr   = &right[k * 400 + l * 20];
-		   double *EVEV = &extEV[20 * l];
-		   
-		   for(j = 0; j < 20; j+=2)
-		     {
-		       __m128d lv  = _mm_load_pd(&ll[j]);
-		       __m128d rv  = _mm_load_pd(&rr[j]);
-		       __m128d vll = _mm_load_pd(&vl[j]);
-		       __m128d vrr = _mm_load_pd(&vr[j]);
-		       
-		       al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
-		       ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
-		     }  		 
-		       
-		   al = _mm_hadd_pd(al, al);
-		   ar = _mm_hadd_pd(ar, ar);
-		   
-		   al = _mm_mul_pd(al, ar);
-
-		   for(j = 0; j < 20; j+=2)
-		     {
-		       __m128d vv  = _mm_load_pd(&v[j]);
-		       __m128d EVV = _mm_load_pd(&EVEV[j]);
-
-		       vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
-
-		       _mm_store_pd(&v[j], vv);
-		     }		  		   		  
-		 }		 
-
-	       }
-	   }
-	 
-
-
-	 { 
-	   v = &(x3[80 * i]);
-	   __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	   
-	   scale = 1;
-	   for(l = 0; scale && (l < 80); l += 2)
-	     {
-	       __m128d vv = _mm_load_pd(&v[l]);
-	       __m128d v1 = _mm_and_pd(vv, absMask.m);
-	       v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	       if(_mm_movemask_pd( v1 ) != 3)
-		 scale = 0;
-	     }	    	  
-	 }
-
-
-	 if (scale)
-	   {
-
-	       __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-	       
-	       for(l = 0; l < 80; l+=2)
-		 {
-		   __m128d ex3v = _mm_load_pd(&v[l]);		  
-		   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		 }		   		  
-
-
-	    
-	     addScale += wgt[i];
-	      
-	   }
-       }
-      break;
-    default:
-      assert(0);
-    }
-
-  
-  *scalerIncrement = addScale;
-
-}
-
-
-     
-static void newviewGTRCATPROT(int tipCase, double *extEV,
-			      int *cptr,
-			      double *x1, double *x2, double *x3, double *tipVector,
-			      unsigned char *tipX1, unsigned char *tipX2,
-			      int n, double *left, double *right, int *wgt, int *scalerIncrement )
-{
-  double
-    *le, *ri, *v, *vl, *vr;
-
-  int i, l, j, scale, addScale = 0;
-
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      {
-	for (i = 0; i < n; i++)
-	  {
-	    le = &left[cptr[i] * 400];
-	    ri = &right[cptr[i] * 400];
-
-	    vl = &(tipVector[20 * tipX1[i]]);
-	    vr = &(tipVector[20 * tipX2[i]]);
-	    v  = &x3[20 * i];
-
-	    for(l = 0; l < 20; l+=2)
-	      _mm_store_pd(&v[l], _mm_setzero_pd());	      		
-
-
-	    for(l = 0; l < 20; l++)
-	      {
-		__m128d x1v = _mm_setzero_pd();
-		__m128d x2v = _mm_setzero_pd();	 
-		double 
-		  *ev = &extEV[l * 20],
-		  *lv = &le[l * 20],
-		  *rv = &ri[l * 20];
-
-		for(j = 0; j < 20; j+=2)
-		  {
-		    x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-		    x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
-		  }
-
-		x1v = _mm_hadd_pd(x1v, x1v);
-		x2v = _mm_hadd_pd(x2v, x2v);
-
-		x1v = _mm_mul_pd(x1v, x2v);
-		
-		for(j = 0; j < 20; j+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[j]);
-		    vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-		    _mm_store_pd(&v[j], vv);
-		  }		    
-
-	      }	   
-	  }
-      }
-      break;
-    case TIP_INNER:
-      {
-	for (i = 0; i < n; i++)
-	  {
-	    le = &left[cptr[i] * 400];
-	    ri = &right[cptr[i] * 400];
-
-	    vl = &(tipVector[20 * tipX1[i]]);
-	    vr = &x2[20 * i];
-	    v  = &x3[20 * i];
-
-	    for(l = 0; l < 20; l+=2)
-	      _mm_store_pd(&v[l], _mm_setzero_pd());	      		
-
-	   
-
-	    for(l = 0; l < 20; l++)
-	      {
-
-		__m128d x1v = _mm_setzero_pd();
-		__m128d x2v = _mm_setzero_pd();	
-		double 
-		  *ev = &extEV[l * 20],
-		  *lv = &le[l * 20],
-		  *rv = &ri[l * 20];
-
-		for(j = 0; j < 20; j+=2)
-		  {
-		    x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-		    x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
-		  }
-
-		x1v = _mm_hadd_pd(x1v, x1v);
-		x2v = _mm_hadd_pd(x2v, x2v);
-
-		x1v = _mm_mul_pd(x1v, x2v);
-		
-		for(j = 0; j < 20; j+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[j]);
-		    vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-		    _mm_store_pd(&v[j], vv);
-		  }		    
-
-	      }
-
-	    { 	    
-	      __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	      
-	      scale = 1;
-	      for(l = 0; scale && (l < 20); l += 2)
-		{
-		  __m128d vv = _mm_load_pd(&v[l]);
-		  __m128d v1 = _mm_and_pd(vv, absMask.m);
-		  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		  if(_mm_movemask_pd( v1 ) != 3)
-		    scale = 0;
-		}	    	  
-	    }
-
-
-	    if(scale)
-	      {
-
-		__m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-
-		for(l = 0; l < 20; l+=2)
-		  {
-		    __m128d ex3v = _mm_load_pd(&v[l]);
-		    _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));		    
-		  }
-	
-		addScale += wgt[i];	  
-	      }
-	  }
-      }
-      break;
-    case INNER_INNER:
-      for(i = 0; i < n; i++)
-	{
-	  le = &left[cptr[i] * 400];
-	  ri = &right[cptr[i] * 400];
-
-	  vl = &x1[20 * i];
-	  vr = &x2[20 * i];
-	  v = &x3[20 * i];
-
-
-	    for(l = 0; l < 20; l+=2)
-	      _mm_store_pd(&v[l], _mm_setzero_pd());	      		
-
-	 
-	  for(l = 0; l < 20; l++)
-	    {
-
-		__m128d x1v = _mm_setzero_pd();
-		__m128d x2v = _mm_setzero_pd();
-		double 
-		  *ev = &extEV[l * 20],
-		  *lv = &le[l * 20],
-		  *rv = &ri[l * 20];
-
-
-		for(j = 0; j < 20; j+=2)
-		  {
-		    x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-		    x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
-		  }
-
-		x1v = _mm_hadd_pd(x1v, x1v);
-		x2v = _mm_hadd_pd(x2v, x2v);
-
-		x1v = _mm_mul_pd(x1v, x2v);
-		
-		for(j = 0; j < 20; j+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[j]);
-		    vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-		    _mm_store_pd(&v[j], vv);
-		  }		    
-
-	    }
-
-	    { 	    
-	      __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	      
-	      scale = 1;
-	      for(l = 0; scale && (l < 20); l += 2)
-		{
-		  __m128d vv = _mm_load_pd(&v[l]);
-		  __m128d v1 = _mm_and_pd(vv, absMask.m);
-		  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		  if(_mm_movemask_pd( v1 ) != 3)
-		    scale = 0;
-		}	    	  
-	    }
-   
-
-	   if(scale)
-	     {
-
-	       __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-	       
-	       for(l = 0; l < 20; l+=2)
-		 {
-		   __m128d ex3v = _mm_load_pd(&v[l]);		  
-		   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		 }		   		  
-
-
-	       
-	       addScale += wgt[i];	   
-	     }
-	}
-      break;
-    default:
-      assert(0);
-    }
-  
- 
-  *scalerIncrement = addScale;
-
-}
-
-static void newviewGTRCATPROT_SAVE(int tipCase, double *extEV,
-				   int *cptr,
-				   double *x1, double *x2, double *x3, double *tipVector,
-				   unsigned char *tipX1, unsigned char *tipX2,
-				   int n, double *left, double *right, int *wgt, int *scalerIncrement,
-				   unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap,
-				   double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn, const int maxCats)
-{
-  double
-    *le, 
-    *ri, 
-    *v, 
-    *vl, 
-    *vr,
-    *x1_ptr = x1,
-    *x2_ptr = x2, 
-    *x3_ptr = x3;
-
-  int 
-    i, 
-    l, 
-    j, 
-    scale, 
-    scaleGap = 0,
-    addScale = 0;
-
-  {
-    vl = x1_gapColumn;	      
-    vr = x2_gapColumn;
-    v = x3_gapColumn;
-
-    le = &left[maxCats * 400];
-    ri = &right[maxCats * 400];	  
-
-    for(l = 0; l < 20; l+=2)
-      _mm_store_pd(&v[l], _mm_setzero_pd());	      		
-	 
-    for(l = 0; l < 20; l++)
-      {
-	__m128d x1v = _mm_setzero_pd();
-	__m128d x2v = _mm_setzero_pd();
-	double 
-	  *ev = &extEV[l * 20],
-	  *lv = &le[l * 20],
-	  *rv = &ri[l * 20];
-
-
-	for(j = 0; j < 20; j+=2)
-	  {
-	    x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-	    x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
-	  }
-	
-	x1v = _mm_hadd_pd(x1v, x1v);
-	x2v = _mm_hadd_pd(x2v, x2v);
-	
-	x1v = _mm_mul_pd(x1v, x2v);
-	
-	for(j = 0; j < 20; j+=2)
-	  {
-	    __m128d vv = _mm_load_pd(&v[j]);
-	    vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-	    _mm_store_pd(&v[j], vv);
-	  }		    	
-      }
-    
-    if(tipCase != TIP_TIP)
-      { 	    
-	__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	      
-	scale = 1;
-	for(l = 0; scale && (l < 20); l += 2)
-	  {
-	    __m128d vv = _mm_load_pd(&v[l]);
-	    __m128d v1 = _mm_and_pd(vv, absMask.m);
-	    v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	    if(_mm_movemask_pd( v1 ) != 3)
-	      scale = 0;
-	  }	    	        
-  
-	if(scale)
-	  {
-	    __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-	    
-	    for(l = 0; l < 20; l+=2)
-	      {
-		__m128d ex3v = _mm_load_pd(&v[l]);		  
-		_mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-	      }		   		  
-	       
-	    scaleGap = TRUE;	   
-	  }
-      }
-  }
-  
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      {
-	for (i = 0; i < n; i++)
-	  {
-	    if(noGap(x3_gap, i))
-	      {		
-		vl = &(tipVector[20 * tipX1[i]]);
-		vr = &(tipVector[20 * tipX2[i]]);
-		v  = x3_ptr;
-
-		if(isGap(x1_gap, i))
-		  le =  &left[maxCats * 400];
-		else	  	  
-		  le =  &left[cptr[i] * 400];	  
-	  
-		if(isGap(x2_gap, i))
-		  ri =  &right[maxCats * 400];
-		else	 	  
-		  ri =  &right[cptr[i] * 400];
-
-		for(l = 0; l < 20; l+=2)
-		  _mm_store_pd(&v[l], _mm_setzero_pd());	      		
-		
-		for(l = 0; l < 20; l++)
-		  {
-		    __m128d x1v = _mm_setzero_pd();
-		    __m128d x2v = _mm_setzero_pd();	 
-		    double 
-		      *ev = &extEV[l * 20],
-		      *lv = &le[l * 20],
-		      *rv = &ri[l * 20];
-		    
-		    for(j = 0; j < 20; j+=2)
-		      {
-			x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-			x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
-		      }
-		    
-		    x1v = _mm_hadd_pd(x1v, x1v);
-		    x2v = _mm_hadd_pd(x2v, x2v);
-		    
-		    x1v = _mm_mul_pd(x1v, x2v);
-		    
-		    for(j = 0; j < 20; j+=2)
-		      {
-			__m128d vv = _mm_load_pd(&v[j]);
-			vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-			_mm_store_pd(&v[j], vv);
-		      }		   
-		  }
-
-		x3_ptr += 20;
-
-	      }   
-	  }
-      }
-      break;
-    case TIP_INNER:
-      {
-	for (i = 0; i < n; i++)
-	  {
-	    if(isGap(x3_gap, i))
-	      {
-		if(scaleGap)		   		    
-		  addScale += wgt[i];
-	      }
-	    else
-	      {	 
-		vl = &(tipVector[20 * tipX1[i]]);
-	      
-		vr = x2_ptr;
-		v = x3_ptr;
-
-		if(isGap(x1_gap, i))
-		  le =  &left[maxCats * 400];
-		else
-		  le =  &left[cptr[i] * 400];
-
-		if(isGap(x2_gap, i))
-		  {		 
-		    ri =  &right[maxCats * 400];
-		    vr = x2_gapColumn;
-		  }
-		else
-		  {
-		    ri =  &right[cptr[i] * 400];
-		    vr = x2_ptr;
-		    x2_ptr += 20;
-		  }	  	  	  	  		  
-
-		for(l = 0; l < 20; l+=2)
-		  _mm_store_pd(&v[l], _mm_setzero_pd());	      			   
-
-		for(l = 0; l < 20; l++)
-		  {
-		    __m128d x1v = _mm_setzero_pd();
-		    __m128d x2v = _mm_setzero_pd();	
-		    double 
-		      *ev = &extEV[l * 20],
-		      *lv = &le[l * 20],
-		      *rv = &ri[l * 20];
-		    
-		    for(j = 0; j < 20; j+=2)
-		      {
-			x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-			x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
-		      }
-		    
-		    x1v = _mm_hadd_pd(x1v, x1v);
-		    x2v = _mm_hadd_pd(x2v, x2v);
-		    
-		    x1v = _mm_mul_pd(x1v, x2v);
-		    
-		    for(j = 0; j < 20; j+=2)
-		      {
-			__m128d vv = _mm_load_pd(&v[j]);
-			vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-			_mm_store_pd(&v[j], vv);
-		      }		    
-		  }
-		
-		{ 	    
-		  __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-		  
-		  scale = 1;
-		  for(l = 0; scale && (l < 20); l += 2)
-		    {
-		      __m128d vv = _mm_load_pd(&v[l]);
-		      __m128d v1 = _mm_and_pd(vv, absMask.m);
-		      v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		      if(_mm_movemask_pd( v1 ) != 3)
-			scale = 0;
-		    }	    	  
-		}
-		
-		
-		if(scale)
-		  {
-		    __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-		    
-		    for(l = 0; l < 20; l+=2)
-		      {
-			__m128d ex3v = _mm_load_pd(&v[l]);
-			_mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));		    
-		      }
-		    
-		    addScale += wgt[i];	  
-		  }
-		x3_ptr += 20;
-	      }
-	  }
-      }
-      break;
-    case INNER_INNER:
-      for(i = 0; i < n; i++)
-	{ 
-	  if(isGap(x3_gap, i))
-	    {
-	      if(scaleGap)		   		    
-		addScale += wgt[i];
-	    }
-	  else
-	    {	  	     
-	      v = x3_ptr;
-	  	  
-	      if(isGap(x1_gap, i))
-		{
-		  vl = x1_gapColumn;
-		  le =  &left[maxCats * 400];
-		}
-	      else
-		{
-		  le =  &left[cptr[i] * 400];
-		  vl = x1_ptr;
-		  x1_ptr += 20;
-		}
-
-	      if(isGap(x2_gap, i))	
-		{
-		  vr = x2_gapColumn;
-		  ri =  &right[maxCats * 400];	    
-		}
-	      else
-		{
-		  ri =  &right[cptr[i] * 400];
-		  vr = x2_ptr;
-		  x2_ptr += 20;
-		}	 	  	  	  
-
-	      for(l = 0; l < 20; l+=2)
-		_mm_store_pd(&v[l], _mm_setzero_pd());	      		
-	 
-	      for(l = 0; l < 20; l++)
-		{
-		  __m128d x1v = _mm_setzero_pd();
-		  __m128d x2v = _mm_setzero_pd();
-		  double 
-		    *ev = &extEV[l * 20],
-		    *lv = &le[l * 20],
-		    *rv = &ri[l * 20];
-		  		  
-		  for(j = 0; j < 20; j+=2)
-		    {
-		      x1v = _mm_add_pd(x1v, _mm_mul_pd(_mm_load_pd(&vl[j]), _mm_load_pd(&lv[j])));		    
-		      x2v = _mm_add_pd(x2v, _mm_mul_pd(_mm_load_pd(&vr[j]), _mm_load_pd(&rv[j])));
-		    }
-		  
-		  x1v = _mm_hadd_pd(x1v, x1v);
-		  x2v = _mm_hadd_pd(x2v, x2v);
-		  
-		  x1v = _mm_mul_pd(x1v, x2v);
-		  
-		  for(j = 0; j < 20; j+=2)
-		    {
-		      __m128d vv = _mm_load_pd(&v[j]);
-		      vv = _mm_add_pd(vv, _mm_mul_pd(x1v, _mm_load_pd(&ev[j])));
-		      _mm_store_pd(&v[j], vv);
-		    }		    
-		  
-		}
-	      
-	      { 	    
-		__m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-		
-		scale = 1;
-		for(l = 0; scale && (l < 20); l += 2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    __m128d v1 = _mm_and_pd(vv, absMask.m);
-		    v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		    if(_mm_movemask_pd( v1 ) != 3)
-		      scale = 0;
-		  }	    	  
-	      }
-  
-	      if(scale)
-		{
-		  __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-		  
-		  for(l = 0; l < 20; l+=2)
-		    {
-		      __m128d ex3v = _mm_load_pd(&v[l]);		  
-		      _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		    }		   		  
-		  
-		  addScale += wgt[i];	   
-		}
-	      x3_ptr += 20;
-	    }
-	}
-      break;
-    default:
-      assert(0);
-    }
-  
- 
-  *scalerIncrement = addScale;
-
-}
-
-static void newviewGTRGAMMAPROT_LG4(int tipCase,
-				    double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
-				    int *ex3, unsigned char *tipX1, unsigned char *tipX2,
-				    int n, double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling)
-{
-  double  *uX1, *uX2, *v;
-  double x1px2;
-  int  i, j, l, k, scale, addScale = 0;
-  double *vl, *vr;
-#ifndef __SIM_SSE3
-  double al, ar;
-#endif
-
-
-
-  switch(tipCase)
-    {
-    case TIP_TIP:
-      {
-	double umpX1[1840], umpX2[1840];
-
-	for(i = 0; i < 23; i++)
-	  {
-	   
-
-	    for(k = 0; k < 80; k++)
-	      {
-		
-		v = &(tipVector[k / 20][20 * i]);
-#ifdef __SIM_SSE3
-		double *ll =  &left[k * 20];
-		double *rr =  &right[k * 20];
-		
-		__m128d umpX1v = _mm_setzero_pd();
-		__m128d umpX2v = _mm_setzero_pd();
-
-		for(l = 0; l < 20; l+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));
-		    umpX2v = _mm_add_pd(umpX2v, _mm_mul_pd(vv, _mm_load_pd(&rr[l])));					
-		  }
-		
-		umpX1v = _mm_hadd_pd(umpX1v, umpX1v);
-		umpX2v = _mm_hadd_pd(umpX2v, umpX2v);
-		
-		_mm_storel_pd(&umpX1[80 * i + k], umpX1v);
-		_mm_storel_pd(&umpX2[80 * i + k], umpX2v);
-#else
-		umpX1[80 * i + k] = 0.0;
-		umpX2[80 * i + k] = 0.0;
-
-		for(l = 0; l < 20; l++)
-		  {
-		    umpX1[80 * i + k] +=  v[l] *  left[k * 20 + l];
-		    umpX2[80 * i + k] +=  v[l] * right[k * 20 + l];
-		  }
-#endif
-	      }
-	  }
-
-	for(i = 0; i < n; i++)
-	  {
-	    uX1 = &umpX1[80 * tipX1[i]];
-	    uX2 = &umpX2[80 * tipX2[i]];
-
-	    for(j = 0; j < 4; j++)
-	      {
-		v = &x3[i * 80 + j * 20];
-
-#ifdef __SIM_SSE3
-		__m128d zero =  _mm_setzero_pd();
-		for(k = 0; k < 20; k+=2)		  		    
-		  _mm_store_pd(&v[k], zero);
-
-		for(k = 0; k < 20; k++)
-		  { 
-		    double *eev = &extEV[j][k * 20];
-		    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
-		    __m128d x1px2v = _mm_set1_pd(x1px2);
-
-		    for(l = 0; l < 20; l+=2)
-		      {
-		      	__m128d vv = _mm_load_pd(&v[l]);
-			__m128d ee = _mm_load_pd(&eev[l]);
-
-			vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
-			
-			_mm_store_pd(&v[l], vv);
-		      }
-		  }
-
-#else
-
-		for(k = 0; k < 20; k++)
-		  v[k] = 0.0;
-
-		for(k = 0; k < 20; k++)
-		  {		   
-		    x1px2 = uX1[j * 20 + k] * uX2[j * 20 + k];
-		   
-		    for(l = 0; l < 20; l++)		      					
-		      v[l] += x1px2 * extEV[j][20 * k + l];		     
-		  }
-#endif
-	      }	   
-	  }
-      }
-      break;
-    case TIP_INNER:
-      {
-	double umpX1[1840], ump_x2[20];
-
-
-	for(i = 0; i < 23; i++)
-	  {
-	   
-
-	    for(k = 0; k < 80; k++)
-	      { 
-		v = &(tipVector[k / 20][20 * i]);
-#ifdef __SIM_SSE3
-		double *ll =  &left[k * 20];
-				
-		__m128d umpX1v = _mm_setzero_pd();
-		
-		for(l = 0; l < 20; l+=2)
-		  {
-		    __m128d vv = _mm_load_pd(&v[l]);
-		    umpX1v = _mm_add_pd(umpX1v, _mm_mul_pd(vv, _mm_load_pd(&ll[l])));		    					
-		  }
-		
-		umpX1v = _mm_hadd_pd(umpX1v, umpX1v);				
-		_mm_storel_pd(&umpX1[80 * i + k], umpX1v);		
-#else	    
-		umpX1[80 * i + k] = 0.0;
-
-		for(l = 0; l < 20; l++)
-		  umpX1[80 * i + k] +=  v[l] * left[k * 20 + l];
-#endif
-
-	      }
-	  }
-
-	for (i = 0; i < n; i++)
-	  {
-	    uX1 = &umpX1[80 * tipX1[i]];
-
-	    for(k = 0; k < 4; k++)
-	      {
-		v = &(x2[80 * i + k * 20]);
-#ifdef __SIM_SSE3	       
-		for(l = 0; l < 20; l++)
-		  {		   
-		    double *r =  &right[k * 400 + l * 20];
-		    __m128d ump_x2v = _mm_setzero_pd();	    
-		    
-		    for(j = 0; j < 20; j+= 2)
-		      {
-			__m128d vv = _mm_load_pd(&v[j]);
-			__m128d rr = _mm_load_pd(&r[j]);
-			ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(vv, rr));
-		      }
-		     
-		    ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v);
-		    
-		    _mm_storel_pd(&ump_x2[l], ump_x2v);		   		     
-		  }
-
-		v = &(x3[80 * i + 20 * k]);
-
-		__m128d zero =  _mm_setzero_pd();
-		for(l = 0; l < 20; l+=2)		  		    
-		  _mm_store_pd(&v[l], zero);
-		  
-		for(l = 0; l < 20; l++)
-		  {
-		    double *eev = &extEV[k][l * 20];
-		    x1px2 = uX1[k * 20 + l]  * ump_x2[l];
-		    __m128d x1px2v = _mm_set1_pd(x1px2);
-		  
-		    for(j = 0; j < 20; j+=2)
-		      {
-			__m128d vv = _mm_load_pd(&v[j]);
-			__m128d ee = _mm_load_pd(&eev[j]);
-			
-			vv = _mm_add_pd(vv, _mm_mul_pd(x1px2v,ee));
-			
-			_mm_store_pd(&v[j], vv);
-		      }		     		    
-		  }			
-#else
-		for(l = 0; l < 20; l++)
-		  {
-		    ump_x2[l] = 0.0;
-
-		    for(j = 0; j < 20; j++)
-		      ump_x2[l] += v[j] * right[k * 400 + l * 20 + j];
-		  }
-
-		v = &(x3[80 * i + 20 * k]);
-
-		for(l = 0; l < 20; l++)
-		  v[l] = 0;
-
-		for(l = 0; l < 20; l++)
-		  {
-		    x1px2 = uX1[k * 20 + l]  * ump_x2[l];
-		    for(j = 0; j < 20; j++)
-		      v[j] += x1px2 * extEV[k][l * 20  + j];
-		  }
-#endif
-	      }
-	   
-#ifdef __SIM_SSE3
-	    { 
-	      v = &(x3[80 * i]);
-	      __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	      
-	      scale = 1;
-	      for(l = 0; scale && (l < 80); l += 2)
-		{
-		  __m128d vv = _mm_load_pd(&v[l]);
-		  __m128d v1 = _mm_and_pd(vv, absMask.m);
-		  v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-		  if(_mm_movemask_pd( v1 ) != 3)
-		    scale = 0;
-		}	    	  
-	    }
-#else
-	    v = &x3[80 * i];
-	    scale = 1;
-	    for(l = 0; scale && (l < 80); l++)
-	      scale = (ABS(v[l]) <  minlikelihood);
-#endif
-
-	    if (scale)
-	      {
-#ifdef __SIM_SSE3
-	       __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-	       
-	       for(l = 0; l < 80; l+=2)
-		 {
-		   __m128d ex3v = _mm_load_pd(&v[l]);		  
-		   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		 }		   		  
-#else
-		for(l = 0; l < 80; l++)
-		  v[l] *= twotothe256;
-#endif
-
-		if(useFastScaling)
-		  addScale += wgt[i];
-		else
-		  ex3[i]  += 1;	       
-	      }
-	  }
-      }
-      break;
-    case INNER_INNER:
-      for (i = 0; i < n; i++)
-       {
-	 for(k = 0; k < 4; k++)
-	   {
-	     vl = &(x1[80 * i + 20 * k]);
-	     vr = &(x2[80 * i + 20 * k]);
-	     v =  &(x3[80 * i + 20 * k]);
-
-#ifdef __SIM_SSE3
-	     __m128d zero =  _mm_setzero_pd();
-	     for(l = 0; l < 20; l+=2)		  		    
-	       _mm_store_pd(&v[l], zero);
-#else
-	     for(l = 0; l < 20; l++)
-	       v[l] = 0;
-#endif
-
-	     for(l = 0; l < 20; l++)
-	       {		 
-#ifdef __SIM_SSE3
-		 {
-		   __m128d al = _mm_setzero_pd();
-		   __m128d ar = _mm_setzero_pd();
-
-		   double *ll   = &left[k * 400 + l * 20];
-		   double *rr   = &right[k * 400 + l * 20];
-		   double *EVEV = &extEV[k][20 * l];
-		   
-		   for(j = 0; j < 20; j+=2)
-		     {
-		       __m128d lv  = _mm_load_pd(&ll[j]);
-		       __m128d rv  = _mm_load_pd(&rr[j]);
-		       __m128d vll = _mm_load_pd(&vl[j]);
-		       __m128d vrr = _mm_load_pd(&vr[j]);
-		       
-		       al = _mm_add_pd(al, _mm_mul_pd(vll, lv));
-		       ar = _mm_add_pd(ar, _mm_mul_pd(vrr, rv));
-		     }  		 
-		       
-		   al = _mm_hadd_pd(al, al);
-		   ar = _mm_hadd_pd(ar, ar);
-		   
-		   al = _mm_mul_pd(al, ar);
-
-		   for(j = 0; j < 20; j+=2)
-		     {
-		       __m128d vv  = _mm_load_pd(&v[j]);
-		       __m128d EVV = _mm_load_pd(&EVEV[j]);
-
-		       vv = _mm_add_pd(vv, _mm_mul_pd(al, EVV));
-
-		       _mm_store_pd(&v[j], vv);
-		     }		  		   		  
-		 }		 
-#else
-		 al = 0.0;
-		 ar = 0.0;
-
-		 for(j = 0; j < 20; j++)
-		   {
-		     al += vl[j] * left[k * 400 + l * 20 + j];
-		     ar += vr[j] * right[k * 400 + l * 20 + j];
-		   }
-
-		 x1px2 = al * ar;
-
-		 for(j = 0; j < 20; j++)
-		   v[j] += x1px2 * extEV[k][20 * l + j];
-#endif
-	       }
-	   }
-	 
-
-#ifdef __SIM_SSE3
-	 { 
-	   v = &(x3[80 * i]);
-	   __m128d minlikelihood_sse = _mm_set1_pd( minlikelihood );
-	   
-	   scale = 1;
-	   for(l = 0; scale && (l < 80); l += 2)
-	     {
-	       __m128d vv = _mm_load_pd(&v[l]);
-	       __m128d v1 = _mm_and_pd(vv, absMask.m);
-	       v1 = _mm_cmplt_pd(v1,  minlikelihood_sse);
-	       if(_mm_movemask_pd( v1 ) != 3)
-		 scale = 0;
-	     }	    	  
-	 }
-#else
-	 v = &(x3[80 * i]);
-	 scale = 1;
-	 for(l = 0; scale && (l < 80); l++)
-	   scale = ((ABS(v[l]) <  minlikelihood));
-#endif
-
-	 if (scale)
-	   {
-#ifdef __SIM_SSE3
-	       __m128d twoto = _mm_set_pd(twotothe256, twotothe256);
-	       
-	       for(l = 0; l < 80; l+=2)
-		 {
-		   __m128d ex3v = _mm_load_pd(&v[l]);		  
-		   _mm_store_pd(&v[l], _mm_mul_pd(ex3v,twoto));	
-		 }		   		  
-#else	     
-	     for(l = 0; l < 80; l++)
-	       v[l] *= twotothe256;
-#endif
-
-	     if(useFastScaling)
-	       addScale += wgt[i];
-	     else
-	       ex3[i]  += 1;	  
-	   }
-       }
-      break;
-    default:
-      assert(0);
-    }
-
-  if(useFastScaling)
-    *scalerIncrement = addScale;
-
-}
-
-#endif
-
-#ifdef _OPTIMIZED_FUNCTIONS
-
 /*** BINARY DATA functions *****/
 
 static void newviewGTRCAT_BINARY( int tipCase,  double *EV,  int *cptr,
@@ -6210,9 +1322,3 @@
 
 
 /**** BINARY DATA functions end ****/
-
-
-
-#endif
-
-
--- examl.orig/parser/axml.c
+++ examl/parser/axml.c
@@ -60,7 +60,10 @@
 
 #endif
 
-#if ! (defined(__ppc) || defined(__powerpc__) || defined(PPC))
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/sse.h"
+
+#if defined(SIMDE_X86_SSE_NATIVE)
 #include <xmmintrin.h>
 /*
   special bug fix, enforces denormalized numbers to be flushed to zero,
--- examl.orig/examl/Makefile.OMP.AVX.gcc
+++ examl/examl/Makefile.OMP.AVX.gcc
@@ -3,28 +3,30 @@
 
 CC = mpicc
 
-COMMON_FLAGS = -D__SIM_SSE3 -D__AVX -D_USE_OMP -fopenmp -D_OPTIMIZED_FUNCTIONS -msse3 -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE  -Wall #  -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast    -Wno-unused-parameter
+COMMON_FLAGS = -D_USE_OMP -fopenmp -D_OPTIMIZED_FUNCTIONS -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE  -Wall #  -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast    -Wno-unused-parameter
 
 OPT_FLAG_1 = -O1
 OPT_FLAG_2 = -O2
 
 CFLAGS += $(COMMON_FLAGS) $(OPT_FLAG_2)
 
-LIBRARIES = -lm -mavx -fopenmp
+LIBRARIES = -lm -fopenmp
 
 RM = rm -f
 
 objs    = axml.o optimizeModel.o trash.o searchAlgo.o topologies.o treeIO.o models.o evaluatePartialGenericSpecial.o evaluateGenericSpecial.o newviewGenericSpecial.o makenewzGenericSpecial.o bipartitionList.o restartHashTable.o avxLikelihood.o byteFile.o partitionAssignment.o communication.o quartets.o
 
-all : clean examl-OMP-AVX
+SFX :=
+
+all: examl-OMP$(SFX)
 
 GLOBAL_DEPS = axml.h globalVariables.h ../versionHeader/version.h
 
-examl-OMP-AVX : $(objs)
-	$(CC) -o examl-OMP-AVX $(objs) $(LIBRARIES) $(LDFLAGS)
+examl-OMP$(SFX) : $(objs)
+	$(CC) -o $@ $(objs) $(LIBRARIES) $(LDFLAGS)
 
 avxLikelihood.o : avxLikelihood.c $(GLOBAL_DEPS)
-	$(CC) $(CFLAGS) -mavx -c -o avxLikelihood.o avxLikelihood.c
+	$(CC) $(CFLAGS) -c -o avxLikelihood.o avxLikelihood.c
 
 models.o : models.c $(GLOBAL_DEPS)
 	 $(CC) $(COMMON_FLAGS) $(OPT_FLAG_1) -c -o models.o models.c
@@ -49,6 +51,6 @@
 
 
 clean : 
-	$(RM) *.o examl-OMP-AVX
+	$(RM) *.o examl-OMP$(SFX)
 
-dev : examl-OMP-AVX
+dev : examl-OMP$(SFX)
--- examl.orig/parser/Makefile.SSE3.gcc
+++ examl/parser/Makefile.SSE3.gcc
@@ -2,7 +2,7 @@
 # Makefile cleanup October 2006, Courtesy of Peter Cordes <peter@cordes.ca>
 
 CC = gcc 
-CFLAGS += -fomit-frame-pointer -O2 -D_GNU_SOURCE -msse -funroll-loops  -Wall -Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes   -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast
+CFLAGS += -fomit-frame-pointer -O2 -D_GNU_SOURCE -funroll-loops  -Wall -Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes   -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast
 
 
 LIBRARIES = -lm
@@ -11,19 +11,21 @@
 
 objs    = axml.o parsePartitions.o
 
-all : clean parse-examl
+SFX :=
+
+all : clean parse-examl$(SFX)
 
 GLOBAL_DEPS = axml.h globalVariables.h ../versionHeader/version.h 
 
-parse-examl : $(objs)
-	$(CC) -o parse-examl $(objs) $(LIBRARIES) $(LDFLAGS)
+parse-examl$(SFX) : $(objs)
+	$(CC) -o $@ $(objs) $(LIBRARIES) $(LDFLAGS)
 
 
 axml.o : axml.c $(GLOBAL_DEPS)
 parsePartitions.o : parsePartitions.c $(GLOBAL_DEPS)
 
 clean : 
-	$(RM) *.o parse-examl
+	$(RM) *.o parse-examl$(SFX)
 
 
 dev : parse-examl
--- examl.orig/examl/axml.c
+++ examl/examl/axml.c
@@ -53,7 +53,8 @@
 #include <mpi.h>
 
 #if ! (defined(__ppc) || defined(__powerpc__) || defined(PPC))
-#include <xmmintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/sse.h"
 /*
   special bug fix, enforces denormalized numbers to be flushed to zero,
   without this program is a tiny bit faster though.
@@ -2591,7 +2592,8 @@
        substantial run-time differences for vectors of equal length.
     */
     
-#if ! (defined(__ppc) || defined(__powerpc__) || defined(PPC))
+#if defined(SIMDE_X86_SSE_NATIVE)
+# include <xmmintrin.h>
     _mm_setcsr( _mm_getcsr() | _MM_FLUSH_ZERO_ON);
 #endif   
 
--- examl.orig/examl/Makefile.AVX.gcc
+++ examl/examl/Makefile.AVX.gcc
@@ -3,28 +3,30 @@
 
 CC = mpicc
 
-COMMON_FLAGS = -D__SIM_SSE3 -D__AVX -D_OPTIMIZED_FUNCTIONS -msse3 -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE #-Wall   -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast    -Wno-unused-parameter
+COMMON_FLAGS = -D_OPTIMIZED_FUNCTIONS -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE #-Wall   -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast    -Wno-unused-parameter
 
 OPT_FLAG_1 = -O1
 OPT_FLAG_2 = -O2
 
 CFLAGS += $(COMMON_FLAGS) $(OPT_FLAG_2)
 
-LIBRARIES = -lm -mavx
+LIBRARIES = -lm
 
 RM = rm -f
 
 objs    = axml.o optimizeModel.o trash.o searchAlgo.o topologies.o treeIO.o models.o evaluatePartialGenericSpecial.o evaluateGenericSpecial.o newviewGenericSpecial.o makenewzGenericSpecial.o bipartitionList.o restartHashTable.o avxLikelihood.o byteFile.o partitionAssignment.o communication.o quartets.o
 
-all : clean examl-AVX
+SFX :=
+
+all : clean examl$(SFX)
 
 GLOBAL_DEPS = axml.h globalVariables.h ../versionHeader/version.h
 
-examl-AVX : $(objs)
-	$(CC) -o examl-AVX $(objs) $(LIBRARIES) $(LDFLAGS)
+examl$(SFX) : $(objs)
+	$(CC) -o $@ $(objs) $(LIBRARIES) $(LDFLAGS)
 
 avxLikelihood.o : avxLikelihood.c $(GLOBAL_DEPS)
-	$(CC) $(CFLAGS) $(CPPFLAGS) -mavx -c -o avxLikelihood.o avxLikelihood.c
+	$(CC) $(CFLAGS) $(CPPFLAGS) -c -o avxLikelihood.o avxLikelihood.c
 
 models.o : models.c $(GLOBAL_DEPS)
 	$(CC) $(CFLAGS) $(COMMON_FLAGS) $(OPT_FLAG_1) $(CPPFLAGS) -c -o models.o models.c
@@ -49,6 +51,6 @@
 
 
 clean : 
-	$(RM) *.o examl-AVX
+	$(RM) *.o examl$(SFX)
 
-dev : examl-AVX
+dev : examl$(SFX)
--- examl.orig/examl/Makefile.KNL.icc
+++ examl/examl/Makefile.KNL.icc
@@ -7,7 +7,7 @@
 
 KNLFLAGS=-xMIC-AVX512 -fma -align -finline-functions -D__MIC_NATIVE -qopenmp -D_USE_OMP
 
-COMMON_FLAGS = -std=c99 -D__SIM_SSE3 -D_OPTIMIZED_FUNCTIONS -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE  $(KNLFLAGS) # -Wall   -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast    -Wno-unused-parameter
+COMMON_FLAGS = -std=c99 -D_OPTIMIZED_FUNCTIONS -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE  $(KNLFLAGS) # -Wall   -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast    -Wno-unused-parameter
 
 OPT_FLAG_1 = -O1
 OPT_FLAG_2 = -O2
--- examl.orig/examl/Makefile.MIC.icc
+++ examl/examl/Makefile.MIC.icc
@@ -4,7 +4,7 @@
 CC = mpicc
 
 MICFLAGS = -D__MIC_NATIVE -mmic -qopt-streaming-cache-evict=0 -qopenmp -D_USE_OMP #-D_PROFILE_MPI
-COMMON_FLAGS = -std=c99 -D__SIM_SSE3 -D_OPTIMIZED_FUNCTIONS -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE  $(MICFLAGS) # -Wall   -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast    -Wno-unused-parameter
+COMMON_FLAGS = -std=c99 -D_OPTIMIZED_FUNCTIONS -D_GNU_SOURCE -fomit-frame-pointer -funroll-loops -D_USE_ALLREDUCE  $(MICFLAGS) # -Wall   -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast    -Wno-unused-parameter
 
 OPT_FLAG_1 = -O1
 OPT_FLAG_2 = -O2
--- examl.orig/examl/Makefile.OMP.SSE3.gcc
+++ examl/examl/Makefile.OMP.SSE3.gcc
@@ -3,7 +3,7 @@
 
 CC = mpicc
 
-COMMON_FLAGS = -D_USE_OMP -fopenmp -D_GNU_SOURCE -D__SIM_SSE3  -msse3 -fomit-frame-pointer -funroll-loops -D_OPTIMIZED_FUNCTIONS -D_USE_ALLREDUCE -Wall #-Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter
+COMMON_FLAGS = -D_USE_OMP -fopenmp -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D_OPTIMIZED_FUNCTIONS -D_USE_ALLREDUCE -Wall #-Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter
 
 OPT_FLAG_1 = -O1 
 OPT_FLAG_2 = -O2
--- examl.orig/examl/Makefile.SSE3.gcc
+++ examl/examl/Makefile.SSE3.gcc
@@ -4,7 +4,7 @@
 CC = mpicc
 
 
-COMMON_FLAGS = -D_GNU_SOURCE -D__SIM_SSE3  -msse3 -fomit-frame-pointer -funroll-loops -D_OPTIMIZED_FUNCTIONS -D_USE_ALLREDUCE #-Wall -Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter
+COMMON_FLAGS = -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D_OPTIMIZED_FUNCTIONS -D_USE_ALLREDUCE #-Wall -Wunused-parameter -Wredundant-decls  -Wreturn-type  -Wswitch-default -Wunused-value -Wimplicit  -Wimplicit-function-declaration  -Wimplicit-int -Wimport  -Wunused  -Wunused-function  -Wunused-label -Wno-int-to-pointer-cast -Wbad-function-cast  -Wmissing-declarations -Wmissing-prototypes  -Wnested-externs  -Wold-style-definition -Wstrict-prototypes -Wpointer-sign -Wextra -Wredundant-decls -Wunused -Wunused-function -Wunused-parameter -Wunused-value  -Wunused-variable -Wformat  -Wformat-nonliteral -Wparentheses -Wsequence-point -Wuninitialized -Wundef -Wbad-function-cast -Wno-unused-parameter
 
 OPT_FLAG_1 = -O1 
 OPT_FLAG_2 = -O2
--- examl.orig/examl/axml.h
+++ examl/examl/axml.h
@@ -42,11 +42,8 @@
 #ifdef __MIC_NATIVE
 #define BYTE_ALIGNMENT 64
 #define VECTOR_PADDING 8
-#elif defined __AVX
-#define BYTE_ALIGNMENT 32
-#define VECTOR_PADDING 1
 #else
-#define BYTE_ALIGNMENT 16
+#define BYTE_ALIGNMENT 32
 #define VECTOR_PADDING 1
 #endif
 
@@ -1337,8 +1334,6 @@
 extern void myBinFwrite(void *ptr, size_t size, size_t nmemb, FILE *byteFile);
 extern void myBinFread(void *ptr, size_t size, size_t nmemb, FILE *byteFile);
 
-#ifdef __AVX
-
 extern void newviewGTRGAMMAPROT_AVX_LG4(int tipCase,
 					double *x1, double *x2, double *x3, double *extEV[4], double *tipVector[4],
 					int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, 
@@ -1401,7 +1396,6 @@
 					 double *left, double *right, int *wgt, int *scalerIncrement, const boolean useFastScaling,
 					 unsigned int *x1_gap, unsigned int *x2_gap, unsigned int *x3_gap, 
 					 double *x1_gapColumn, double *x2_gapColumn, double *x3_gapColumn); 
-#endif
 
 
 
--- examl.orig/parser/axml.h
+++ examl/parser/axml.h
@@ -34,11 +34,7 @@
 #include "../versionHeader/version.h"
 
 
-#ifdef __AVX
 #define BYTE_ALIGNMENT 32
-#else
-#define BYTE_ALIGNMENT 16
-#endif
 
 
 
