/*
 * \file	portC64xplustoSC3900.h
 *
 * \brief	Texas Instruments C64x+ keywords and intrinsics emulation
 *		on Freescale PA devices
 *
 * \version	1.0
 *
 * \date	12/14/2011
 *
 * \copyright	Freescale Semiconductor, Inc 2008-2011
******************************************************************************/

#ifndef _PORT_TI_TO_PA
#define _PORT_TI_TO_PA


/******************************************************************************
*   Data Type Mappings
*
*   Type       TI     *  PA
*   ******************************
*   char       8 bit  *  8
*   short      16     *  16
*   int        32     *  32
*   long       40     *  32
*   long long  64     *  64
*   float      32     *  32
*   double     64     *  64
******************************************************************************/

/******************************************************************************
 KEYWORDS
******************************************************************************/

/* TI-specific qualifiers: defined as empty macros so that declarations
   written with them expand to plain C. */
#define cregister
#define interrupt
#define near
/* _nassert(expr) forwards its argument unchanged to cw_assert(), which is
   not defined in this header and must be provided elsewhere. */
#define _nassert(src)               (cw_assert(src))

/******************************************************************************
 START OF INTRINSIC MAPPING 
******************************************************************************/

/* helper code */

/* Saturation limits of the signed 32-bit and signed 16-bit ranges.
   Each expansion is fully parenthesized so it can be used inside any
   larger expression. */
#define MAX_32			((int)0x7fffffffL)
#define MIN_32			((int)0x80000000L)
#define MAX_16			((short)0x7fff)
#define MIN_16			((short)0x8000)

/* 16-bit half-word helpers for a 32-bit value.
 *
 *   __l_put_lsb(x)  x truncated to a short (value for the low half-word)
 *   __l_get_lsb(x)  low  16 bits of x, as a short
 *   __l_put_msb(x)  low  16 bits of x moved to bits 16..31 (unsigned result)
 *   __l_get_msb(x)  bits 16..31 of x, as a short
 *
 * Every use of the argument is parenthesized so that an argument such as
 * `a | b` is shifted/masked as a whole.  __l_put_msb shifts an unsigned
 * value because left-shifting a negative signed value is undefined.
 */
#define __l_put_lsb(x)                ((short) (x))
#define __l_get_lsb(x)                ((short) ((x) & 0xffff))
#define __l_put_msb(x)                ((((unsigned int) (x)) << 16) & 0xFFFF0000u)
#define __l_get_msb(x)                ((short) ((x) >> 16))


/* brief	Constants used for macro expansion */

/* M1: bits 0..31 set (all ones in a 32-bit word). */
#define M1 0xffffffffUL
/* M2: bits 16..31 set (upper half-word mask). */
#define M2 0xffff0000UL
/* M3: bits 0..15 set (lower half-word mask). */
#define M3 0x0000ffffUL
/* M4: bits 5..9 set.  NOTE(review): presumably selects a 5-bit field packed
   above another 5-bit field -- not used in this part of the file; confirm. */
#define M4 0x3e0
/* M5: bit 31 only (most significant bit of a 32-bit word). */
#define M5 0x80000000UL


/*  set

    Sets the specified bit field in src2 to all 1s and returns the result.
    The field runs from bit csta (lowest bit of the field) up to bit cstb
    (highest bit of the field), inclusive; bit 0 is the least significant.

    Parameters:
      src2  value in which the field is set
      csta  index of the lowest bit of the field,  0..31
      cstb  index of the highest bit of the field, 0..31, cstb >= csta

    Returns src2 with bits csta..cstb forced to 1.

    Notes:
    csta and cstb are typically not compile-time constants here, so the mask
    is built with run-time shifts.  When they are known constants the plain
    C version is probably the better choice.

    In the C version a field with cstb > 31 or csta > cstb is treated as
    empty and src2 is returned unchanged: the shift counts would otherwise
    reach 32 or more, which is undefined behaviour in C.  The _OPT_ version
    performs no such check.  */


__attribute__ ((noinline)) static unsigned _set(unsigned src2, unsigned csta, unsigned cstb) {

#ifdef _OPT_
  unsigned int temp,temp2,temp3,temp5,temp6;

   asm ("li %0,-1\n\t"        : "=r" (temp) );  // load -1 (all ones)
   asm ("subfic %0,%1,31\n\t" : "=r" (temp6) : "r" (cstb) ); // 31-cstb
   asm ("slw %0,%1,%2\n\t"    : "=r" (temp2) : "r" (temp), "r" (temp6)); // temp = temp << (31-cstb)
   asm ("addc %0,%0,%1\n\t"   : "+r" (temp6) :"r" (csta) ); // 31-cstb+csta
   asm ("srw %0,%1,%2\n\t"    : "=r" (temp3) :"r" (temp2), "r" (temp6) ); // >> (31-cstb+csta)
   asm ("slw %0,%1,%2\n\t"    : "=r" (temp5) :"r" (temp3), "r" (csta) ); // << csta
   asm ("or %0,%0,%1\n\t"    : "+r" (src2)  : "r" (temp5) ); // OR the mask into src2

  return (src2);

#else

  unsigned mask;

  /* Empty or out-of-range field: nothing to set (avoids shifts >= 32). */
  if ((cstb > 31) || (csta > cstb))
    return (src2);

  /* Drop the bits above cstb, then the bits below csta, then move the
     remaining run of ones back to position csta. */
  mask = ((0xffffffffu << (31 - cstb)) >> (31 - cstb + csta)) << csta;

  return (mask | src2);

#endif /* _OPT_ */
}


/*  clr

    Clears the specified bit field in src2 to all 0s and returns the result.
    The field runs from bit csta (lowest bit of the field) up to bit cstb
    (highest bit of the field), inclusive; bit 0 is the least significant.

    Parameters:
      src2  value in which the field is cleared
      csta  index of the lowest bit of the field,  0..31
      cstb  index of the highest bit of the field, 0..31, cstb >= csta

    Returns src2 with bits csta..cstb forced to 0.

    Notes:
    csta and cstb are typically not compile-time constants here, so the mask
    is built with run-time shifts.  When they are known constants the plain
    C version is probably the better choice.

    In the C version a field with cstb > 31 or csta > cstb is treated as
    empty and src2 is returned unchanged: the shift counts would otherwise
    reach 32 or more, which is undefined behaviour in C.  The _OPT_ version
    performs no such check.  */


__attribute__ ((noinline)) static unsigned _clr(unsigned src2, unsigned csta, unsigned cstb) {

#ifdef _OPT_

  unsigned int temp,temp2,temp3,temp4,temp5,temp6;

   asm ("li %0,-1\n\t"        : "=r" (temp) );  // load -1 (all ones)
   asm ("subfic %0,%1,31\n\t" : "=r" (temp6) : "r" (cstb) ); // 31-cstb
   asm ("slw %0,%1,%2\n\t"    : "=r" (temp2) : "r" (temp), "r" (temp6)); // temp = temp << (31-cstb)
   asm ("addc %0,%0,%1\n\t"   : "+r" (temp6) :"r" (csta) ); // 31-cstb+csta
   asm ("srw %0,%1,%2\n\t"    : "=r" (temp3) :"r" (temp2), "r" (temp6) ); // >> (31-cstb+csta)
   asm ("slw %0,%1,%2\n\t"    : "=r" (temp4) :"r" (temp3), "r" (csta) ); // << csta
   asm ("nor %0,%1,%1\n\t"    : "=r" (temp5) :"r" (temp4) );      // ~mask
   asm ("and %0,%0,%1\n\t"    : "+r" (src2)  : "r" (temp5) ); // clear the field in src2

  return (src2);

#else

  unsigned mask;

  /* Empty or out-of-range field: nothing to clear (avoids shifts >= 32). */
  if ((cstb > 31) || (csta > cstb))
    return (src2);

  /* Drop the bits above cstb, then the bits below csta, then move the
     remaining run of ones back to position csta. */
  mask = ((0xffffffffu << (31 - cstb)) >> (31 - cstb + csta)) << csta;

  return ((~mask) & src2);

#endif /* _OPT_ */

}


/*!
 *
 * \var		bit_count_table_8bit[]
 *
 * \brief	Lookup table giving the number of bits set to 1 in an 8-bit value.
 *
 * The table has 256 entries, 16 per row below; entry i holds the count of
 * 1 bits in the value i (entry 0 is 0, entry 255 is 8).  It is indexed
 * with a single byte by _bitc4().
 *
 * The lookup trades one extra load per byte against a loop that would
 * otherwise shift and mask every bit individually.
 *
 * \warning	The index must be in the range 0..255.
 *
 */
static const unsigned char bit_count_table_8bit[] =	{
		0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,
		1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
		1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
		2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
		1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
		2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
		2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
		3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
		1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
		2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
		2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
		3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
		2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
		3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
		3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
		4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8 };

// end bit_count_table_8bit[]


/* bitc4

   Counts the bits set to 1 in each of the four bytes of src.  The count for
   byte k (k = 0 is the least significant byte) is written to byte k of the
   return value, so every result byte is in the range 0..8.
*/
static unsigned int	_bitc4(unsigned int src)
{
  unsigned int packed_counts = 0;
  unsigned int lane;

  for (lane = 0; lane < 4; lane++) {
    unsigned int  shift = lane * 8;
    unsigned char byte  = (unsigned char) ((src >> shift) & 0xFF);

    /* One table load per byte replaces a per-bit shift/mask loop. */
    packed_counts |= (unsigned int) bit_count_table_8bit[byte] << shift;
  }

  return packed_counts;

}  // end _bitc4()

/* helper function

   Clamps a 32-bit signed value to the signed 16-bit range: values above
   0x7fff give MAX_16, values below -0x8000 give MIN_16, and anything in
   between is returned as its low 16 bits.
*/

static short do__saturate (int L_var1)
{
    /* Too large for 16 bits: clamp to the upper limit. */
    if (L_var1 > 0X00007fffL)
        return (MAX_16);

    /* Too small for 16 bits: clamp to the lower limit. */
    if (L_var1 < (int) 0xffff8000L)
        return (MIN_16);

    /* Already representable: keep the low half-word. */
    return (__l_get_lsb (L_var1));
}

/* dotpsu4

   Dot product, signed by unsigned, of packed 8-bit values.

   For each of the four byte positions, the byte of src1 is taken as a
   SIGNED 8-bit value (-128..127) and the byte of src2 as an UNSIGNED 8-bit
   value (0..255).  The two are multiplied and the four products are summed.

   Parameters:
     src1  four packed signed 8-bit values
     src2  four packed unsigned 8-bit values

   Returns the signed 32-bit sum, delivered through the unsigned return type
   as its two's complement bit pattern.  The sum always fits: its magnitude
   is at most 4 * 128 * 255.
*/


static unsigned int	_dotpsu4(unsigned src1, unsigned src2)
{
  int           sum = 0;
  unsigned int  shift;

  for (shift = 0; shift < 32; shift += 8) {
    int s_byte = (int) ((src1 >> shift) & 0xFF);
    int u_byte = (int) ((src2 >> shift) & 0xFF);

    /* Sign-extend the src1 byte from 8 bits without relying on a
       narrowing conversion to signed char. */
    s_byte = (s_byte ^ 0x80) - 0x80;

    sum += s_byte * u_byte;
  }

  return (unsigned int) sum;

}  // end _dotpsu4()

/* abs2

   Calculates the absolute value of each of the two signed 16-bit values
   packed in __Da (bits 16..31 and bits 0..15).

   The results are saturated to the signed 16-bit range, so the one input
   whose magnitude does not fit, 0x8000 (-32768), gives 0x7fff.

   Returns the two absolute values packed in the same positions.
*/


static int _abs2(int __Da)
{
	unsigned int packed = (unsigned int) __Da;

	/* Split into half-words and sign-extend each from 16 bits. */
	int hi = (int) ((packed >> 16) & 0xffffu);
	int lo = (int) (packed & 0xffffu);

	hi = (hi ^ 0x8000) - 0x8000;
	lo = (lo ^ 0x8000) - 0x8000;

	/* Absolute value, computed directly so no library call is needed. */
	if (hi < 0) {
		hi = -hi;
	}
	if (lo < 0) {
		lo = -lo;
	}

	/* Only |-32768| = 32768 can exceed the 16-bit maximum. */
	if (hi > 0x7fff) {
		hi = 0x7fff;
	}
	if (lo > 0x7fff) {
		lo = 0x7fff;
	}

	/* Both halves are now 0..0x7fff, so the shift cannot overflow. */
	return ((hi << 16) | lo);
}

/* min2

   Treats __Da and __Db as two packed signed 16-bit values each (bits 16..31
   and bits 0..15) and places the smaller value of each pair in the
   corresponding position of the return value.

   Parameters:
     __Da  first pair of packed signed 16-bit values
     __Db  second pair of packed signed 16-bit values

   Returns the packed pair of minima.
*/

static int _min2(int __Da, int __Db)
{
	unsigned int a = (unsigned int) __Da;
	unsigned int b = (unsigned int) __Db;

	/* Extract each half-word and sign-extend it from 16 bits. */
	int a_hi = (int) (((a >> 16) & 0xffffu) ^ 0x8000u) - 0x8000;
	int a_lo = (int) ((a & 0xffffu) ^ 0x8000u) - 0x8000;
	int b_hi = (int) (((b >> 16) & 0xffffu) ^ 0x8000u) - 0x8000;
	int b_lo = (int) ((b & 0xffffu) ^ 0x8000u) - 0x8000;

	int min_hi = (a_hi < b_hi) ? a_hi : b_hi;
	int min_lo = (a_lo < b_lo) ? a_lo : b_lo;

	/* Repack in unsigned arithmetic: shifting a signed value into the sign
	   bit would be undefined when the high-lane minimum is negative. */
	unsigned int packed = (((unsigned int) min_hi & 0xffffu) << 16)
	                    | ((unsigned int) min_lo & 0xffffu);

	return ((int) packed);

}

/* dotpus4

   Dot product, unsigned by signed, of packed 8-bit values.  This is the
   operand-swapped counterpart of dotpsu4.

   For each of the four byte positions, the byte of src1 is taken as an
   UNSIGNED 8-bit value (0..255) and the byte of src2 as a SIGNED 8-bit
   value (-128..127).  The two are multiplied and the four products are
   summed.

   Parameters:
     src1  four packed unsigned 8-bit values
     src2  four packed signed 8-bit values

   Returns the signed 32-bit sum, delivered through the unsigned return type
   as its two's complement bit pattern.  The sum always fits: its magnitude
   is at most 4 * 255 * 128.
*/

static unsigned int	_dotpus4(unsigned src1, unsigned src2)
{
  int           sum = 0;
  unsigned int  shift;

  for (shift = 0; shift < 32; shift += 8) {
    int u_byte = (int) ((src1 >> shift) & 0xFF);
    int s_byte = (int) ((src2 >> shift) & 0xFF);

    /* Sign-extend the src2 byte from 8 bits without relying on a
       narrowing conversion to signed char. */
    s_byte = (s_byte ^ 0x80) - 0x80;

    sum += u_byte * s_byte;
  }

  return (unsigned int) sum;

}  // end _dotpus4()

/*  ext

    Extracts a bit field from src2 and sign-extends it to 32 bits.  The
    extract is performed by a shift left by csta followed by a signed
    (arithmetic) shift right by cstb.

    Parameters:
      src2  value the field is extracted from
      csta  left shift amount,  0..31
      cstb  right shift amount, 0..31

    Returns the sign-extended field.  In the C version a shift amount above
    31 returns 0 (behaviour observed on the 66x simulator); the _OPT_
    version performs no such check.
*/

__attribute__ ((noinline)) static int _ext (int src2, unsigned csta, unsigned cstb){
#ifdef _OPT_
  unsigned int temp;

  asm ("slw %0,%1,%2\n\t" : "=r" (temp) : "r" (src2), "r" (csta) );
  asm ("sraw %0,%1,%2\n\t" : "=r" (src2) : "r" (temp), "r" (cstb) );

  return (src2);

#else

  /* observed behavior on 66x simulator: invalid shift results in zero return */
  if ((csta > 31) || (cstb > 31))
    return 0;

  /* The left shift is done on an unsigned value: left-shifting a negative
     int, or shifting a 1 into the sign bit, is undefined in C.  The value
     is then reinterpreted as signed so the right shift replicates the sign
     bit. */
  return ((int) ((unsigned int) src2 << csta)) >> cstb;

#endif

}


/* extu

   Extracts a bit field from src2 and zero-extends it to 32 bits.  The
   extract is performed by a shift left by csta followed by an unsigned
   (logical) shift right by cstb.

   Parameters:
     src2  value the field is extracted from
     csta  left shift amount,  0..31
     cstb  right shift amount, 0..31

   Returns the zero-extended field.  In the C version a shift amount above
   31 returns 0 (behaviour observed on the 66x simulator); the _OPT_
   version performs no such check.
*/

static inline unsigned int _extu (unsigned int src2, unsigned csta, unsigned cstb){
#ifdef _OPT_
  unsigned int temp;

  asm ("slw %0,%1,%2\n\t" : "=r" (temp) : "r" (src2), "r" (csta) );
  asm ("srw %0,%1,%2\n\t" : "=r" (src2) : "r" (temp), "r" (cstb) );

  return (src2);


#else
  /* observed behavior on 66x simulator: invalid shift results in zero return */
  /* Both limits are decimal 31: a shift by 32 or more is undefined in C. */
  if ((csta > 31) || (cstb > 31))
    return 0;

  return (src2 << csta) >> cstb;

#endif

}


/*********************************************************************************/
/* END OF TI PORTED INTRINSICS */
/*********************************************************************************/
#endif		//_PORT_TI_TO_PA
