forums.ps2dev.org Forum Index forums.ps2dev.org
Homebrew PS2, PSP & PS3 Development Discussions
 
 FAQFAQ   SearchSearch   MemberlistMemberlist   UsergroupsUsergroups   RegisterRegister 
 ProfileProfile   Log in to check your private messagesLog in to check your private messages   Log inLog in 

VFPU diggins
Goto page 1, 2, 3  Next
 
Post new topic   Reply to topic    forums.ps2dev.org Forum Index -> PSP Development
View previous topic :: View next topic  
Author Message
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Tue Nov 07, 2006 9:50 pm    Post subject: VFPU diggins Reply with quote

This topic is where we can share our VFPU diggins. This first message should grow more and more as our VFPU diggins make progress.

Code:

/////////////////////////////////////////////////////////////
// VFPU diggins
/////////////////
//
// Authors :
//
//   hlide, Raphael
//
// 2006-11-17 01:05PM
//
/////////////////////////////////////////////////////////////


op  operands                             ticks         latency*
-----------------------------------------

mtv rt, vs.s
{
  vs.s = rt; // rt is general purpose register
}

mfv rt, vs.s
{
  rt = vs.s; // rt is general purpose register
}

-----------------------------------------

mtvc rt, vcr
{
  vcr = rt; // vcr is cop2 control register
}

mfvc rt, vcr
{
  rt = vcr; // vcr is cop2 control register
}

-----------------------------------------

vmtvc vcr, vs.s
{
  vcr = vs.s;
}

vmfvc sd, cr
{
  sd = cr;
}

-----------------------------------------

// rm is general purpose register containing a memory address
lv.s vd.s, offset(rm)
{
   vd.s = offset(rm);
}

sv.s vd.s, offset(rm)
{
   offset(rm) = vd.s;
}


// rm needs to be aligned to 16bytes (quadword)
lv.q vd, rm                                 1            0       (cache)
{                                       68                  (memory)
   vd[0] = 0(rm);
   vd[1] = 4(rm);
   vd[2] = 8(rm);
   vd[3] = 12(rm);
}

ulv.q vd, rm                              2            0       (cache)
{                                       68                  (memory)
   vd[0] = 0(rm);
   vd[1] = 4(rm);
   vd[2] = 8(rm);
   vd[3] = 12(rm);
}

// rm needs to be aligned to 16bytes (quadword)
sv.q vd, rm                                 7            2       (cache)
{                                       111                  (memory)
   0(rm) = vd[0];
   4(rm) = vd[1];
   8(rm) = vd[2];
   12(rm) = vd[3];
}

usv.q vd, rm                              14            4       (cache)
{                                       111                  (memory)
   0(rm) = vd[0];
   4(rm) = vd[1];
   8(rm) = vd[2];
   12(rm) = vd[3];
}

-----------------------------------------

// vector register prefixes

vpfxs [?0,?1,?2,?3]
// special prefix for vs like vs.q[X, X, Y, Y] - their values may be :
//  x : vs[0]
//  y : vs[1]
//  z : vs[2]
//  w : vs[3]
//  -x : -vs[0]
//  -y : -vs[1]
//  -z : -vs[2]
//  -w : -vs[3]
//  |x| : |vs[0]| (absolute value of vs[0])
//  |y| : |vs[1]| (absolute value of vs[1])
//  |z| : |vs[2]| (absolute value of vs[2])
//  |w| : |vs[3]| (absolute value of vs[3])
//  0 : constant 0
//  1 : constant 1
//  2 : constant 2
//  1/2 : constant 1/2
//  3 : constant 3
//  1/3 : constant 1/3
//  1/4 : constant 1/4
//  1/6 : constant 1/6
//
// so vmov.q vd, vs[z, |x|, 0, -x] :
//   vd[0] = vs[2];
//   vd[1] = |vs[0]|;
//   vd[2] = 0;
//   vd[3] = -vs[0];

vpfxt [?0,?1,?2,?3]
// special prefix for vt like vt.q[X, X, Y, Y] -  their values may be :
//  x : vt[0]
//  y : vt[1]
//  z : vt[2]
//  w : vt[3]
//  -x : -vt[0]
//  -y : -vt[1]
//  -z : -vt[2]
//  -w : -vt[3]
//  |x| : |vt[0]| (absolute value of vt[0])
//  |y| : |vt[1]| (absolute value of vt[1])
//  |z| : |vt[2]| (absolute value of vt[2])
//  |w| : |vt[3]| (absolute value of vt[3])
//  0 : constant 0
//  1 : constant 1
//  2 : constant 2
//  1/2 : constant 1/2
//  3 : constant 3
//  1/3 : constant 1/3
//  1/4 : constant 1/4
//  1/6 : constant 1/6
//

vpfxd [?4,?5,?6,?7]
// special prefix for vd like vd.q[0:1, 0:1, 0:1, 0:1] -  their values may be :
// 0:1 : min(1, max(0, vd[i]))
// -1:1 : min(1, max(-1, vd[i]))
// m : ???
//
// so vmov.p vd[0:1, -1:1], vs :
//   vd[0] = min(1, max(0, vs[0]));
//   vd[1] = min(1, max(-1, vs[1]));


-----------------------------------------

vadd.q/t/p/s vd, vs, vt                        1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = vs[i] + vt[i];
}

vsub.q/t/p/s vd, vs, vt                        1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = vs[i] - vt[i];
}

-----------------------------------------

vdiv.q/t/p/s vd, vs, vt                        56/42/28/14      30/?/?/?
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = vs[i] / vt[i];
}

vmul.q/t/p/s vd, vs, vt                        1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = vs[i] * vt[i];
}

-----------------------------------------

vdot.q/t/p/s sd.s, vs, vt                     1            0
{
  sd.s = 0;
  for (i = 0; i < |q/t/p/s|; ++i)
    sd.s += vs[i] * vt[i];
}

-----------------------------------------

vscl.q/t/p/s vd, vs, vt.s                     1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = vs[i] * vt.s;
}

-----------------------------------------

// Homogeneous dot product
vhdp.q/t/p/s vd.s, vs, vt (UNSURE)               1            0
{
  vd.s = vt[|q/t/p|-1];
  for (i = 0; i < |q/t/p|-1; ++i)
    vd.s += vs[i] * vt[i];
}

-----------------------------------------

vcmp.q/t/p/s f2, vs, vt                        1            0
{
  for (i = 0; i < 5; ++i)
    VFPU_CC[i] = 0;

  VFPU_CC[5] = 1;

  for (i = 0; i < |q/t/p|; ++i)
    VFPU_CC[i] = bcmp(f2, vs[i], vt[i]);  // f2 = EQ/NE/LE/LT/GE/GT
 
  for (i = 0; i < |q/t/p|; ++i)
  {
    VFPU_CC[4] ||= VFPU_CC[i];
    VFPU_CC[5] &&= VFPU_CC[i];
  }
}

vcmp.q/t/p/s f1, vs                           1            0
{
  for (i = 0; i < 5; ++i)
    VFPU_CC[i] = 0;

  VFPU_CC[5] = 1;

  for (i = 0; i < |q/t/p|; ++i)
    VFPU_CC[i] = ucmp(f1, vs[i]);  // f1 = EN/EI/EZ/ES/NN/NI/NZ/NS
 
  for (i = 0; i < |q/t/p|; ++i)
  {
    VFPU_CC[4] ||= VFPU_CC[i];
    VFPU_CC[5] &&= VFPU_CC[i];
  }
}

vcmp.q/t/p/s f0
{
  for (i = 0; i < 5; ++i)
    VFPU_CC[i] = 0;

  VFPU_CC[5] = 1;

  for (i = 0; i < |q/t/p|; ++i)
    VFPU_CC[i] = f0; // f0 = TR/FL
 
  for (i = 0; i < |q/t/p|; ++i)
  {
    VFPU_CC[4] ||= VFPU_CC[i];
    VFPU_CC[5] &&= VFPU_CC[i];
  }
}

-----------------------------------------

vmin.q/t/p/s vd, vs, vt                        1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = min(vs[i], vt[i]);
}

vmax.q/t/p/s vd, vs, vt                        1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = max(vs[i], vt[i]);
}

-----------------------------------------

vsgn.q/t/p/s vd, vs                           1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = (vs[i] < 0.0) ? -1.0 : (vs[i] > 0.0) ? 1.0 : 0.0;
}

-----------------------------------------

vcst.q/t/p/s vd, VFPU_SPC_CST                  1            0
{
  // VFPU_HUGE = Inf
  // VFPU_SQRT2 = SQRT(2)
  // VFPU_SQRT1_2 = SQRT(1/2)
  // VFPU_2_SQRTPI = 2/SQRT(PI)
  // VFPU_2_PI = 2/PI
  // VFPU_1_PI = 1/PI
  // VFPU_PI_4 = PI/4
  // VFPU_PI_2 = PI/2
  // VFPU_PI = PI
  // VFPU_E = e
  // VFPU_LOG2E = log2(e)
  // VFPU_LOG10E = log10(e)
  // VFPU_LN2 = ln(2)
  // VFPU_LN10 = ln(10)
  // VFPU_2PI = 2*PI
  // VFPU_PI_6 = PI/6
  // VFPU_LOG10TWO = log10(2)
  // VFPU_LOG2TEN = log2(10)
  // VFPU_SQRT3_2 = sqrt(3)/2

  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = vfpu_special_constant[VFPU_SPC_CST];
}

-----------------------------------------

vscmp.q/t/p/s vd, vs, vt                     1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = (vs[i] < vt[i]) ? -1.0 : (vs[i] > vt[i]) ? 1.0 : 0.0;
}

vsge.q/t/p/s vd, vs, vt                        1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = (vs[i] >= vt[i]) ? 1.0 : 0.0;
}

vslt.q/t/p/s vd, vs, vt                        1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = (vs[i] < vt[i]) ? 1.0 : 0.0;
}

-----------------------------------------

vi2uc.q vd.s, vs.q                           1            0
{
  vd.s[0]( 0.. 7) = vs.q[0] & 0xFF;
  vd.s[0]( 8..15) = vs.q[1] & 0xFF;
  vd.s[0](16..23) = vs.q[2] & 0xFF;
  vd.s[0](24..31) = vs.q[3] & 0xFF;
}

vi2c.q vd.s, vs.q                           1            0
{
  vd.s[0]( 0.. 7) = (vs.q[0] & 0x7F) | ((vs.q[0] & 0x80000000) >> 24);
  vd.s[0]( 8..15) = (vs.q[1] & 0x7F) | ((vs.q[1] & 0x80000000) >> 24);
  vd.s[0](16..23) = (vs.q[2] & 0x7F) | ((vs.q[2] & 0x80000000) >> 24);
  vd.s[0](24..31) = (vs.q[3] & 0x7F) | ((vs.q[3] & 0x80000000) >> 24);
}

-----------------------------------------

vmov.q/t/p/s vd, vs                           1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = vs[i];
}

-----------------------------------------

vabs.q/t/p/s vd, vs                           1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = |vs[i]|;
}

-----------------------------------------

vneg.q/t/p/s vd, vs                           1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = -vs[i];
}

-----------------------------------------

vsat0.q/t/p/s vd, vs                        1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = max(0.0, min(vs[i], 1.0));
}

vsat1.q/t/p/s vd, vs                        1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = max(-1.0, min(vs[i], 1.0));
}

-----------------------------------------

vzero.q/t/p/s vd                           3/?/?/?         2
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = 0.0;
}

vone.q/t/p/s vd                              3/?/?/?         2
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = 1.0;
}

vidt.q/t/p/s vd                              3/?/?/?         2
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = (vd[i].column == vd[i].row) ? 1.0 : 0.0;
}

-----------------------------------------

vrcp.q/t/p/s vd, vs                           4/?/?/?         3
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = 1.0 / vs[i];
}

vrsq.q/t/p/s vd, vs                           4/?/?/?         3
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = 1.0 / sqrt(vs[i]);
}

-----------------------------------------

vsin.q/t/p/s vd, vs                           4/?/?/?         3
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = sin(vs[i]*PI/2);
}

vcos.q/t/p/s vd, vs                           4/?/?/?         3
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = cos(vs[i]*PI/2);
}

vasin.q/t/p/s vd, vs                        4/?/?/?         3
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = asin(vs[i]) * 2/PI; // not sure about this conversion
}

-----------------------------------------

vexp2.q/t/p/s vd, vs                        4/?/?/?         3
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = exp2(vs[i]);
}

vlog2.q/t/p/s vd, vs                        4/?/?/?         3
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = log2(vs[i]);
}

-----------------------------------------

vsqrt.q/t/p/s vd, vs                        4/?/?/?         3
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = sqrt(vs[i]);
}

-----------------------------------------

vrnds.s vs                    ?      ?
{
  random_seed(vs);
}

-----------------------------------------

vrndi.q/t/p/s vd                     12/9/6/3      10/7/4/1
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = rand_integer(-1<<31, 1<<31); // -1<<31 <= vd[i] < 1<<31
}

-----------------------------------------

vrndf1.q/t/p/s vd                     12/9/6/3      10/7/4/1
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = rand_float(0.0, 2.0); // 0.0 <= vd[i] < 2.0
}

-----------------------------------------

vrndf2.q/t/p/s vd                     12/9/6/3      10/7/4/1
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = rand_float(0.0, 4.0); // 0.0 <= vd[i] < 4.0
}

-----------------------------------------

// Nvidia Half format [S:1][E:5][M:10]
vf2h.p/q vd, vs   (UNSURE)                           1            0
{
  for (i = 0; i < |q/p|/2; ++i)
  {
    vd[i]( 0..15) = ((vs[i*2] >> 16) & 0x8000) | ((vs[i*2] >> 13) & 0x03FF);

    e = ((vs[i*2] >> 23) & 0xFF) - 0x70;
    if (e < 0)
      e = 0;
    if (e > 31)
    {
      e = 31;
      vd[i] &= ~0x03FF;   // -> make too huge numbers infinity
    }
    if (((vs[i*2] & 0x7FFFFF) != 0) && (((vs[i*2] >> 23) & 0xFF) == 0xFF))
      vd[i] |= 0x03FF;   // -> But NaNs stay NaNs even with mantissa loss
    vd[i] |= (e << 10);


    vd[i](16..31) = ((vs[i*2+1] >> 16) & 0x8000) | ((vs[i*2+1] >> 13) & 0x03FF);

    e = ((vs[i*2+1] >> 23) & 0xFF) - 0x70;
    if (e < 0)
      e = 0;
    if (e > 31)
    {
      e = 31;
      vd[i] &= ~0x03FF0000;   // -> make too huge numbers infinity
    }
    if (((vs[i*2+1] & 0x7FFFFF) != 0) && (((vs[i*2+1] >> 23) & 0xFF) == 0xFF))
      vd[i] |= 0x03FF0000;   // -> But NaNs stay NaNs even with mantissa loss
    vd[i] |= (e << 26);
  }
}

-----------------------------------------

vsrt1.q vd, vs                              1            0
{
  vd[0] = min(vs[0], vs[1]);
  vd[1] = max(vs[1], vs[0]);
  vd[2] = min(vs[2], vs[3]);
  vd[3] = max(vs[3], vs[2]);
}

vsrt2.q vd, vs                              1            0
{
  vd[0] = min(vs[0], vs[3]);
  vd[1] = max(vs[1], vs[2]);
  vd[2] = min(vs[2], vs[1]);
  vd[3] = max(vs[3], vs[0]);
}

vsrt3.q vd, vs                              1            0
{
  vd[0] = max(vs[0], vs[1]);
  vd[1] = min(vs[1], vs[0]);
  vd[2] = max(vs[2], vs[3]);
  vd[3] = min(vs[3], vs[2]);
}

vsrt4.q vd, vs                              1            0
{
  vd[0] = max(vs[0], vs[3]);
  vd[1] = max(vs[1], vs[2]);
  vd[2] = min(vs[2], vs[1]);
  vd[3] = min(vs[3], vs[0]);
}

-----------------------------------------

vbfy1.q/p vd, vs                           1            0
{
  for (i = 0; i < |q/p|; i += 2)
  {
    vd[i+0] = vs[i+0] + vs[i+1];
    vd[i+1] = vs[i+0] - vs[i+1];
  }
}

vbfy2.q vd, vs                              1            0   
{
  vd[0] = vs[0] + vs[2];
  vd[1] = vs[1] + vs[3];
  vd[2] = vs[0] - vs[2];
  vd[3] = vs[1] - vs[3];
}

-----------------------------------------

vocp.q/t/p/s vd, vs                           1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = 1.0 - vs[i];
}

-----------------------------------------

// Funnel add components
vfad.q/t/p/s vd.s, vs                        1            0
{
  vd.s = 0;
  for (i = 0; i < |q/t/p/s|; ++i)
    vd.s += vs[i];
}

-----------------------------------------

// Average of components
vavg.q/t/p/s vd.s, vs                        1            0
{
  vd.s = 0.0;
  for (i = 0; i < |q/t/p/s|; ++i)
    vd.s += vs[i];
  vd.s /= |q/t/p/s|;
}

-----------------------------------------

// Round
vf2in.q/t/p/s vd, vs, imm                     1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = ROUND(vs[i]) << imm;
}

-----------------------------------------

// Trunc
vf2iz.q/t/p/s vd, vs, imm                     1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = TRUNC(vs[i]) << imm;
}

-----------------------------------------

// Floor
vf2iu.q/t/p/s vd, vs, imm                     1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = FLOOR(vs[i]) << imm;
}

-----------------------------------------

// Ceil
vf2id.q/t/p/s vd, vs, imm                     1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = CEIL(vs[i]) << imm;
}

-----------------------------------------

// (float)
vi2f.q/t/p/s vd, vs, imm                     1            0
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = (float)(vs[i]) / (float)(1<<imm);
}

-----------------------------------------

// Conditional move vector on true
vcmovt.q/t/p/s vd, vs, cc (UNSURE)               5            4
{
  switch (cc)
  {
  case 0...5 :
    if (CC[cc] == TRUE)
     vd = vs;
    break;
  case 6:
    for (i = 0; i < |q/t/p/s|; ++i)
     if (CC[i] == TRUE)
       vd[i] = vs[i];
    break;
  }
}

// Conditional move vector on false
vcmovf.q/t/p/s vd, vs, cc (UNSURE)               5            4
{
  switch (cc)
  {
  case 0...5 :
    if (CC[cc] == FALSE)
     vd = vs;
    break;
  case 6:
    for (i = 0; i < |q/t/p/s|; ++i)
     if (CC[i] == FALSE)
       vd[i] = vs[i];
    break;
  }
}

-----------------------------------------

// Matrix multiplication
vmmul.q/t/p md, ms, mt                        16/8/4         15/7/3
{
  for (i = 0; i < |q/t/p|; ++i)
    for (j = 0; j < |q/t/p|; ++j)
    {
      md[i][j] = 0;
      for (k = 0; k < |q/t/p|; ++k)
        md[i][j] += ms[i][k] * mt[k][j];
    }
}

-----------------------------------------

// Matrix-vector transform
vtfm4.q/3.t/2.p vd, md, vt                     4/3/2         3/2/1
{
  for (i = 0; i < |q/t/p|; ++i)
  {
    vd[i] = 0;
    for (j = 0; j < |q/t/p|; ++j)
      vd[i] += md[i][j] * vt[j];
  }
}

-----------------------------------------

// Homogeneous transform
vhtfm4.q/3.t/2.p vd, md, vt                     4/3/2         3/2/1
{
  for (i = 0; i < |q/t/p|; ++i)
  {
    vd[i] = 0;
    for (j = 0; j < |q/t/p|; ++j)
      vd[i] += md[i][j] * vt[j];
  }
  for (i = 0; i < |q/t/p|; ++i)
    vd[i] /= vd[|q/t/p|];
}

-----------------------------------------

// Matrix scale
vmscl.q/t/p md, ms, vt.s                     4/3/2         3/2/1
{
  for (i = 0; i < |q/t/p|; ++i)
    for (j = 0; j < |q/t/p|; ++j)
      md[i][j] = ms[i][j] * vt.s;
}

-----------------------------------------

// Quaternion multiply
vqmul.q vd, vs, vt                           4            3
{
  vd[0] = vs[3] * vt[0] + vs[0] * vt[3] + vs[1] * vt[2] - vs[2] * vt[1];
  vd[1] = vs[3] * vt[1] + vs[1] * vt[3] + vs[2] * vt[0] - vs[0] * vt[2];
  vd[2] = vs[3] * vt[2] + vs[2] * vt[3] + vs[0] * vt[1] - vs[1] * vt[0];
  vd[3] = vs[3] * vt[3] - vs[0] * vt[0] - vs[1] * vt[1] - vs[2] * vt[2];
}

-----------------------------------------

// Matrix move
vmmov.q/t/p md, ms                           4/3/2         3/2/1
{
  for (i = 0; i < |q/t/p|; ++i)
    for (j = 0; j < |q/t/p|; ++j)
      md[i][j] = ms[i][j];
}

-----------------------------------------

// Matrix Identity
vmidt.q/t/p md                              6/5/4         5/4/3
{
  for (i = 0; i < |q/t/p|; ++i)
    for (j = 0; j < |q/t/p|; ++j)
      md[i][j] = (i == j) ? 1.0 : 0.0;
}

-----------------------------------------

// Matrix-zero
vmzero.q/t/p md                              6/5/4         5/4/3
{
  for (i = 0; i < |q/t/p|; ++i)
    for (j = 0; j < |q/t/p|; ++j)
      md[i][j] = 0.0;
}

-----------------------------------------

// Matrix-one
vmone.q/t/p md                              6/5/4         5/4/3
{
  for (i = 0; i < |q/t/p|; ++i)
    for (j = 0; j < |q/t/p|; ++j)
      md[i][j] = 1.0;
}

-----------------------------------------

// Rotation vector
vrot.q/t/p vd, vs.s, [+c/-c/-s/+s/0,...]         2            1
{
  for (i = 0; i < |q/t/p|; ++i)
    vd[i] = (+1.0 | -1.0) * (cos | sin)(vs.s*PI/2.0) | 0;
}

-----------------------------------------

vt4444.q vd, vs                              1            0
{
  vd[0]( 0..15) = ((vs[0] & 0xF0000000) >> 16) | ((vs[0] & 0xF00000) >> 12) | ((vs[0] & 0xF000) >> 8) | ((vs[0] & 0xF0) >> 4);
  vd[0](16..31) = ((vs[1] & 0xF0000000) >> 16) | ((vs[1] & 0xF00000) >> 12) | ((vs[1] & 0xF000) >> 8) | ((vs[1] & 0xF0) >> 4);
  vd[1]( 0..15) = ((vs[2] & 0xF0000000) >> 16) | ((vs[2] & 0xF00000) >> 12) | ((vs[2] & 0xF000) >> 8) | ((vs[2] & 0xF0) >> 4);
  vd[1](16..31) = ((vs[3] & 0xF0000000) >> 16) | ((vs[3] & 0xF00000) >> 12) | ((vs[3] & 0xF000) >> 8) | ((vs[3] & 0xF0) >> 4);
}

-----------------------------------------

vt5551.q vd, vs                              1            0
{
  vd[0]( 0..15) = ((vs[0] & 0x80000000) >> 16) | ((vs[0] & 0xF80000) >> 9) | ((vs[0] & 0xF800) >> 6) | ((vs[0] & 0xF8) >> 3);
  vd[0](16..31) = ((vs[1] & 0x80000000) >> 16) | ((vs[1] & 0xF80000) >> 9) | ((vs[1] & 0xF800) >> 6) | ((vs[1] & 0xF8) >> 3);
  vd[1]( 0..15) = ((vs[2] & 0x80000000) >> 16) | ((vs[2] & 0xF80000) >> 9) | ((vs[2] & 0xF800) >> 6) | ((vs[2] & 0xF8) >> 3);
  vd[1](16..31) = ((vs[3] & 0x80000000) >> 16) | ((vs[3] & 0xF80000) >> 9) | ((vs[3] & 0xF800) >> 6) | ((vs[3] & 0xF8) >> 3);
}

-----------------------------------------

vt5650.q vd, vs                              1            0
{
  vd[0]( 0..15) = ((vs[0] & 0xF80000) >> 8) | ((vs[0] & 0xFC00) >> 5) | ((vs[0] & 0xF8) >> 3);
  vd[0](16..31) = ((vs[1] & 0xF80000) >> 8) | ((vs[1] & 0xFC00) >> 5) | ((vs[1] & 0xF8) >> 3);
  vd[1]( 0..15) = ((vs[2] & 0xF80000) >> 8) | ((vs[2] & 0xFC00) >> 5) | ((vs[2] & 0xF8) >> 3);
  vd[1](16..31) = ((vs[3] & 0xF80000) >> 8) | ((vs[3] & 0xFC00) >> 5) | ((vs[3] & 0xF8) >> 3);
}

-----------------------------------------

vcrs.t vd, vs, vt                           1            0
{
  vd[0] = vs[1] * vt[2];
  vd[1] = vs[2] * vt[0];
  vd[2] = vs[0] * vt[1];
}

-----------------------------------------

// Negative reciprocal
vnrcp.q/t/p/s vd, vs (UNSURE)                  4/?/?/?         3
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = -1.0 / vs[i];
}

-----------------------------------------

// Negative sine
vnsin.q/t/p/s vd, vs (UNSURE)                  4/?/?/?         3
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = -sin(vs[i]*PI/2);
}

-----------------------------------------

// Reciprocal exponent to base 2
vrexp2.q/t/p/s vd, vs                        4/?/?/?         3
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = 1.0 / exp2(vs[i]);
}

-----------------------------------------

// Vector cross-product
vcrsp.t vd, vs, vt                           3            2
{
  vd[0] = vs[1]*vt[2] - vs[2]*vt[1];
  vd[1] = vs[2]*vt[0] - vs[0]*vt[2];
  vd[2] = vs[0]*vt[1] - vs[1]*vt[0];
}

-----------------------------------------

// Vector determinant
vdet.p vd.s, vs, vt                           1            0
{
  vd.s = vs[0] * vt[1] - vs[1] * vt[0];
}

-----------------------------------------

v(u)s2i.s vd.p, vs.s                        1            0
{
  vd.p[0] = (vs.s[0](16..31)) << 16;
  vd.p[1] = (vs.s[0]( 0..15)) << 16;
}

v(u)s2i.p vd.q, vs.p                        1            0
{
  vd.q[0] = (vs.p[0](16..31)) << 16;
  vd.q[1] = (vs.p[0]( 0..15)) << 16;
  vd.q[2] = (vs.p[1](16..31)) << 16;
  vd.q[3] = (vs.p[1]( 0..15)) << 16;
}

-----------------------------------------

vi2(u)s.s vd.s, vs.p                        1            0
{
  vd.s[0](16..31) = vs.p[0] >> 16;
  vd.s[0]( 0..15) = vs.p[1] >> 16;
}

vi2(u)s.p vd.p, vs.q                        1            0
{
  vd.p[0](16..31) = vs.q[0] >> 16;
  vd.p[0]( 0..15) = vs.q[1] >> 16;
  vd.p[1](16..31) = vs.q[2] >> 16;
  vd.p[1]( 0..15) = vs.q[3] >> 16;
}

-----------------------------------------

// Nvidia Half format [S:1][E:5][M:10]
vh2f.p vd, vs                              1            0
{
  vd[0] = ((vs[0] & 0x8000) << 16) | ((((vs[0] >> 10) & 0x1F) + 0x70) << 23) | ((vs[0] & 0x03FF) << 13);
  vd[1] = (vs[0] & 0x80000000) | ((((vs[0] >> 10) & 0x1F0000) + 0x700000) << 7) | ((vs[0] & 0x03FF0000) >> 3);
  vd[2] = ((vs[1] & 0x8000) << 16) | ((((vs[1] >> 10) & 0x1F) + 0x70) << 23) | ((vs[1] & 0x03FF) << 13);
  vd[3] = (vs[1] & 0x80000000) | ((((vs[1] >> 10) & 0x1F0000) + 0x700000) << 7) | ((vs[1] & 0x03FF0000) >> 3);
}

-----------------------------------------

vsocp.p/s vd.q/p, vs.p/s                     1            0
{
  for (i = 0; i < |p/s|; ++i)
  {
    vd[i*2+0] = 1.0 - vs[i];
    vd[i*2+1] = vs[i];
  }
}

-----------------------------------------

vsbz.s vd.s, vs.s                           1            0
{
// TODO Byte To Short Extension ?
}

vsbn.s vd.s, vs.s, vt.s                        1            0
{
// TODO Byte to Short Extension ?
}

vlgb.s vd.s, vs.s                           1            0
{
// TODO
}

vwbn.s vd.s, vs.s, imm                        1            0
{
// TODO Byte to Word Extension ?
}

-----------------------------------------

viim.s vd.s, constant integer                   1            0
{
  vd.s = constant integer (between -32768 and 32767 ?);
}

vfim.s vd.s, constant real                     1            0
{
  vd.s = constant real;
}

-----------------------------------------

vnop                                    1            0
{
  // do nothing except eating 1 cycle
}

-----------------------------------------

vflush                                    5            4
{
  // TODO
}

vsync                                    4            3
{
  // TODO
}

vsync i                                    1            0
{
  // TODO
}





NOTES:

(UNSURE) beside an op means the given C counterpart is questionable

Clock ticks are benched estimates, but should be accurate.

*The latency column is to be understood like this:
the execution cost is the (clock) ticks minus the latency and is an unavoidable cost, while the latency is the slack available for interleaving
the code with other (independent) ops at no additional cost.
Unfortunately, this does not seem to work with VFPU ops - so either the VFPU isn't pipelined or most ops with latency
already use the whole pipeline. It does work, however, with normal MIPS code (that's how it was benchmarked). This code
interleaving is recommended especially with matrix and other costly ops.


Last edited by hlide on Mon Jun 23, 2008 8:10 pm; edited 17 times in total
Back to top
View user's profile Send private message
Raphael



Joined: 17 Jan 2006
Posts: 646
Location: Germany

PostPosted: Wed Nov 08, 2006 5:53 am    Post subject: Re: VFPU diggins Reply with quote

hlide wrote:

NOTE: in fact i was first puzzled by the <<16 operation but now i find it logical in so far as it simplifies the operation (no need to extend sign this way for vfpu logic circuits).

If you need then to convert them in floats, just do "vi2f vd, vs, 16".

Yep. Had the same problem when I tried converting short arrays to float arrays for VFPU processing in libavcodec. The same goes for the reverse way, ie first do "vf2i vd, vs, 16" and then "vi2(u)s vd, vs".

I'd suggest designing your notation to differentiate between single and quad registers, as sometimes they are combined in operations and it's not immediately clear which operand has which format. Something like: vqs/vqd is a quad register and vss/vsd is a single register, or similar.
Here are some of my findings:
Code:

vocp.s vsd, vss
{
   vsd = 1.0 - vss
}

vrsq.s vsd, vss
{
  vsd = 1.0 / sqrt(vss)
}

vsat0.q/t/p/s vqd, vqs
{
  (i=0..3)
  vqd[i] = (vqs[i] < 0) ? 0 : ((vqs[i] > 1.0) ? 1.0 : vqs[i])
}

Apart from that, the vscl operation can also saturate using the destination register extension with brackets:
Code:

vscl.q/t/p/s vqd[L1:T1, L2:T2, L3:T3, L4:T4], vqs, vst
{
  (i=0..3)
  vqd[i] = CLAMP(vqs[i] * vst, Li, Ti)
}

So you can clamp to range -1:1 for example (useful for normalizations), or any other constants that can be used in those fields.

Quote:

by the way, psp-documentation from hitmen seems to be in standby :/

Unfortunately, yes :(
_________________
<Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki

Alexander Berl
Back to top
View user's profile Send private message Visit poster's website
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Wed Nov 08, 2006 9:48 am    Post subject: Reply with quote

added nearly all the instructions but a lot to be done too :/
Back to top
View user's profile Send private message
dot_blank



Joined: 28 Sep 2005
Posts: 498
Location: Brasil

PostPosted: Wed Nov 08, 2006 11:03 am    Post subject: Reply with quote

i am finally glad somebody took it up themselves
to start something like this ....cheers hlide and raphael
_________________
10011011 00101010 11010111 10001001 10111010
Back to top
View user's profile Send private message
Raphael



Joined: 17 Jan 2006
Posts: 646
Location: Germany

PostPosted: Wed Nov 08, 2006 1:31 pm    Post subject: Reply with quote

Some things from the list I can complete/confirm:

Code:

// homogenous dot product
vhdp.q/t/p/s sd.s, vs, vt (UNSURE)
{
  sd.s = vt.s;
  for (i = 1; i < |q/t/p|; ++i)
    sd.s += vs[i] * vt[i];
}

-----------------------------------------

// Funnel add components
vfad.q/t/p/s sd.s, vs
{
  sd.s = 0;
  for (i = 0; i < |q/t/p/s|; ++i)
    sd.s += vs[i];
}

-----------------------------------------

// Average of components
vavg.q/t/p/s sd.s, vs
{
  sd.s = 0.0
  for (i = 0; i < |q/t/p/s|; ++i)
    sd.s += vs[i];
  sd.s /= |q/t/p/s|;
}

-----------------------------------------

// Round
vf2in.q/t/p/s vd, sd, imm
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = ROUND(vs[i]) << imm;
}

-----------------------------------------

// Trunc
vf2iz.q/t/p/s vd, sd, imm
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = TRUNC(vs[i]) << imm;
}

-----------------------------------------

// Floor
vf2iu.q/t/p/s vd, sd, imm
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = FLOOR(vs[i]) << imm;
}

-----------------------------------------

// Ceil
vf2id.q/t/p/s vd, sd, imm
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = CEIL(vs[i]) << imm;
}

-----------------------------------------

vi2f.q/t/p/s vd, sd, imm
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = (float)(vs[i] >> imm);
}

-----------------------------------------

vcmov.q/t/p/s vd, sd, cc (UNSURE)
{
  if (CC[cc])
     vd = sd;
}

vcmovt.q/t/p/s vd, sd, cc (UNSURE)
{
  if (CC[cc] == TRUE)
     vd = sd;
}

vcmovf.q/t/p/s vd, sd, cc (UNSURE)
{
  if (CC[cc] == FALSE)
     vd = sd;
}

-----------------------------------------

// matrix multiplication
vmmul.q/t/p md, ms, mt
{
  for (i = 0; i < |q/t/p|; ++i)
    for (j = 0; j < |q/t/p|; ++j)
      md[i][j] = 0;
      for (k = 0; k < |q/t/p|; ++k)
        md[i][j] += ms[i][k] * mt[k][j];
}

-----------------------------------------

// Matrix-vector transform
vtfm4.q/3.t/2.p vd, md, vt
{
  for (i = 0; i < |q/t/p|; ++i)
    vd[i] = 0;
    for (j = 0; j < |q/t/p|; ++j)
      vd[i] += md[i][j] * vt[j];
}

-----------------------------------------

// homogenous transform
vhtfm4.q/3.t/2.p/1.s vd, md, vt (UNSURE esp 1.s case?)
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = 0;
    for (j = 0; j < |q/t/p/s|; ++j)
      vd[i] += md[i][j] * vt[j];
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] /= vd[|q/t/p/s|];
}

-----------------------------------------

// matrix scale
vmscl.q/t/p md, ms, st
{
  for (i = 0; i < |q/t/p|; ++i)
    for (j = 0; j < |q/t/p|; ++j)
      md[i][j] = ms[i][j] * st;
}

-----------------------------------------

vmmov.q/t/p md, ms
{
  for (i = 0; i < |q/t/p|; ++i)
    for (j = 0; j < |q/t/p|; ++j)
      md[i][j] = ms[i][j];
}

-----------------------------------------

vmidt.q/t/p md
{
  for (i = 0; i < |q/t/p|; ++i)
    for (j = 0; j < |q/t/p|; ++j)
      md[i][j] = (i == j) ? 1.0 : 0.0;
}

-----------------------------------------

vmzero.q/t/p md
{
  for (i = 0; i < |q/t/p|; ++i)
    for (j = 0; j < |q/t/p|; ++j)
      md[i][j] = 0.0;
}

-----------------------------------------

vmone.q/t/p md
{
  for (i = 0; i < |q/t/p|; ++i)
    for (j = 0; j < |q/t/p|; ++j)
      md[i][j] = 1.0;
}

-----------------------------------------

vrot.q/t/p vd, ss, [+c/-c/-s/+s/0,...]
{
  for (i = 0; i < |q/t/p|; ++i)
    vd[i] = +/- cos/sin(ss) | 0;
}

-----------------------------------------

vnrcp.q/t/p/s vd, vs (UNSURE)
{
  for (i = 0; i < |q/t/p|; ++i)
    vd[i] = -1.0 / vs[i];
}

-----------------------------------------

vnsin.q/t/p/s vd, vs (UNSURE)
{
  for (i = 0; i < |q/t/p|; ++i)
    vd[i] = -sin(vs[i]*PI/2);
}

-----------------------------------------

vrexp2.q/t/p/s vd, vs
{
  for (i = 0; i < |q/t/p/s|; ++i)
    vd[i] = 1.0 / exp2(vs[i]);
}

-----------------------------------------

vcrsp.t vd, vs, vt
{
  vd[0] = vs[1]*vt[2] - vs[2]*vt[1];
  vd[1] = vs[2]*vt[0] - vs[0]*vt[2];
  vd[2] = vs[0]*vt[1] - vs[1]*vt[0];
}

-----------------------------------------

I'd also suppose that the half format is [1:5:10], though the conversion steps still have to be worked out, but it should be straightforward. No shift arguments there ;)

I wanted to do something like this for some time now, but always was too lazy to begin writing down everything :) I need to slap myself that hlide had to appear before I did something

I wonder what that vcrs.t does, as there already is the cross product. Also vdet.p, though that could possibly just be a simple (vs[0]*vt[1] - vs[1]*vt[0]). Are there definitely no .t/q versions? Gonna play around with that when I find time and I'll then add some more things
_________________
<Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki

Alexander Berl
Back to top
View user's profile Send private message Visit poster's website
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Wed Nov 08, 2006 6:02 pm    Post subject: Reply with quote

nice catch for vfad, i was clueless.

opc-mips.c :

there is only one vcrs.t and vdet.p. if vdet.t exists, its opcode would probably be something like 0x67808000 + vd.t + (vs.t << 8). But in my opinion, the fact that the determinant for 3D vectors is computed differently from the one for 2D vectors may explain this:

det([a]) = a
det([[a b][c d]]) = ad - bc.
det([[a b c][d e f][g h i]]) = aei + dhc + gbf - ceg - fha - ibd.

I would investigate vcrs.t as soon as I can.

I will add your diggins as soon as possible.

N.B.: is the word "diggins" correct, or is it a pure invention of mine? I can't find a French translation for this word.
Back to top
View user's profile Send private message
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Wed Nov 08, 2006 9:59 pm    Post subject: Reply with quote

Raphael wrote:
Some things from the list I can complete/confirm:

Code:

// homogenous dot product
vhdp.q/t/p/s sd.s, vs, vt (UNSURE)
{
  sd.s = vt.s;
  for (i = 1; i < |q/t/p|; ++i)
    sd.s += vs[i] * vt[i];
}



vhdp.q ==> return Xs + Ys*Yt + Zs*Zt + Ws*Wt ?
Back to top
View user's profile Send private message
Raphael



Joined: 17 Jan 2006
Posts: 646
Location: Germany

PostPosted: Wed Nov 08, 2006 10:09 pm    Post subject: Reply with quote

hlide wrote:
Raphael wrote:
Some things from the list I can complete/confirm:

Code:

// homogenous dot product
vhdp.q/t/p/s sd.s, vs, vt (UNSURE)
{
  sd.s = vt.s;
  for (i = 1; i < |q/t/p|; ++i)
    sd.s += vs[i] * vt[i];
}



vhdp.q ==> return Xs + Ys*Yt + Zs*Zt + Ws*Wt ?

Oh, no, actually it should be Xs*Xt + Ys*Yt + Zs*Zt + Wt :D But still not sure if that is correct
_________________
<Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki

Alexander Berl
Back to top
View user's profile Send private message Visit poster's website
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Wed Nov 08, 2006 11:30 pm    Post subject: Reply with quote

i'm digging vcrs.t :

Code:


vcrs.t [1 0 0],[1 0 0] => [0 0 0]
vcrs.t [1 0 0],[0 1 0] => [0 0 1] => vd[2] = vs[0] x vt[1] ?
vcrs.t [1 0 0],[0 0 1] => [0 0 0]
vcrs.t [0 1 0],[1 0 0] => [0 0 0]
vcrs.t [0 1 0],[0 1 0] => [0 0 0]
vcrs.t [0 1 0],[0 0 1] => [1 0 0] => vd[0] = vs[1] x vt[2] ?
vcrs.t [0 0 1],[1 0 0] => [0 1 0] => vd[1] = vs[2] x vt[0] ?
vcrs.t [0 0 1],[0 1 0] => [0 0 0]
vcrs.t [0 0 1],[0 0 1] => [0 0 0]


vcrs.t [1 2 0],[1 2 0] => [0 0 2] => [ 0, 0, vs[0] x vt[1] ] !
vcrs.t [1 2 0],[0 1 2] => [4 0 1] => [ vs[1] x vt[2], 0, vs[0] x vt[1] ] ! 
vcrs.t [1 2 0],[2 0 1] => [2 0 0] => [ vs[1] x vt[2], 0, 0 ] !
vcrs.t [0 1 2],[1 2 0] => [0 2 0] => [ 0, vs[2] x vt[0], 0 ] !
vcrs.t [0 1 2],[0 1 2] => [2 0 0] => [ vs[1] x vt[2], 0, 0 ] !
vcrs.t [0 1 2],[2 0 1] => [1 4 0] => [ vs[1] x vt[2], vs[2] x vt[0], 0 ] !
vcrs.t [2 0 1],[1 2 0] => [0 1 4] => [ 0, vs[2] x vt[0], vs[0] x vt[1] ] !
vcrs.t [2 0 1],[0 1 2] => [0 0 2] => [ 0, 0, vs[0] x vt[1] ] !
vcrs.t [2 0 1],[2 0 1] => [0 2 0] => [ 0, vs[2] x vt[0], 0 ] !


it looks like :
Code:

vcrs.t vd, vs, vt
{
  vd[0] = vs[1] x vt[2];
  vd[1] = vs[2] x vt[0];
  vd[2] = vs[0] x vt[1];
}
Back to top
View user's profile Send private message
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Wed Nov 08, 2006 11:54 pm    Post subject: Reply with quote

i'm digging vdet.p as we suppose it does : vd.s = vs[0] x vt[1] - vs[1] x vt[0].

some tests just to check :
Code:

vdet.p [1 0],[1 0] => 0
vdet.p [1 0],[0 1] => 1 => vs[0] x vt[1]
vdet.p [1 0],[1 1] => 1 => vs[0] x vt[1]
vdet.p [0 1],[1 0] => -1 => -(vs[1] x vt[0])
vdet.p [0 1],[0 1] => 0
vdet.p [0 1],[1 1] => -1 => -(vs[1] x vt[0])
vdet.p [1 1],[1 0] => -1 => -(vs[1] x vt[0])
vdet.p [1 1],[0 1] => 1 =>  vs[0] x vt[1]
vdet.p [1 1],[1 1] => 0 => vs[0] x vt[1] - vs[1] x vt[0] = 0



Code:

vdet.p vd.s, vs, vt
{
  vd.s = vs[0] * vt[1] - vs[1] * vt[0];
}


Last edited by hlide on Thu Nov 09, 2006 2:34 am; edited 1 time in total
Back to top
View user's profile Send private message
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Wed Nov 08, 2006 11:57 pm    Post subject: Reply with quote

LIST UPDATED !
Back to top
View user's profile Send private message
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Thu Nov 09, 2006 2:28 am    Post subject: Reply with quote

hlide wrote:
if vdet.t exists, its opcode would probably be something like 0x67808000 + vd.t + (vs.t << 8).


I tried this one -> crash. So only vdet.p seems to exist.
Back to top
View user's profile Send private message
Raphael



Joined: 17 Jan 2006
Posts: 646
Location: Germany

PostPosted: Thu Nov 09, 2006 4:57 am    Post subject: Reply with quote

hlide wrote:
i'm digging vcrs.t :

Code:


vcrs.t [1 0 0],[1 0 0] => [0 0 0]
vcrs.t [1 0 0],[0 1 0] => [0 0 1] => vd[2] = vs[0] x vt[1] ?
vcrs.t [1 0 0],[0 0 1] => [0 0 0]
vcrs.t [0 1 0],[1 0 0] => [0 0 0]
vcrs.t [0 1 0],[0 1 0] => [0 0 0]
vcrs.t [0 1 0],[0 0 1] => [1 0 0] => vd[0] = vs[1] x vt[2] ?
vcrs.t [0 0 1],[1 0 0] => [0 1 0] => vd[1] = vs[2] x vt[0] ?
vcrs.t [0 0 1],[0 1 0] => [0 0 0]
vcrs.t [0 0 1],[0 0 1] => [0 0 0]


vcrs.t [1 2 0],[1 2 0] => [0 0 2] => [ 0, 0, vs[0] x vt[1] ] !
vcrs.t [1 2 0],[0 1 2] => [4 0 1] => [ vs[1] x vt[2], 0, vs[0] x vt[1] ] ! 
vcrs.t [1 2 0],[2 0 1] => [2 0 0] => [ vs[1] x vt[2], 0, 0 ] !
vcrs.t [0 1 2],[1 2 0] => [0 2 0] => [ 0, vs[2] x vt[0], 0 ] !
vcrs.t [0 1 2],[0 1 2] => [2 0 0] => [ vs[1] x vt[2], 0, 0 ] !
vcrs.t [0 1 2],[2 0 1] => [1 4 0] => [ vs[1] x vt[2], vs[2] x vt[0], 0 ] !
vcrs.t [2 0 1],[1 2 0] => [0 1 4] => [ 0, vs[2] x vt[0], vs[0] x vt[1] ] !
vcrs.t [2 0 1],[0 1 2] => [0 0 2] => [ 0, 0, vs[0] x vt[1] ] !
vcrs.t [2 0 1],[2 0 1] => [0 2 0] => [ 0, vs[2] x vt[0], 0 ] !


it looks like :
Code:

vcrs.t vd, vs, vt
{
  vd[0] = vs[1] x vt[2];
  vd[1] = vs[2] x vt[0];
  vd[2] = vs[0] x vt[1];
}


That makes sense, as it would be one part of the crossproduct. I need to redo my VFPU clocktick bench with those new ops :)
About the vdet.t I don't know. If it exists, it shouldn't crash normally. Is it supported by GCC?

And the homogeneous dot product needs revision to make up for (x*x+y*y+z*z+w):
Code:

    __[ homogenous dot product ]__

vhdp.q/t/p vd.s, vs, vt (UNSURE)
{
  vd.s = vt[|q/t/p|-1];
  for (i = 0; i < |q/t/p|-1; ++i)
    vd.s += vs[i] * vt[i];
}

So the last component of the second operand vs is considered to be 1.0 basically. Still unsure/needs checking
_________________
<Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki

Alexander Berl


Last edited by Raphael on Thu Nov 09, 2006 5:15 am; edited 1 time in total
Back to top
View user's profile Send private message Visit poster's website
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Thu Nov 09, 2006 5:05 am    Post subject: Reply with quote

Raphael wrote:
About the vdet.t I don't know. If it exists, it shouldn't crash normally. Is it supported by GCC?

as already said, it crashes so it is not supported.
Back to top
View user's profile Send private message
Raphael



Joined: 17 Jan 2006
Posts: 646
Location: Germany

PostPosted: Thu Nov 09, 2006 5:08 am    Post subject: Reply with quote

hlide wrote:
Raphael wrote:
About the vdet.t I don't know. If it exists, it shouldn't crash normally. Is it supported by GCC?

as already said, it crashes so it is not supported.

Oh yes, I misread the "if" there :D It makes sense to not exist, since as you said the 3d vector determinant needs three input vectors. And that's not possible at all
_________________
<Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki

Alexander Berl
Back to top
View user's profile Send private message Visit poster's website
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Fri Nov 10, 2006 6:15 pm    Post subject: Reply with quote

Raphael wrote:
hlide wrote:
Raphael wrote:
About the vdet.t I don't know. If it exists, it shouldn't crash normally. Is it supported by GCC?

as already said, it crashes so it is not supported.

Oh yes, I misread the "if" there :D It makes sense to not exist, since as you said the 3d vector determinant needs three input vectors. And that's not possible at all


if you have their cycles, it would be interesting to add in the list. :)
Back to top
View user's profile Send private message
Raphael



Joined: 17 Jan 2006
Posts: 646
Location: Germany

PostPosted: Fri Nov 17, 2006 1:21 am    Post subject: Reply with quote

Do you have any idea which version of binutils/pspsdk I need to have, to be able to use the vbtf1/2 ops? I just tried updating pspsdk but that didn't help yet, the ops still aren't recognized. I tried updating binutils, but somehow that failed, so I need to try again.
I'll have an update to the document soon. A few new ops decoded plus most clock ticks.
_________________
<Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki

Alexander Berl
Back to top
View user's profile Send private message Visit poster's website
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Fri Nov 17, 2006 1:34 am    Post subject: Reply with quote

i'm using DevkitPro and the last devkitPSP release 8.

http://sourceforge.net/project/showfiles.php?group_id=114505&package_id=157350
Back to top
View user's profile Send private message
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Fri Nov 17, 2006 1:39 am    Post subject: Reply with quote

oh my ! shouldn't be vbfy1/2 ?

I'm sorry, I DID misname them. I updated the text with the correct names.
Back to top
View user's profile Send private message
Raphael



Joined: 17 Jan 2006
Posts: 646
Location: Germany

PostPosted: Fri Nov 17, 2006 1:50 am    Post subject: Reply with quote

hlide wrote:
oh my ! shouldn't be vbfy1/2 ?

I'm sorry, I DID misname them. I updated the text with the correct names.

Heh, that did the trick :) thanks
_________________
<Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki

Alexander Berl
Back to top
View user's profile Send private message Visit poster's website
Raphael



Joined: 17 Jan 2006
Posts: 646
Location: Germany

PostPosted: Fri Nov 17, 2006 2:47 am    Post subject: Reply with quote

Update to the document:
- added some ops C counterpart (vi2c, vqmul, ..)
- added lv/sv ops for completeness
- added clock ticks for nearly all ops (only some for .t/.p/.s versions are missing)
- moved operand prefixes up to pretty much the top

Code:

deleted

_________________
<Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki

Alexander Berl


Last edited by Raphael on Fri Nov 17, 2006 9:06 pm; edited 1 time in total
Back to top
View user's profile Send private message Visit poster's website
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Fri Nov 17, 2006 6:27 am    Post subject: Reply with quote

Code:

vpfxs [?0,?1,?2,?3]

?0, ?1, ?2 or ?3 can be :

x : vs[0]
y : vs[1]
z : vs[2]
w : vs[3]
-x : -vs[0]
-y : -vs[1]
-z : -vs[2]
-w : -vs[3]
|x| : |vs[0]| (absolute value of vs[0])
|y| : |vs[1]| (absolute value of vs[1])
|z| : |vs[2]| (absolute value of vs[2])
|w| : |vs[3]| (absolute value of vs[3])
0 : constant 0
1 : constant 1
2 : constant 2
1/2 : constant 1/2
3 : constant 3
1/3 : constant 1/3
1/4 : constant 1/4
1/6 : constant 1/6

---------------------------------
vpfxt [?0,?1,?2,?3]

same thing as vpfxs but for vt register

---------------------------------
vpfxd [?4,?5,?6,?7]

?4, ?5, ?6 and ?7 can be :

[0:1] : saturated between 0 and 1,
[-1:1] : saturated between -1 and 1,
m : ??? unknown



They are "documented" in opcodes\mips-dis.c :
Code:

static const char * const pfx_cst_names[8] = {
  "0",  "1",  "2",  "1/2",  "3",  "1/3",  "1/4",  "1/6"
};

static const char * const pfx_swz_names[4] = {
  "x",  "y",  "z",  "w"
};

static const char * const pfx_sat_names[4] = {
  "",  "[0:1]",  "",  "[-1:1]"
};

...

            case '0':
            case '1':
            case '2':
            case '3':
              {
                unsigned int pos = *d, base = '0';
                unsigned int negation = (l >> (pos - (base - VFPU_SH_PFX_NEG))) & VFPU_MASK_PFX_NEG;
                unsigned int constant = (l >> (pos - (base - VFPU_SH_PFX_CST))) & VFPU_MASK_PFX_CST;
                unsigned int abs_consthi =
                    (l >> (pos - (base - VFPU_SH_PFX_ABS_CSTHI))) & VFPU_MASK_PFX_ABS_CSTHI;
                unsigned int swz_constlo = (l >> ((pos - base) * 2)) & VFPU_MASK_PFX_SWZ_CSTLO;

                if (negation)
                  (*info->fprintf_func) (info->stream, "-");
                if (constant)
                  {
                    (*info->fprintf_func) (info->stream, "%s",
                                           pfx_cst_names[(abs_consthi << 2) | swz_constlo]);
                  }
                else
                  {
                    if (abs_consthi)
                      (*info->fprintf_func) (info->stream, "|%s|",
                                             pfx_swz_names[swz_constlo]);
                    else
                      (*info->fprintf_func) (info->stream, "%s",
                                             pfx_swz_names[swz_constlo]);
                  }
              }
              break;

            case '4':
            case '5':
            case '6':
            case '7':
              {
                unsigned int pos = *d, base = '4';
                unsigned int mask = (l >> (pos - (base - VFPU_SH_PFX_MASK))) & VFPU_MASK_PFX_MASK;
                unsigned int saturation = (l >> ((pos - base) * 2)) & VFPU_MASK_PFX_SAT;

                if (mask)
                  (*info->fprintf_func) (info->stream, "m");
                else
                  (*info->fprintf_func) (info->stream, "%s",
                                         pfx_sat_names[saturation]);
              }
              break;


Back to top
View user's profile Send private message
Raphael



Joined: 17 Jan 2006
Posts: 646
Location: Germany

PostPosted: Fri Nov 17, 2006 8:43 pm    Post subject: Reply with quote

Another update:
- added vsrt*, vsocp, vf2h/vh2f
- added prefix information from hlide's last post
- removed exec cycles from exec/latency column (better readability) and added missing latencies for .t/p/s variations
- added '?' where clock ticks information is missing

only missing ops now are vcmp versions, byte to X extensions and vflush as well as vsync.

The information should next be formatted in a better readable way into a .pdf or something.


Code:

deleted

_________________
<Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki

Alexander Berl


Last edited by Raphael on Fri Nov 17, 2006 10:57 pm; edited 1 time in total
Back to top
View user's profile Send private message Visit poster's website
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Fri Nov 17, 2006 9:41 pm    Post subject: Reply with quote

vsrt1/2/3/4.q vd, vs are very tough ones but i think i have discovered what they do :

Code:

vsrt1.q vd, vs
{
  vd[0] = min(vs[0], vs[1]);
  vd[1] = max(vs[1], vs[0]);
  vd[2] = min(vs[2], vs[3]);
  vd[3] = max(vs[3], vs[2]);
}

vsrt2.q vd, vs
{
  vd[0] = min(vs[0], vs[3]);
  vd[1] = min(vs[1], vs[2]);
  vd[2] = max(vs[2], vs[1]);
  vd[3] = max(vs[3], vs[0]);
}

vsrt3.q vd, vs
{
  vd[0] = max(vs[0], vs[1]);
  vd[1] = min(vs[1], vs[0]);
  vd[2] = max(vs[2], vs[3]);
  vd[3] = min(vs[3], vs[2]);
}

vsrt4.q vd, vs
{
  vd[0] = max(vs[0], vs[3]);
  vd[1] = max(vs[1], vs[2]);
  vd[2] = min(vs[2], vs[1]);
  vd[3] = min(vs[3], vs[0]);
}


I hope Raphael can confirm those operations.

I used 4 vectors as vs :
[1 2 3 4]
[2 3 4 1]
[3 4 1 2]
[4 1 2 3]

results for vsrt1 :
[1 2 3 4]
[2 3 4 1] => 4->1 and 1->4
[3 4 1 2]
[4 1 2 3] => 4->1 and 1->4

results for vsrt2 :
[1 2 3 4]
[1 3 4 2] => 2->1 and 1->2
[2 1 4 3] => 3->2 and 4->1 and 1->4 and 2->3
[4 1 2 3] => 4->1 and 1->4

results for vsrt3 :
[2 1 4 3] => 1->2 and 2->1 and 3->4 and 4->3
[3 2 4 1] => 2->3 and 3->2
[4 3 2 1] => 3->4 and 4->3 and 1->2 and 2->1
[4 1 3 2] => 2->3 and 3->2

results for vsrt4 :
[4 3 2 1] => 1->4 and 2->3 and 3->2 and 4->1
[2 4 3 1] => 3->4 and 4->3
[3 4 1 2]
[4 2 1 3] => 1->2 and 2->1

Due to their apparent "random" permutations, i felt min and max were probably the key to their weirdness.
Back to top
View user's profile Send private message
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Fri Nov 17, 2006 9:45 pm    Post subject: Reply with quote

oh i missed your post, Raphael ! well i can compare your additions with mine. :)
Back to top
View user's profile Send private message
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Fri Nov 17, 2006 10:00 pm    Post subject: Reply with quote

Raphael:

ok, we found the same thing for vsrt1/2/3/4, that should be okay.

I updated the text file in the first message, so i think you can erase your long text to reduce the number of pages to browse :).

By the way, groepaz plans to update his document with our findings.
Back to top
View user's profile Send private message
Raphael



Joined: 17 Jan 2006
Posts: 646
Location: Germany

PostPosted: Fri Nov 17, 2006 10:59 pm    Post subject: Reply with quote

Heh, finally, he already said he'd update it when I first posted my VFPU clock cycles :P
EDIT: I think we can leave only your min/max code for vsrt*, it's shorter and easier to read
Oh, and do you know how you can seed the random generator for VFPU?
_________________
<Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki

Alexander Berl
Back to top
View user's profile Send private message Visit poster's website
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Fri Nov 17, 2006 11:07 pm    Post subject: Reply with quote

Raphael wrote:
Heh, finally, he already said he'd update it when I first posted my VFPU clock cycles :P
EDIT: I think we can leave only your min/max code for vsrt*, it's shorter and easier to read
Oh, and do you know how you can seed the random generator for VFPU?


VFPU has control registers and some are related to the random seed i guess. They are documented in groepaz's document.

Code:

128    VFPU_PFXS    Source prefix stack
129    VFPU_PFXT    Target prefix stack
130    VFPU_PFXD    Destination prefix stack
131    VFPU_CC    Condition information
132    VFPU_INF4    VFPU internal information 4
133    VFPU_RSV5    Not used (reserved)
134    VFPU_RSV6    Not used (reserved)
135    VFPU_REV    VFPU revision information
136    VFPU_RCX0    Pseudorandom number generator information 0
137    VFPU_RCX1    Pseudorandom number generator information 1
138    VFPU_RCX2    Pseudorandom number generator information 2
139    VFPU_RCX3    Pseudorandom number generator information 3
140    VFPU_RCX4    Pseudorandom number generator information 4
141    VFPU_RCX5    Pseudorandom number generator information 5
142    VFPU_RCX6    Pseudorandom number generator information 6
143    VFPU_RCX7    Pseudorandom number generator information 7
Back to top
View user's profile Send private message
Raphael



Joined: 17 Jan 2006
Posts: 646
Location: Germany

PostPosted: Fri Nov 17, 2006 11:16 pm    Post subject: Reply with quote

Yeah, just stumbled upon those too. Hm, unfortunately I have no clue how to use them. Would be nice though, seeing how the vector random generator only takes 3 cycles to generate one random number.
_________________
<Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki

Alexander Berl
Back to top
View user's profile Send private message Visit poster's website
hlide



Joined: 10 Sep 2006
Posts: 750

PostPosted: Fri Nov 17, 2006 11:30 pm    Post subject: Reply with quote

vone.q and vzero.q take 3 cycles !?

vmov.q vd, vs[1, 1, 1, 1] and vmov.q vd, vs[0, 0, 0, 0] don't give us better cycles ? (at least 2 cycles instead of 3 ?), do they ?

random stuff, i'm trying to see how to use them.
Back to top
View user's profile Send private message
Display posts from previous:   
Post new topic   Reply to topic    forums.ps2dev.org Forum Index -> PSP Development All times are GMT + 10 Hours
Goto page 1, 2, 3  Next
Page 1 of 3

 
Jump to:  
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum


Powered by phpBB © 2001, 2005 phpBB Group