|
forums.ps2dev.org Homebrew PS2, PSP & PS3 Development Discussions
|
View previous topic :: View next topic |
Author |
Message |
hlide
Joined: 10 Sep 2006 Posts: 750
|
Posted: Tue Nov 07, 2006 9:50 pm Post subject: VFPU diggins |
|
|
This topic is where we can share our VFPU diggins. This first message should grow more and more as our VFPU diggins make progress.
Code: |
/////////////////////////////////////////////////////////////
// VFPU diggins
/////////////////
//
// Authors :
//
// hlide, Raphael
//
// 2006-11-17 01:05PM
//
/////////////////////////////////////////////////////////////
op operands ticks latency*
-----------------------------------------
mtv rt, vs.s
{
vs.s = rt; // rt is general purpose register
}
mfv rt, vs.s
{
rt = vs.s; // rt is general purpose register
}
-----------------------------------------
mtvc rt, vcr
{
vcr = rt; // vcr is cop2 control register
}
mfvc rt, vcr
{
rt = vcr; // vcr is cop2 control register
}
-----------------------------------------
vmtvc vcr, vs.s
{
vcr = vs.s;
}
vmfvc sd, cr
{
sd = cr;
}
-----------------------------------------
// rm is general purpose register containing a memory address
lv.s vd.s, offset(rm)
{
vd.s = offset(rm);
}
sv.s vd.s, offset(rm)
{
offset(rm) = vd.s;
}
// rm needs to be aligned to 16bytes (quadword)
lv.q vd, rm 1 0 (cache)
{ 68 (memory)
vd[0] = 0(rm);
vd[1] = 4(rm);
vd[2] = 8(rm);
vd[3] = 12(rm);
}
ulv.q vd, rm 2 0 (cache)
{ 68 (memory)
vd[0] = 0(rm);
vd[1] = 4(rm);
vd[2] = 8(rm);
vd[3] = 12(rm);
}
// rm needs to be aligned to 16bytes (quadword)
sv.q vd, rm 7 2 (cache)
{ 111 (memory)
0(rm) = vd[0];
4(rm) = vd[1];
8(rm) = vd[2];
12(rm) = vd[3];
}
usv.q vd, rm 14 4 (cache)
{ 111 (memory)
0(rm) = vd[0];
4(rm) = vd[1];
8(rm) = vd[2];
12(rm) = vd[3];
}
-----------------------------------------
// vector register prefixes
vpfxs [?0,?1,?2,?3]
// special prefix for vs like vs.q[X, X, Y, Y] - their values may be :
// x : vs[0]
// y : vs[1]
// z : vs[2]
// w : vs[3]
// -x : -vs[0]
// -y : -vs[1]
// -z : -vs[2]
// -w : -vs[3]
// |x| : |vs[0]| (absolute value of vs[0])
// |y| : |vs[1]| (absolute value of vs[1])
// |z| : |vs[2]| (absolute value of vs[2])
// |w| : |vs[3]| (absolute value of vs[3])
// 0 : constant 0
// 1 : constant 1
// 2 : constant 2
// 1/2 : constant 1/2
// 3 : constant 3
// 1/3 : constant 1/3
// 1/4 : constant 1/4
// 1/6 : constant 1/6
//
// so vmov.q vd, vs[z, |x|, 0, -x] :
// vd[0] = vs[2];
// vd[1] = |vs[0]|;
// vd[2] = 0;
// vd[3] = -vs[0];
vpfxt [?0,?1,?2,?3]
// special prefix for vt like vt.q[X, X, Y, Y] - their values may be :
// x : vt[0]
// y : vt[1]
// z : vt[2]
// w : vt[3]
// -x : -vt[0]
// -y : -vt[1]
// -z : -vt[2]
// -w : -vt[3]
// |x| : |vt[0]| (absolute value of vt[0])
// |y| : |vt[1]| (absolute value of vt[1])
// |z| : |vt[2]| (absolute value of vt[2])
// |w| : |vt[3]| (absolute value of vt[3])
// 0 : constant 0
// 1 : constant 1
// 2 : constant 2
// 1/2 : constant 1/2
// 3 : constant 3
// 1/3 : constant 1/3
// 1/4 : constant 1/4
// 1/6 : constant 1/6
//
vpfxd [?4,?5,?6,?7]
// special prefix for vd like vd.q[0:1, 0:1, 0:1, 0:1] - their values may be :
// 0:1 : min(1, max(0, vd[i]))
// -1:1 : min(1, max(-1, vd[i]))
// m : ???
//
// so vmov.p vd[0:1, -1:1], vs :
// vd[0] = min(1, max(0, vs[0]));
// vd[1] = min(1, max(-1, vs[1]));
-----------------------------------------
vadd.q/t/p/s vd, vs, vt 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = vs[i] + vt[i];
}
vsub.q/t/p/s vd, vs, vt 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = vs[i] - vt[i];
}
-----------------------------------------
vdiv.q/t/p/s vd, vs, vt 56/42/28/14 30/?/?/?
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = vs[i] / vt[i];
}
vmul.q/t/p/s vd, vs, vt 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = vs[i] * vt[i];
}
-----------------------------------------
vdot.q/t/p/s sd.s, vs, vt 1 0
{
sd.s = 0;
for (i = 0; i < |q/t/p/s|; ++i)
sd.s += vs[i] * vt[i];
}
-----------------------------------------
vscl.q/t/p/s vd, vs, vt.s 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = vs[i] * vt.s;
}
-----------------------------------------
// Homogeneous dot product
vhdp.q/t/p/s vd.s, vs, vt (UNSURE) 1 0
{
vd.s = vt[|q/t/p|-1];
for (i = 0; i < |q/t/p|-1; ++i)
vd.s += vs[i] * vt[i];
}
-----------------------------------------
vcmp.q/t/p/s f2, vs, vt 1 0
{
for (i = 0; i < 5; ++i)
VFPU_CC[i] = 0;
VFPU_CC[5] = 1;
for (i = 0; i < |q/t/p|; ++i)
VFPU_CC[i] = bcmp(f2, vs[i], vt[i]); // f2 = EQ/NE/LE/LT/GE/GT
for (i = 0; i < |q/t/p|; ++i)
{
VFPU_CC[4] ||= VFPU_CC[i];
VFPU_CC[5] &&= VFPU_CC[i];
}
}
vcmp.q/t/p/s f1, vs 1 0
{
for (i = 0; i < 5; ++i)
VFPU_CC[i] = 0;
VFPU_CC[5] = 1;
for (i = 0; i < |q/t/p|; ++i)
VFPU_CC[i] = ucmp(f1, vs[i]); // f1 = EN/EI/EZ/ES/NN/NI/NZ/NS
for (i = 0; i < |q/t/p|; ++i)
{
VFPU_CC[4] ||= VFPU_CC[i];
VFPU_CC[5] &&= VFPU_CC[i];
}
}
vcmp.q/t/p/s f0
{
for (i = 0; i < 5; ++i)
VFPU_CC[i] = 0;
VFPU_CC[5] = 1;
for (i = 0; i < |q/t/p|; ++i)
VFPU_CC[i] = f0; // f0 = TR/FL
for (i = 0; i < |q/t/p|; ++i)
{
VFPU_CC[4] ||= VFPU_CC[i];
VFPU_CC[5] &&= VFPU_CC[i];
}
}
-----------------------------------------
vmin.q/t/p/s vd, vs, vt 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = min(vs[i], vt[i]);
}
vmax.q/t/p/s vd, vs, vt 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = max(vs[i], vt[i]);
}
-----------------------------------------
vsgn.q/t/p/s vd, vs 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = (vs[i] < 0.0) ? -1.0 : (vs[i] > 0.0) ? 1.0 : 0.0;
}
-----------------------------------------
vcst.q/t/p/s vd, VFPU_SPC_CST 1 0
{
// VFPU_HUGE = Inf
// VFPU_SQRT2 = SQRT(2)
// VFPU_SQRT1_2 = SQRT(1/2)
// VFPU_2_SQRTPI = 2/SQRT(PI)
// VFPU_2_PI = 2/PI
// VFPU_1_PI = 1/PI
// VFPU_PI_4 = PI/4
// VFPU_PI_2 = PI/2
// VFPU_PI = PI
// VFPU_E = e
// VFPU_LOG2E = log2(e)
// VFPU_LOG10E = log10(e)
// VFPU_LN2 = ln(2)
// VFPU_LN10 = ln(10)
// VFPU_2PI = 2*PI
// VFPU_PI_6 = PI/6
// VFPU_LOG10TWO = log10(2)
// VFPU_LOG2TEN = log2(10)
// VFPU_SQRT3_2 = sqrt(3)/2
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = vfpu_special_constant[VFPU_SPC_CST];
}
-----------------------------------------
vscmp.q/t/p/s vd, vs, vt 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = (vs[i] < vt[i]) ? -1.0 : (vs[i] > vt[i]) ? 1.0 : 0.0;
}
vsge.q/t/p/s vd, vs, vt 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = (vs[i] >= vt[i]) ? 1.0 : 0.0;
}
vslt.q/t/p/s vd, vs, vt 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = (vs[i] < vt[i]) ? 1.0 : 0.0;
}
-----------------------------------------
vi2uc.q vd.s, vs.q 1 0
{
vd.s[0]( 0.. 7) = vs.q[0] & 0xFF;
vd.s[0]( 8..15) = vs.q[1] & 0xFF;
vd.s[0](16..23) = vs.q[2] & 0xFF;
vd.s[0](24..31) = vs.q[3] & 0xFF;
}
vi2c.q vd.s, vs.q 1 0
{
vd.s[0]( 0.. 7) = (vs.q[0] & 0x7F) | ((vs.q[0] & 0x80000000) >> 24);
vd.s[0]( 8..15) = (vs.q[1] & 0x7F) | ((vs.q[1] & 0x80000000) >> 24);
vd.s[0](16..23) = (vs.q[2] & 0x7F) | ((vs.q[2] & 0x80000000) >> 24);
vd.s[0](24..31) = (vs.q[3] & 0x7F) | ((vs.q[3] & 0x80000000) >> 24);
}
-----------------------------------------
vmov.q/t/p/s vd, vs 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = vs[i];
}
-----------------------------------------
vabs.q/t/p/s vd, vs 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = |vs[i]|;
}
-----------------------------------------
vneg.q/t/p/s vd, vs 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = -vs[i];
}
-----------------------------------------
vsat0.q/t/p/s vd, vs 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = max(0.0, min(vs[i], 1.0));
}
vsat1.q/t/p/s vd, vs 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = max(-1.0, min(vs[i], 1.0));
}
-----------------------------------------
vzero.q/t/p/s vd 3/?/?/? 2
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = 0.0;
}
vone.q/t/p/s vd 3/?/?/? 2
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = 1.0;
}
vidt.q/t/p/s vd 3/?/?/? 2
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = (vd[i].column == vd[i].row) ? 1.0 : 0.0;
}
-----------------------------------------
vrcp.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = 1.0 / vs[i];
}
vrsq.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = 1.0 / sqrt(vs[i]);
}
-----------------------------------------
vsin.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = sin(vs[i]*PI/2);
}
vcos.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = cos(vs[i]*PI/2);
}
vasin.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = asin(vs[i]) * 2/PI; // not sure about this conversion
}
-----------------------------------------
vexp2.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = exp2(vs[i]);
}
vlog2.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = log2(vs[i]);
}
-----------------------------------------
vsqrt.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = sqrt(vs[i]);
}
-----------------------------------------
vrnds.s vs ? ?
{
random_seed(vs);
}
-----------------------------------------
vrndi.q/t/p/s vd 12/9/6/3 10/7/4/1
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = rand_integer(-1<<31, 1<<31); // -1<<31 <= vd[i] < 1<<31
}
-----------------------------------------
vrndf1.q/t/p/s vd 12/9/6/3 10/7/4/1
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = rand_float(0.0, 2.0); // 0.0 <= vd[i] < 2.0
}
-----------------------------------------
vrndf2.q/t/p/s vd 12/9/6/3 10/7/4/1
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = rand_float(0.0, 4.0); // 0.0 <= vd[i] < 4.0
}
-----------------------------------------
// Nvidia Half format [S:1][E:5][M:10]
vf2h.p/q vd, vs (UNSURE) 1 0
{
for (i = 0; i < |q/p|/2; ++i)
vd[i]( 0..15) = ((vs[i*2] >> 16) & 0x8000) | ((vs[i*2] >> 13) & 0x03FF);
e = ((vs[i*2] >> 23) & 0xFF) - 0x70;
if (e < 0)
e = 0;
if (e > 31)
e = 31;
vd[i] &= ~0x03FF; // -> make too huge numbers infinity
if ((vs[i*2] & 0x7FFFFF != 0) && ((vs[i*2] >> 23) & 0xFF == 0xFF))
vd[i] |= 0x03FF; // -> But NaNs stay NaNs even with mantissa loss
vd[i] |= (e << 10);
vd[i](16..31) = ((vs[i*2+1] >> 16) & 0x8000) | ((vs[i*2+1] >> 13) & 0x03FF);
e = ((vs[i*2+1] >> 23) & 0xFF) - 0x70;
if (e < 0)
e = 0;
if (e > 31)
e = 31;
vd[i] &= ~0x03FF0000; // -> make too huge numbers infinity
if ((vs[i*2+1] & 0x7FFFFF != 0) && ((vs[i*2+1] >> 23) & 0xFF == 0xFF))
vd[i] |= 0x03FF0000; // -> But NaNs stay NaNs even with mantissa loss
vd[i] |= (e << 26);
}
-----------------------------------------
vsrt1.q vd, vs 1 0
{
vd[0] = min(vs[0], vs[1]);
vd[1] = max(vs[1], vs[0]);
vd[2] = min(vs[2], vs[3]);
vd[3] = max(vs[3], vs[2]);
}
vsrt2.q vd, vs 1 0
{
vd[0] = min(vs[0], vs[3]);
vd[1] = max(vs[1], vs[2]);
vd[2] = min(vs[2], vs[1]);
vd[3] = max(vs[3], vs[0]);
}
vsrt3.q vd, vs 1 0
{
vd[0] = max(vs[0], vs[1]);
vd[1] = min(vs[1], vs[0]);
vd[2] = max(vs[2], vs[3]);
vd[3] = min(vs[3], vs[2]);
}
vsrt4.q vd, vs 1 0
{
vd[0] = max(vs[0], vs[3]);
vd[1] = max(vs[1], vs[2]);
vd[2] = min(vs[2], vs[1]);
vd[3] = min(vs[3], vs[0]);
}
-----------------------------------------
vbfy1.q/p vd, vs 1 0
{
for (i = 0; i < |q/p|; i += 2)
vd[i+0] = vs[i+0] + vs[i+1];
vd[i+1] = vs[i+0] - vs[i+1];
}
vbfy2.q vd, vs 1 0
{
vd[0] = vs[0] + vs[2];
vd[1] = vs[1] + vs[3];
vd[2] = vs[0] - vs[2];
vd[3] = vs[1] - vs[3];
}
-----------------------------------------
vocp.q/t/p/s vd, vs 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = 1.0 - vs[i];
}
-----------------------------------------
// Funnel add components
vfad.q/t/p/s vd.s, vs 1 0
{
vd.s = 0;
for (i = 0; i < |q/t/p/s|; ++i)
vd.s += vs[i];
}
-----------------------------------------
// Average of components
vavg.q/t/p/s vd.s, vs 1 0
{
vd.s = 0.0
for (i = 0; i < |q/t/p/s|; ++i)
vd.s += vs[i];
vd.s /= |q/t/p/s|;
}
-----------------------------------------
// Round
vf2in.q/t/p/s vd, vs, imm 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = ROUND(vs[i]) << imm;
}
-----------------------------------------
// Trunc
vf2iz.q/t/p/s vd, vs, imm 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = TRUNC(vs[i]) << imm;
}
-----------------------------------------
// Floor
vf2iu.q/t/p/s vd, vs, imm 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = FLOOR(vs[i]) << imm;
}
-----------------------------------------
// Ceil
vf2id.q/t/p/s vd, vs, imm 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = CEIL(vs[i]) << imm;
}
-----------------------------------------
// (float)
vi2f.q/t/p/s vd, vs, imm 1 0
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = (float)(vs[i]) / (float)(1<<imm);
}
-----------------------------------------
// Conditional move vector on true
vcmovt.q/t/p/s vd, vs, cc (UNSURE) 5 4
{
switch (cc)
{
case 0...5 :
if (CC[cc] == TRUE)
vd = vs;
case 6:
for (i = 0; i < |q/t/p/s|; ++i)
if (CC[i] == TRUE)
vd[i] = vs[i]
}
}
// Conditional move vector on false
vcmovf.q/t/p/s vd, vs, cc (UNSURE) 5 4
{
switch (cc)
{
case 0...5 :
if (CC[cc] == FALSE)
vd = vs;
case 6:
for (i = 0; i < |q/t/p/s|; ++i)
if (CC[i] == FALSE)
vd[i] = vs[i]
}
}
-----------------------------------------
// Matrix multiplication
vmmul.q/t/p md, ms, mt 16/8/4 15/7/3
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
md[i][j] = 0;
for (k = 0; k < |q/t/p|; ++k)
md[i][j] += ms[i][k] * mt[k][j];
}
-----------------------------------------
// Matrix-vector transform
vtfm4.q/3.t/2.p vd, md, vt 4/3/2 3/2/1
{
for (i = 0; i < |q/t/p|; ++i)
vd[i] = 0;
for (j = 0; j < |q/t/p|; ++j)
vd[i] += md[i][j] * vt[j];
}
-----------------------------------------
// Homogeneous transform
vhtfm4.q/3.t/2.p vd, md, vt 4/3/2 3/2/1
{
for (i = 0; i < |q/t/p|; ++i)
vd[i] = 0;
for (j = 0; j < |q/t/p|; ++j)
vd[i] += md[i][j] * vt[j];
for (i = 0; i < |q/t/p|; ++i)
vd[i] /= vd[|q/t/p|];
}
-----------------------------------------
// Matrix scale
vmscl.q/t/p md, ms, vt.s 4/3/2 3/2/1
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
md[i][j] = ms[i][j] * vt.s;
}
-----------------------------------------
// Quaternion multiply
vqmul.q vd, vs, vt 4 3
{
vd[0] = vs[3] * vt[0] + vs[0] * vt[3] + vs[1] * vt[2] - vs[2] * vt[1];
vd[1] = vs[3] * vt[1] + vs[1] * vt[3] + vs[2] * vt[0] - vs[0] * vt[2];
vd[2] = vs[3] * vt[2] + vs[2] * vt[3] + vs[0] * vt[1] - vs[1] * vt[0];
vd[3] = vs[3] * vt[3] - vs[0] * vt[0] - vs[1] * vt[1] - vs[2] * vt[2];
}
-----------------------------------------
// Matrix move
vmmov.q/t/p md, ms 4/3/2 3/2/1
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
md[i][j] = ms[i][j];
}
-----------------------------------------
// Matrix Identity
vmidt.q/t/p md 6/5/4 5/4/3
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
md[i][j] = (i == j) ? 1.0 : 0.0;
}
-----------------------------------------
// Matrix-zero
vmzero.q/t/p md 6/5/4 5/4/3
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
md[i][j] = 0.0;
}
-----------------------------------------
// Matrix-one
vmone.q/t/p md 6/5/4 5/4/3
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
md[i][j] = 1.0;
}
-----------------------------------------
// Rotation vector
vrot.q/t/p vd, vs.s, [+c/-c/-s/+s/0,...] 2 1
{
for (i = 0; i < |q/t/p|; ++i)
vd[i] = (+1.0 | -1.0) * (cos | sin)(vs.s*PI/2.0) | 0;
}
-----------------------------------------
vt4444.q vd, vs 1 0
{
vd[0]( 0..15) = ((vs[0] & 0xF0000000) >> 16) | ((vs[0] & 0xF00000) >> 12) | ((vs[0] & 0xF000) >> 8) | ((vs[0] & 0xF0) >> 4);
vd[0](16..31) = ((vs[1] & 0xF0000000) >> 16) | ((vs[1] & 0xF00000) >> 12) | ((vs[1] & 0xF000) >> 8) | ((vs[1] & 0xF0) >> 4);
vd[1]( 0..15) = ((vs[2] & 0xF0000000) >> 16) | ((vs[2] & 0xF00000) >> 12) | ((vs[2] & 0xF000) >> 8) | ((vs[2] & 0xF0) >> 4);
vd[1](16..31) = ((vs[3] & 0xF0000000) >> 16) | ((vs[3] & 0xF00000) >> 12) | ((vs[3] & 0xF000) >> 8) | ((vs[3] & 0xF0) >> 4);
}
-----------------------------------------
vt5551.q vd, vs 1 0
{
vd[0]( 0..15) = ((vs[0] & 0x80000000) >> 16) | ((vs[0] & 0xF80000) >> 9) | ((vs[0] & 0xF800) >> 6) | ((vs[0] & 0xF8) >> 3);
vd[0](16..31) = ((vs[1] & 0x80000000) >> 16) | ((vs[1] & 0xF80000) >> 9) | ((vs[1] & 0xF800) >> 6) | ((vs[1] & 0xF8) >> 3);
vd[1]( 0..15) = ((vs[2] & 0x80000000) >> 16) | ((vs[2] & 0xF80000) >> 9) | ((vs[2] & 0xF800) >> 6) | ((vs[2] & 0xF8) >> 3);
vd[1](16..31) = ((vs[3] & 0x80000000) >> 16) | ((vs[3] & 0xF80000) >> 9) | ((vs[3] & 0xF800) >> 6) | ((vs[3] & 0xF8) >> 3);
}
-----------------------------------------
vt5650.q vd, vs 1 0
{
vd[0]( 0..15) = ((vs[0] & 0xF80000) >> 8) | ((vs[0] & 0xFC00) >> 5) | ((vs[0] & 0xF8) >> 3);
vd[0](16..31) = ((vs[1] & 0xF80000) >> 8) | ((vs[1] & 0xFC00) >> 5) | ((vs[1] & 0xF8) >> 3);
vd[1]( 0..15) = ((vs[2] & 0xF80000) >> 8) | ((vs[2] & 0xFC00) >> 5) | ((vs[2] & 0xF8) >> 3);
vd[1](16..31) = ((vs[3] & 0xF80000) >> 8) | ((vs[3] & 0xFC00) >> 5) | ((vs[3] & 0xF8) >> 3);
}
-----------------------------------------
vcrs.t vd, vs, vt 1 0
{
vd[0] = vs[1] * vt[2];
vd[1] = vs[2] * vt[0];
vd[2] = vs[0] * vt[1];
}
-----------------------------------------
// Negative reciprocal
vnrcp.q/t/p/s vd, vs (UNSURE) 4/?/?/? 3
{
for (i = 0; i < |q/t/p|; ++i)
vd[i] = -1.0 / vs[i];
}
-----------------------------------------
// Negative sine
vnsin.q/t/p/s vd, vs (UNSURE) 4/?/?/? 3
{
for (i = 0; i < |q/t/p|; ++i)
vd[i] = -sin(vs[i]*PI/2);
}
-----------------------------------------
// Reciprocal exponent to base 2
vrexp2.q/t/p/s vd, vs 4/?/?/? 3
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = 1.0 / exp2(vs[i]);
}
-----------------------------------------
// Vector cross-product
vcrsp.t vd, vs, vt 3 2
{
vd[0] = vs[1]*vt[2] - vs[2]*vt[1];
vd[1] = vs[2]*vt[0] - vs[0]*vt[2];
vd[2] = vs[0]*vt[1] - vs[1]*vt[0];
}
-----------------------------------------
// Vector determinant
vdet.p vd.s, vs, vt 1 0
{
vd.s = vs[0] * vt[1] - vs[1] * vt[0];
}
-----------------------------------------
v(u)s2i.s vd.p, vs.s 1 0
{
vd.p[0] = (vs.s[0](16..31)) << 16;
vd.p[1] = (vs.s[0]( 0..15)) << 16;
}
v(u)s2i.p vd.q, vs.p 1 0
{
vd.q[0] = (vs.p[0](16..31)) << 16;
vd.q[1] = (vs.p[0]( 0..15)) << 16;
vd.q[2] = (vs.p[1](16..31)) << 16;
vd.q[3] = (vs.p[1]( 0..15)) << 16;
}
-----------------------------------------
vi2(u)s.s vd.s, vs.p 1 0
{
vd.s[0](16..31) = vs.p[0] >> 16;
vd.s[0]( 0..15) = vs.p[1] >> 16;
}
vi2(u)s.p vd.p, vs.q 1 0
{
vd.p[0](16..31) = vs.q[0] >> 16;
vd.p[0]( 0..15) = vs.q[1] >> 16;
vd.p[1](16..31) = vs.q[2] >> 16;
vd.p[1]( 0..15) = vs.q[3] >> 16;
}
-----------------------------------------
// Nvidia Half format [S:1][E:5][M:10]
vh2f.p vd, vs 1 0
{
vd[0] = ((vs[0] & 0x8000) << 16) | ((((vs[0] >> 10) & 0x1F) + 0x70) << 23) | ((vs[0] & 0x03FF) << 13);
vd[1] = (vs[0] & 0x80000000) | ((((vs[0] >> 10) & 0x1F0000) + 0x700000) << 7) | ((vs[0] & 0x03FF0000) >> 3);
vd[2] = ((vs[1] & 0x8000) << 16) | ((((vs[1] >> 10) & 0x1F) + 0x70) << 23) | ((vs[1] & 0x03FF) << 13);
vd[3] = (vs[1] & 0x80000000) | ((((vs[1] >> 10) & 0x1F0000) + 0x700000) << 7) | ((vs[1] & 0x03FF0000) >> 3);
}
-----------------------------------------
vsocp.p/s vd.q/p, vs.p/s 1 0
{
for (i = 0; i < |p/s|; ++i)
vd[i*2+0] = 1.0 - vs[i];
vd[i*2+1] = vs[i];
}
-----------------------------------------
vsbz.s vd.s, vs.s 1 0
{
// TODO Byte To Short Extension ?
}
vsbn.s vd.s, vs.s, vt.s 1 0
{
// TODO Byte to Short Extension ?
}
vlgb.s vd.s, vs.s 1 0
{
// TODO
}
vwbn.s vd.s, vs.s, imm 1 0
{
// TODO Byte to Word Extension ?
}
-----------------------------------------
viim.s vd.s, constant integer 1 0
{
vd.s = constant integer (between -32768 and 32767 ?);
}
vfim.s vd.s, constant real 1 0
{
vd.s = constant real;
}
-----------------------------------------
vnop 1 0
{
// do nothing except eating 1 cycle
}
-----------------------------------------
vflush 5 4
{
// TODO
}
vsync 4 3
{
// TODO
}
vsync i 1 0
{
// TODO
}
NOTES:
(UNSURE) beside an op means the given C counterpart is questionable
Clock ticks are benched estimates, but should be accurate.
*The latency column is to be understood like this:
the exec cost is the (clock) ticks minus the latency and is an unavoidable cost, while the latency is the 'playroom' to interleave
the code with other (independent) ops without additional costs.
Unfortunately, this does not seem to work with VFPU ops - so either the VFPU isn't pipelined or most ops with latency
just use the whole pipeline already. It works however with normal mips code (that's how it was benched). This code
interleaving is recommended especially with matrix and other costly ops.
|
Last edited by hlide on Mon Jun 23, 2008 8:10 pm; edited 17 times in total |
|
Back to top |
|
|
Raphael
Joined: 17 Jan 2006 Posts: 646 Location: Germany
|
Posted: Wed Nov 08, 2006 5:53 am Post subject: Re: VFPU diggins |
|
|
hlide wrote: |
NOTE: in fact i was first puzzled by the <<16 operation but now i find it logical in so far as it simplifies the operation (no need to extend sign this way for vfpu logic circuits).
If you need then to convert them in floats, just do "vi2f vd, vs, 16".
|
Yep. Had the same problem when I tried converting short arrays to float arrays for VFPU processing in libavcodec. The same goes for the reverse way, ie first do "vf2i vd, vs, 16" and then "vi2(u)s vd, vs".
I'd suggest designing your notation to differentiate between single and quad registers, as sometimes they are combined in operations and it's not immediately clear which operand has which format. Something like vqs/vqd for quad registers and vss/vsd for single registers, or similar.
Here's some of my findings:
Code: |
vocp.s vsd, vss
{
vsd = 1.0 - vss
}
vrsq.s vsd, vss
{
vsd = 1.0 / sqrt(vss)
}
vsat0.q/t/p/s vqd, vqs
{
(i=0..3)
vqd[i] = (vqs[i] < 0) ? 0 : ((vqs[i] > 1.0) ? 1.0 : vqs[i])
}
|
Apart from that, the vscl operation can also saturate using the destination register extension with brackets:
Code: |
vscl.q/t/p/s vqd[L1:T1, L2:T2, L3:T3, L4:T4], vqs, vst
{
(i=0..3)
vqd[i] = CLAMP(vqs[i] * vst, Li, Ti)
}
|
So you can clamp to range -1:1 for example (useful for normalizations), or any other constants that can be used in those fields.
Quote: |
by the way, psp-documentation from hitmen seems to be in standby :/ |
Unfortunately, yes :( _________________ <Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki
Alexander Berl |
|
Back to top |
|
|
hlide
Joined: 10 Sep 2006 Posts: 750
|
Posted: Wed Nov 08, 2006 9:48 am Post subject: |
|
|
added nearly all the instructions but a lot to be done too :/ |
|
Back to top |
|
|
dot_blank
Joined: 28 Sep 2005 Posts: 498 Location: Brasil
|
Posted: Wed Nov 08, 2006 11:03 am Post subject: |
|
|
i am finally glad somebody took it up themselves
to start something like this ....cheers hlide and raphael _________________ 10011011 00101010 11010111 10001001 10111010 |
|
Back to top |
|
|
Raphael
Joined: 17 Jan 2006 Posts: 646 Location: Germany
|
Posted: Wed Nov 08, 2006 1:31 pm Post subject: |
|
|
Some things from the list I can complete/confirm:
Code: |
// homogenous dot product
vhdp.q/t/p/s sd.s, vs, vt (UNSURE)
{
sd.s = vt.s;
for (i = 1; i < |q/t/p|; ++i)
sd.s += vs[i] * vt[i];
}
-----------------------------------------
// Funnel add components
vfad.q/t/p/s sd.s, vs
{
sd.s = 0;
for (i = 0; i < |q/t/p/s|; ++i)
sd.s += vs[i];
}
-----------------------------------------
// Average of components
vavg.q/t/p/s sd.s, vs
{
sd.s = 0.0
for (i = 0; i < |q/t/p/s|; ++i)
sd.s += vs[i];
sd.s /= |q/t/p/s|;
}
-----------------------------------------
// Round
vf2in.q/t/p/s vd, sd, imm
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = ROUND(vs[i]) << imm;
}
-----------------------------------------
// Trunc
vf2iz.q/t/p/s vd, sd, imm
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = TRUNC(vs[i]) << imm;
}
-----------------------------------------
// Floor
vf2iu.q/t/p/s vd, sd, imm
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = FLOOR(vs[i]) << imm;
}
-----------------------------------------
// Ceil
vf2id.q/t/p/s vd, sd, imm
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = CEIL(vs[i]) << imm;
}
-----------------------------------------
vi2f.q/t/p/s vd, sd, imm
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = (float)(vs[i] >> imm);
}
-----------------------------------------
vcmov.q/t/p/s vd, sd, cc (UNSURE)
{
if (CC[cc])
vd = sd;
}
vcmovt.q/t/p/s vd, sd, cc (UNSURE)
{
if (CC[cc] == TRUE)
vd = sd;
}
vcmovf.q/t/p/s vd, sd, cc (UNSURE)
{
if (CC[cc] == FALSE)
vd = sd;
}
-----------------------------------------
// matrix multiplication
vmmul.q/t/p md, ms, mt
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
md[i][j] = 0;
for (k = 0; k < |q/t/p|; ++k)
md[i][j] += ms[i][k] * mt[k][j];
}
-----------------------------------------
// Matrix-vector transform
vtfm4.q/3.t/2.p vd, md, vt
{
for (i = 0; i < |q/t/p|; ++i)
vd[i] = 0;
for (j = 0; j < |q/t/p|; ++j)
vd[i] += md[i][j] * vt[j];
}
-----------------------------------------
// homogenous transform
vhtfm4.q/3.t/2.p/1.s vd, md, vt (UNSURE esp 1.s case?)
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = 0;
for (j = 0; j < |q/t/p/s|; ++j)
vd[i] += md[i][j] * vt[j];
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] /= vd[|q/t/p/s|];
}
-----------------------------------------
// matrix scale
vmscl.q/t/p md, ms, st
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
md[i][j] = ms[i][j] * st;
}
-----------------------------------------
vmmov.q/t/p md, ms
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
md[i][j] = ms[i][j];
}
-----------------------------------------
vmidt.q/t/p md
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
md[i][j] = (i == j) ? 1.0 : 0.0;
}
-----------------------------------------
vmzero.q/t/p md
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
md[i][j] = 0.0;
}
-----------------------------------------
vmone.q/t/p md
{
for (i = 0; i < |q/t/p|; ++i)
for (j = 0; j < |q/t/p|; ++j)
md[i][j] = 1.0;
}
-----------------------------------------
vrot.q/t/p vd, ss, [+c/-c/-s/+s/0,...]
{
for (i = 0; i < |q/t/p|; ++i)
vd[i] = +/- cos/sin(ss) | 0;
}
-----------------------------------------
vnrcp.q/t/p/s vd, vs (UNSURE)
{
for (i = 0; i < |q/t/p|; ++i)
vd[i] = -1.0 / vs[i];
}
-----------------------------------------
vnsin.q/t/p/s vd, vs (UNSURE)
{
for (i = 0; i < |q/t/p|; ++i)
vd[i] = -sin(vs[i]*PI/2);
}
-----------------------------------------
vrexp2.q/t/p/s vd, vs
{
for (i = 0; i < |q/t/p/s|; ++i)
vd[i] = 1.0 / exp2(vs[i]);
}
-----------------------------------------
vcrsp.t vd, vs, vt
{
vd[0] = vs[1]*vt[2] - vs[2]*vt[1];
vd[1] = vs[2]*vt[0] - vs[0]*vt[2];
vd[2] = vs[0]*vt[1] - vs[1]*vt[0];
}
-----------------------------------------
|
I'd also suppose that the half format is [1:5:10]; the conversion steps still have to be worked out, but they should be straightforward. No shift arguments there ;)
I wanted to do something like this for some time now, but always was too lazy to begin writing down everything :) I need to slap myself that hlide had to appear before I did something
I wonder what that vcrs.t does, as there already is the cross product. Also vdet.p, though that could possibly just be a simple (vs[0]*vt[1] - vs[1]*vt[0]). Are there definitely no .t/q versions? Gonna play around with that when I find time and I'll then add some more things _________________ <Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki
Alexander Berl |
|
Back to top |
|
|
hlide
Joined: 10 Sep 2006 Posts: 750
|
Posted: Wed Nov 08, 2006 6:02 pm Post subject: |
|
|
nice catch for vfad, i was clueless.
opc-mips.c :
there is only one vcrs.t and vdet.p. if vdet.t exists, its opcode would probably be something like 0x67808000 + vd.t + (vs.t << 8). But my opinion is that the computation of a determinant for 3d vector being different than a 2d vector may explain this :
det([a]) = a
det([[a b][c d]]) = ad - bc.
det([[a b c][d e f][g h i]]) = aei + dhc + gbf - ceg - fha - ibd.
I would investigate vcrs.t as soon as I can.
I will add your diggins as soon as possible.
N.B.: is the word "diggins" correct or is this a pure invention of mine? I fail to find a French translation for this word. |
|
Back to top |
|
|
hlide
Joined: 10 Sep 2006 Posts: 750
|
Posted: Wed Nov 08, 2006 9:59 pm Post subject: |
|
|
Raphael wrote: | Some things from the list I can complete/confirm:
Code: |
// homogenous dot product
vhdp.q/t/p/s sd.s, vs, vt (UNSURE)
{
sd.s = vt.s;
for (i = 1; i < |q/t/p|; ++i)
sd.s += vs[i] * vt[i];
} |
|
vhdp.q ==> return Xs + Ys*Yt + Zs*Zt + Ws*Wt ? |
|
Back to top |
|
|
Raphael
Joined: 17 Jan 2006 Posts: 646 Location: Germany
|
Posted: Wed Nov 08, 2006 10:09 pm Post subject: |
|
|
hlide wrote: | Raphael wrote: | Some things from the list I can complete/confirm:
Code: |
// homogenous dot product
vhdp.q/t/p/s sd.s, vs, vt (UNSURE)
{
sd.s = vt.s;
for (i = 1; i < |q/t/p|; ++i)
sd.s += vs[i] * vt[i];
} |
|
vhdp.q ==> return Xs + Ys*Yt + Zs*Zt + Ws*Wt ? |
Oh, no, actually it should be Xs*Xt + Ys*Yt + Zs*Zt + Wt :D But still not sure if that is correct _________________ <Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki
Alexander Berl |
|
Back to top |
|
|
hlide
Joined: 10 Sep 2006 Posts: 750
|
Posted: Wed Nov 08, 2006 11:30 pm Post subject: |
|
|
i'm digging vcrs.t :
Code: |
vcrs.t [1 0 0],[1 0 0] => [0 0 0]
vcrs.t [1 0 0],[0 1 0] => [0 0 1] => vd[2] = vs[0] x vt[1] ?
vcrs.t [1 0 0],[0 0 1] => [0 0 0]
vcrs.t [0 1 0],[1 0 0] => [0 0 0]
vcrs.t [0 1 0],[0 1 0] => [0 0 0]
vcrs.t [0 1 0],[0 0 1] => [1 0 0] => vd[0] = vs[1] x vt[2] ?
vcrs.t [0 0 1],[1 0 0] => [0 1 0] => vd[1] = vs[2] x vt[0] ?
vcrs.t [0 0 1],[0 1 0] => [0 0 0]
vcrs.t [0 0 1],[0 0 1] => [0 0 0]
vcrs.t [1 2 0],[1 2 0] => [0 0 2] => [ 0, 0, vs[0] x vt[1] ] !
vcrs.t [1 2 0],[0 1 2] => [4 0 1] => [ vs[1] x vt[2], 0, vs[0] x vt[1] ] !
vcrs.t [1 2 0],[2 0 1] => [2 0 0] => [ vs[1] x vt[2], 0, 0 ] !
vcrs.t [0 1 2],[1 2 0] => [0 2 0] => [ 0, vs[2] x vt[0], 0 ] !
vcrs.t [0 1 2],[0 1 2] => [2 0 0] => [ vs[1] x vt[2], 0, 0 ] !
vcrs.t [0 1 2],[2 0 1] => [1 4 0] => [ vs[1] x vt[2], vs[2] x vt[0], 0 ] !
vcrs.t [2 0 1],[1 2 0] => [0 1 4] => [ 0, vs[2] x vt[0], vs[0] x vt[1] ] !
vcrs.t [2 0 1],[0 1 2] => [0 0 2] => [ 0, 0, vs[0] x vt[1] ] !
vcrs.t [2 0 1],[2 0 1] => [0 2 0] => [ 0, vs[2] x vt[0], 0 ] !
|
it looks like :
Code: |
vcrs.t vd, vs, vt
{
vd[0] = vs[1] x vt[2];
vd[1] = vs[2] x vt[0];
vd[2] = vs[0] x vt[1];
}
|
|
|
Back to top |
|
|
hlide
Joined: 10 Sep 2006 Posts: 750
|
Posted: Wed Nov 08, 2006 11:54 pm Post subject: |
|
|
i'm digging vdet.p as we suppose it does : vd.s = vs[0] x vt[1] - vs[1] x vt[0].
some tests just to check :
Code: |
vdet.p [1 0],[1 0] => 0
vdet.p [1 0],[0 1] => 1 => vs[0] x vt[1]
vdet.p [1 0],[1 1] => 1 => vs[0] x vt[1]
vdet.p [0 1],[1 0] => -1 => -(vs[1] x vt[0])
vdet.p [0 1],[0 1] => 0
vdet.p [0 1],[1 1] => -1 => -(vs[1] x vt[0])
vdet.p [1 1],[1 0] => -1 => -(vs[1] x vt[0])
vdet.p [1 1],[0 1] => 1 => vs[0] x vt[1]
vdet.p [1 1],[1 1] => 0 => vs[0] x vt[1] - vs[1] x vt[0] = 0
|
Code: |
vdet.p vd.s, vs, vt
{
vd.s = vs[0] * vt[1] - vs[1] * vt[0];
}
|
Last edited by hlide on Thu Nov 09, 2006 2:34 am; edited 1 time in total |
|
Back to top |
|
|
hlide
Joined: 10 Sep 2006 Posts: 750
|
Posted: Wed Nov 08, 2006 11:57 pm Post subject: |
|
|
LIST UPDATED ! |
|
Back to top |
|
|
hlide
Joined: 10 Sep 2006 Posts: 750
|
Posted: Thu Nov 09, 2006 2:28 am Post subject: |
|
|
hlide wrote: | if vdet.t exists, its opcode would probably be something like 0x67808000 + vd.t + (vs.t << 8). |
I tried this one -> crash. So only vdet.p seems to exist. |
|
Back to top |
|
|
Raphael
Joined: 17 Jan 2006 Posts: 646 Location: Germany
|
Posted: Thu Nov 09, 2006 4:57 am Post subject: |
|
|
hlide wrote: | i'm digging vcrs.t :
Code: |
vcrs.t [1 0 0],[1 0 0] => [0 0 0]
vcrs.t [1 0 0],[0 1 0] => [0 0 1] => vd[2] = vs[0] x vt[1] ?
vcrs.t [1 0 0],[0 0 1] => [0 0 0]
vcrs.t [0 1 0],[1 0 0] => [0 0 0]
vcrs.t [0 1 0],[0 1 0] => [0 0 0]
vcrs.t [0 1 0],[0 0 1] => [1 0 0] => vd[0] = vs[1] x vt[2] ?
vcrs.t [0 0 1],[1 0 0] => [0 1 0] => vd[1] = vs[2] x vt[0] ?
vcrs.t [0 0 1],[0 1 0] => [0 0 0]
vcrs.t [0 0 1],[0 0 1] => [0 0 0]
vcrs.t [1 2 0],[1 2 0] => [0 0 2] => [ 0, 0, vs[0] x vt[1] ] !
vcrs.t [1 2 0],[0 1 2] => [4 0 1] => [ vs[1] x vt[2], 0, vs[0] x vt[1] ] !
vcrs.t [1 2 0],[2 0 1] => [2 0 0] => [ vs[1] x vt[2], 0, 0 ] !
vcrs.t [0 1 2],[1 2 0] => [0 2 0] => [ 0, vs[2] x vt[0], 0 ] !
vcrs.t [0 1 2],[0 1 2] => [2 0 0] => [ vs[1] x vt[2], 0, 0 ] !
vcrs.t [0 1 2],[2 0 1] => [1 4 0] => [ vs[1] x vt[2], vs[2] x vt[0], 0 ] !
vcrs.t [2 0 1],[1 2 0] => [0 1 4] => [ 0, vs[2] x vt[0], vs[0] x vt[1] ] !
vcrs.t [2 0 1],[0 1 2] => [0 0 2] => [ 0, 0, vs[0] x vt[1] ] !
vcrs.t [2 0 1],[2 0 1] => [0 2 0] => [ 0, vs[2] x vt[0], 0 ] !
|
it looks like :
Code: |
vcrs.t vd, vs, vt
{
vd[0] = vs[1] x vt[2];
vd[1] = vs[2] x vt[0];
vd[2] = vs[0] x vt[1];
}
|
|
That makes sense, as it would be one part of the crossproduct. I need to redo my VFPU clocktick bench with those new ops :)
About the vdet.t I don't know. If it exists, it shouldn't crash normally. Is it supported by GCC?
And the homogeneous dot product needs revision to make up for (x*x+y*y+z*z+w):
Code: |
__[ homogenous dot product ]__
vhdp.q/t/p vd.s, vs, vt (UNSURE)
{
vd.s = vt[|q/t/p|-1];
for (i = 0; i < |q/t/p|-1; ++i)
vd.s += vs[i] * vt[i];
}
|
So the last component of the second operand vs is considered to be 1.0 basically. Still unsure/needs checking _________________ <Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki
Alexander Berl
Last edited by Raphael on Thu Nov 09, 2006 5:15 am; edited 1 time in total |
|
Back to top |
|
|
hlide
Joined: 10 Sep 2006 Posts: 750
|
Posted: Thu Nov 09, 2006 5:05 am Post subject: |
|
|
Raphael wrote: | About the vdet.t I don't know. If it exists, it shouldn't crash normally. Is it supported by GCC? |
as already said, it crashes so it is not supported. |
|
Back to top |
|
|
Raphael
Joined: 17 Jan 2006 Posts: 646 Location: Germany
|
Posted: Thu Nov 09, 2006 5:08 am Post subject: |
|
|
hlide wrote: | Raphael wrote: | About the vdet.t I don't know. If it exists, it shouldn't crash normally. Is it supported by GCC? |
as already said, it crashes so it is not supported. |
Oh yes, I misread the "if" there :D It makes sense to not exist, since as you said the 3d vector determinant needs three input vectors. And that's not possible at all _________________ <Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki
Alexander Berl |
|
Back to top |
|
|
hlide
Joined: 10 Sep 2006 Posts: 750
|
Posted: Fri Nov 10, 2006 6:15 pm Post subject: |
|
|
Raphael wrote: | hlide wrote: | Raphael wrote: | About the vdet.t I don't know. If it exists, it shouldn't crash normally. Is it supported by GCC? |
as already said, it crashes so it is not supported. |
Oh yes, I misread the "if" there :D It makes sense to not exist, since as you said the 3d vector determinant needs three input vectors. And that's not possible at all |
if you have their cycles, it would be interesting to add in the list. :) |
|
Back to top |
|
|
Raphael
Joined: 17 Jan 2006 Posts: 646 Location: Germany
|
Posted: Fri Nov 17, 2006 1:21 am Post subject: |
|
|
Do you have any idea which version of binutils/pspsdk I need to have, to be able to use the vbtf1/2 ops? I just tried updating pspsdk but that didn't help yet, the ops still aren't recognized. I tried updating binutils, but somehow that failed, so I need to try again.
I'll have an update to the document soon. A few new ops decoded plus most clock ticks. _________________ <Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki
Alexander Berl |
|
Back to top |
|
|
hlide
Joined: 10 Sep 2006 Posts: 750
|
|
Back to top |
|
|
hlide
Joined: 10 Sep 2006 Posts: 750
|
Posted: Fri Nov 17, 2006 1:39 am Post subject: |
|
|
oh my ! shouldn't it be vbfy1/2 ?
I'm sorry, I DID misname them. I updated the text with the correct names. |
|
Back to top |
|
|
Raphael
Joined: 17 Jan 2006 Posts: 646 Location: Germany
|
Posted: Fri Nov 17, 2006 1:50 am Post subject: |
|
|
hlide wrote: | oh my ! shouldn't be vbfy1/2 ?
I'm sorry, I DID misname them. I updated the text with the correct names. |
Heh, that did the trick :) thanks _________________ <Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki
Alexander Berl |
|
Back to top |
|
|
Raphael
Joined: 17 Jan 2006 Posts: 646 Location: Germany
|
Posted: Fri Nov 17, 2006 2:47 am Post subject: |
|
|
Update to the document:
- added some ops C counterpart (vi2c, vqmul, ..)
- added lv/sv ops for completeness
- added clock ticks for nearly all ops (only some for .t/.p/.s versions are missing)
- moved operand prefixes up to pretty much the top
_________________ <Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki
Alexander Berl
Last edited by Raphael on Fri Nov 17, 2006 9:06 pm; edited 1 time in total |
|
Back to top |
|
|
hlide
Joined: 10 Sep 2006 Posts: 750
|
Posted: Fri Nov 17, 2006 6:27 am Post subject: |
|
|
Code: |
vpfxs [?0,?1,?2,?3]
?0, ?1, ?2 or ?3 can be :
x : vs[0]
y : vs[1]
z : vs[2]
w : vs[3]
-x : -vs[0]
-y : -vs[1]
-z : -vs[2]
-w : -vs[3]
|x| : |vs[0]| (absolute value of vs[0])
|y| : |vs[1]| (absolute value of vs[1])
|z| : |vs[2]| (absolute value of vs[2])
|w| : |vs[3]| (absolute value of vs[3])
0 : constant 0
1 : constant 1
2 : constant 2
1/2 : constant 1/2
3 : constant 3
1/3 : constant 1/3
1/4 : constant 1/4
1/6 : constant 1/6
---------------------------------
vpfxt [?0,?1,?2,?3]
same thing as vpfxs but for vt register
---------------------------------
vpfxd [?4,?5,?6,?7]
?4, ?5, ?6 and ?7 can be :
[0:1] : saturated between 0 and 1,
[-1:1] : saturated between -1 and 1,
m : ??? unknown
|
They are "documented" in opcodes\mips-dis.c :
Code: |
static const char * const pfx_cst_names[8] = {
"0", "1", "2", "1/2", "3", "1/3", "1/4", "1/6"
};
static const char * const pfx_swz_names[4] = {
"x", "y", "z", "w"
};
static const char * const pfx_sat_names[4] = {
"", "[0:1]", "", "[-1:1]"
};
...
case '0':
case '1':
case '2':
case '3':
{
unsigned int pos = *d, base = '0';
unsigned int negation = (l >> (pos - (base - VFPU_SH_PFX_NEG))) & VFPU_MASK_PFX_NEG;
unsigned int constant = (l >> (pos - (base - VFPU_SH_PFX_CST))) & VFPU_MASK_PFX_CST;
unsigned int abs_consthi =
(l >> (pos - (base - VFPU_SH_PFX_ABS_CSTHI))) & VFPU_MASK_PFX_ABS_CSTHI;
unsigned int swz_constlo = (l >> ((pos - base) * 2)) & VFPU_MASK_PFX_SWZ_CSTLO;
if (negation)
(*info->fprintf_func) (info->stream, "-");
if (constant)
{
(*info->fprintf_func) (info->stream, "%s",
pfx_cst_names[(abs_consthi << 2) | swz_constlo]);
}
else
{
if (abs_consthi)
(*info->fprintf_func) (info->stream, "|%s|",
pfx_swz_names[swz_constlo]);
else
(*info->fprintf_func) (info->stream, "%s",
pfx_swz_names[swz_constlo]);
}
}
break;
case '4':
case '5':
case '6':
case '7':
{
unsigned int pos = *d, base = '4';
unsigned int mask = (l >> (pos - (base - VFPU_SH_PFX_MASK))) & VFPU_MASK_PFX_MASK;
unsigned int saturation = (l >> ((pos - base) * 2)) & VFPU_MASK_PFX_SAT;
if (mask)
(*info->fprintf_func) (info->stream, "m");
else
(*info->fprintf_func) (info->stream, "%s",
pfx_sat_names[saturation]);
}
break;
|
|
|
Back to top |
|
|
Raphael
Joined: 17 Jan 2006 Posts: 646 Location: Germany
|
Posted: Fri Nov 17, 2006 8:43 pm Post subject: |
|
|
Another update:
- added vsrt*, vsocp, vf2h/vh2f
- added prefix information from hlide's last post
- removed exec cycles from exec/latency column (better readability) and added missing latencies for .t/p/s variations
- added '?' where clock ticks information is missing
only missing ops now are vcmp versions, byte to X extensions and vflush as well as vsync.
The information should next be formatted in a better readable way into a .pdf or something.
_________________ <Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki
Alexander Berl
Last edited by Raphael on Fri Nov 17, 2006 10:57 pm; edited 1 time in total |
|
Back to top |
|
|
hlide
Joined: 10 Sep 2006 Posts: 750
|
Posted: Fri Nov 17, 2006 9:41 pm Post subject: |
|
|
vsrt1/2/3/4.q vd, vs are very tough ones, but I think I have discovered what they do :
Code: |
vsrt1.q vd, vs
{
vd[0] = min(vs[0], vs[1]);
vd[1] = max(vs[1], vs[0]);
vd[2] = min(vs[2], vs[3]);
vd[3] = max(vs[3], vs[2]);
}
vsrt2.q vd, vs
{
vd[0] = min(vs[0], vs[3]);
vd[1] = min(vs[1], vs[2]);
vd[2] = max(vs[2], vs[1]);
vd[3] = max(vs[3], vs[0]);
}
vsrt3.q vd, vs
{
vd[0] = max(vs[0], vs[1]);
vd[1] = min(vs[1], vs[0]);
vd[2] = max(vs[2], vs[3]);
vd[3] = min(vs[3], vs[2]);
}
vsrt4.q vd, vs
{
vd[0] = max(vs[0], vs[3]);
vd[1] = max(vs[1], vs[2]);
vd[2] = min(vs[2], vs[1]);
vd[3] = min(vs[3], vs[0]);
}
|
I hope Raphael can confirm those operations.
I used 4 vectors as vs :
[1 2 3 4]
[2 3 4 1]
[3 4 1 2]
[4 1 2 3]
results for vsrt1 :
[1 2 3 4]
[2 3 1 4] => 4->1 and 1->4
[3 4 1 2]
[1 4 2 3] => 4->1 and 1->4
results for vsrt2 :
[1 2 3 4]
[1 3 4 2] => 2->1 and 1->2
[2 1 4 3] => 3->2 and 4->1 and 1->4 and 2->3
[4 1 2 3] => 4->1 and 1->4
results for vsrt3 :
[2 1 4 3] => 1->2 and 2->1 and 3->4 and 4->3
[3 2 4 1] => 2->3 and 3->2
[4 3 2 1] => 3->4 and 4->3 and 1->2 and 2->1
[4 1 3 2] => 2->3 and 3->2
results for vsrt4 :
[4 3 2 1] => 1->4 and 2->3 and 3->2 and 4->1
[2 4 3 1] => 3->4 and 4->3
[3 4 1 2]
[4 2 1 3] => 1->2 and 2->1
Due to their apparent "random" permutations, i felt min and max were probably the key to their weirdness. |
|
Back to top |
|
|
hlide
Joined: 10 Sep 2006 Posts: 750
|
Posted: Fri Nov 17, 2006 9:45 pm Post subject: |
|
|
Oh, I missed your post, Raphael! Well, I can compare your additions with mine. :) |
|
Back to top |
|
|
hlide
Joined: 10 Sep 2006 Posts: 750
|
Posted: Fri Nov 17, 2006 10:00 pm Post subject: |
|
|
Raphael:
ok, we found the same thing for vsrt1/2/3/4, that should be okay.
I updated the text file in the first message, so I think you can erase your long text to reduce the number of pages to browse :).
By the way, groepaz plans to update his document with our findings. |
|
Back to top |
|
|
Raphael
Joined: 17 Jan 2006 Posts: 646 Location: Germany
|
Posted: Fri Nov 17, 2006 10:59 pm Post subject: |
|
|
Heh, finally, he already said he'd update it when I first posted my VFPU clock cycles :P
EDIT: I think we can leave only your min/max code for vsrt*, it's shorter and easier to read
Oh, and do you know how you can seed the random generator for VFPU? _________________ <Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki
Alexander Berl |
|
Back to top |
|
|
hlide
Joined: 10 Sep 2006 Posts: 750
|
Posted: Fri Nov 17, 2006 11:07 pm Post subject: |
|
|
Raphael wrote: | Heh, finally, he already said he'd update it when I first posted my VFPU clock cycles :P
EDIT: I think we can leave only your min/max code for vsrt*, it's shorter and easier to read
Oh, and do you know how you can seed the random generator for VFPU? |
The VFPU has control registers, and some are related to the random seed, I guess. They are documented in groepaz's document.
Code: |
128 VFPU_PFXS Source prefix stack
129 VFPU_PFXT Target prefix stack
130 VFPU_PFXD Destination prefix stack
131 VFPU_CC Condition information
132 VFPU_INF4 VFPU internal information 4
133 VFPU_RSV5 Not used (reserved)
134 VFPU_RSV6 Not used (reserved)
135 VFPU_REV VFPU revision information
136 VFPU_RCX0 Pseudorandom number generator information 0
137 VFPU_RCX1 Pseudorandom number generator information 1
138 VFPU_RCX2 Pseudorandom number generator information 2
139 VFPU_RCX3 Pseudorandom number generator information 3
140 VFPU_RCX4 Pseudorandom number generator information 4
141 VFPU_RCX5 Pseudorandom number generator information 5
142 VFPU_RCX6 Pseudorandom number generator information 6
143 VFPU_RCX7 Pseudorandom number generator information 7
|
|
|
Back to top |
|
|
Raphael
Joined: 17 Jan 2006 Posts: 646 Location: Germany
|
Posted: Fri Nov 17, 2006 11:16 pm Post subject: |
|
|
Yeah, just stumbled upon those too. Hm, unfortunately I have no clue how to use them. Would be nice though, seeing how the vector random generator only takes 3 cycles to generate one random number. _________________ <Don't push the river, it flows.>
http://wordpress.fx-world.org - my devblog
http://wiki.fx-world.org - VFPU documentation wiki
Alexander Berl |
|
Back to top |
|
|
hlide
Joined: 10 Sep 2006 Posts: 750
|
Posted: Fri Nov 17, 2006 11:30 pm Post subject: |
|
|
vone.q and vzero.q take 3 cycles !?
Don't vmov.q vd, vs[1, 1, 1, 1] and vmov.q vd, vs[0, 0, 0, 0] give us better cycle counts (at least 2 cycles instead of 3)?
random stuff, i'm trying to see how to use them. |
|
Back to top |
|
|
|
|
You cannot post new topics in this forum You cannot reply to topics in this forum You cannot edit your posts in this forum You cannot delete your posts in this forum You cannot vote in polls in this forum
|
Powered by phpBB © 2001, 2005 phpBB Group
|