Using FPU - compared with using pointers in PowerBasic
#COMPILE EXE
#DIM ALL
SUB MatrixMul4x4S(BYREF a() AS SINGLE, BYREF b() AS SINGLE, BYREF c() AS SINGLE )
DIM p AS SINGLE PTR
p=VARPTR(a(0))
! mov ecx,eax
p=VARPTR(b(0))
! mov edx,eax
p=VARPTR(c(0))
block:
! call column
! call column
! call column
! call column
EXIT SUB
column:
! call cell
! call cell
! call cell
! call cell
! add edx,16
! sub ecx,16
! ret
cell: ' row A * column B
! fld dword ptr [ecx ]
! fmul dword ptr [edx ]
! fld dword ptr [ecx+16]
! fmul dword ptr [edx+04]
! fld dword ptr [ecx+32]
! fmul dword ptr [edx+08]
! fld dword ptr [ecx+48]
! fmul dword ptr [edx+12]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp dword ptr [eax]
! add eax,4
! add ecx,4
! ret
END SUB
SUB MatrixMul4x4(lhs() AS SINGLE, rhs() AS SINGLE, res() AS SINGLE)
DIM lhsp AS SINGLE PTR:lhsp=VARPTR(lhs(0))
DIM rhsp AS SINGLE PTR:rhsp=VARPTR(rhs(0))
' COLUMN 1
res(00)=@lhsp[0]*@rhsp[00]+@lhsp[4]*@rhsp[01]+@lhsp[08]*@rhsp[02]+@lhsp[12]*@rhsp[03]
res(01)=@lhsp[1]*@rhsp[00]+@lhsp[5]*@rhsp[01]+@lhsp[09]*@rhsp[02]+@lhsp[13]*@rhsp[03]
res(02)=@lhsp[2]*@rhsp[00]+@lhsp[6]*@rhsp[01]+@lhsp[10]*@rhsp[02]+@lhsp[14]*@rhsp[03]
res(03)=@lhsp[3]*@rhsp[00]+@lhsp[7]*@rhsp[01]+@lhsp[11]*@rhsp[02]+@lhsp[15]*@rhsp[03]
' COLUMN 1
res(04)=@lhsp[0]*@rhsp[04]+@lhsp[4]*@rhsp[05]+@lhsp[08]*@rhsp[06]+@lhsp[12]*@rhsp[07]
res(05)=@lhsp[1]*@rhsp[04]+@lhsp[5]*@rhsp[05]+@lhsp[09]*@rhsp[06]+@lhsp[13]*@rhsp[07]
res(06)=@lhsp[2]*@rhsp[04]+@lhsp[6]*@rhsp[05]+@lhsp[10]*@rhsp[06]+@lhsp[14]*@rhsp[07]
res(07)=@lhsp[3]*@rhsp[04]+@lhsp[7]*@rhsp[05]+@lhsp[11]*@rhsp[06]+@lhsp[15]*@rhsp[07]
' COLIMN 3
res(08)=@lhsp[0]*@rhsp[08]+@lhsp[4]*@rhsp[09]+@lhsp[08]*@rhsp[10]+@lhsp[12]*@rhsp[11]
res(09)=@lhsp[1]*@rhsp[08]+@lhsp[5]*@rhsp[09]+@lhsp[09]*@rhsp[10]+@lhsp[13]*@rhsp[11]
res(10)=@lhsp[2]*@rhsp[08]+@lhsp[6]*@rhsp[09]+@lhsp[10]*@rhsp[10]+@lhsp[14]*@rhsp[11]
res(11)=@lhsp[3]*@rhsp[08]+@lhsp[7]*@rhsp[09]+@lhsp[11]*@rhsp[10]+@lhsp[15]*@rhsp[11]
' COLUMN 4
res(12)=@lhsp[0]*@rhsp[12]+@lhsp[4]*@rhsp[13]+@lhsp[08]*@rhsp[14]+@lhsp[12]*@rhsp[15]
res(13)=@lhsp[1]*@rhsp[12]+@lhsp[5]*@rhsp[13]+@lhsp[09]*@rhsp[14]+@lhsp[13]*@rhsp[15]
res(14)=@lhsp[2]*@rhsp[12]+@lhsp[6]*@rhsp[13]+@lhsp[10]*@rhsp[14]+@lhsp[14]*@rhsp[15]
res(15)=@lhsp[3]*@rhsp[12]+@lhsp[7]*@rhsp[13]+@lhsp[11]*@rhsp[14]+@lhsp[15]*@rhsp[15]
END SUB
SUB getrow4S(BYREF matr AS SINGLE)
' test this
DIM row4f(3) AS GLOBAL SINGLE
DIM p AS SINGLE PTR
p=VARPTR(matr)
row4f(0)=@p[0]
row4f(1)=@p[4]
row4f(2)=@p[8]
row4f(3)=@p[12]
END SUB
FUNCTION PBMAIN () AS LONG
END FUNCTION
Charles
Then, what do you think would be the fastest ASM solution to perform these (color matrix)
SUB MatAssign(MR() AS SINGLE, MM() AS SINGLE)
'ARRAY ASSIGN MR() = MM(0, 0), MM(0, 1), MM(0, 2), MM(0, 3), MM(0, 4), _
' MM(1, 0), MM(1, 1), MM(1, 2), MM(1, 3), MM(1, 4), _
' MM(2, 0), MM(2, 1), MM(2, 2), MM(2, 3), MM(2, 4), _
' MM(3, 0), MM(3, 1), MM(3, 2), MM(3, 3), MM(3, 4), _
' MM(4, 0), MM(4, 1), MM(4, 2), MM(4, 3), MM(4, 4)
POKE$ VARPTR(MR(0,0)), PEEK$(VARPTR(MM(0,0)), 100)
END SUB
SUB MatMultiply(MR() AS SINGLE, M1() AS SINGLE, M2() AS SINGLE)
MR(0,0) = M1(0,0) * M2(0,0) + M1(0,1) * M2(1,0) + M1(0,2) * M2(2,0) + M1(0,3) * M2(3,0) + M1(0,4) * M2(4,0)
MR(0,1) = M1(1,0) * M2(0,0) + M1(1,1) * M2(1,0) + M1(1,2) * M2(2,0) + M1(1,3) * M2(3,0) + M1(1,4) * M2(4,0)
MR(0,2) = M1(2,0) * M2(0,0) + M1(2,1) * M2(1,0) + M1(2,2) * M2(2,0) + M1(2,3) * M2(3,0) + M1(2,4) * M2(4,0)
MR(0,3) = M1(3,0) * M2(0,0) + M1(3,1) * M2(1,0) + M1(3,2) * M2(2,0) + M1(3,3) * M2(3,0) + M1(3,4) * M2(4,0)
MR(0,4) = M1(4,0) * M2(0,0) + M1(4,1) * M2(1,0) + M1(4,2) * M2(2,0) + M1(4,3) * M2(3,0) + M1(4,4) * M2(4,0)
MR(1,0) = M1(0,0) * M2(0,1) + M1(0,1) * M2(1,1) + M1(0,2) * M2(2,1) + M1(0,3) * M2(3,1) + M1(0,4) * M2(4,1)
MR(1,1) = M1(1,0) * M2(0,1) + M1(1,1) * M2(1,1) + M1(1,2) * M2(2,1) + M1(1,3) * M2(3,1) + M1(1,4) * M2(4,1)
MR(1,2) = M1(2,0) * M2(0,1) + M1(2,1) * M2(1,1) + M1(2,2) * M2(2,1) + M1(2,3) * M2(3,1) + M1(2,4) * M2(4,1)
MR(1,3) = M1(3,0) * M2(0,1) + M1(3,1) * M2(1,1) + M1(3,2) * M2(2,1) + M1(3,3) * M2(3,1) + M1(3,4) * M2(4,1)
MR(1,4) = M1(4,0) * M2(0,1) + M1(4,1) * M2(1,1) + M1(4,2) * M2(2,1) + M1(4,3) * M2(3,1) + M1(4,4) * M2(4,1)
MR(2,0) = M1(0,0) * M2(0,2) + M1(0,1) * M2(1,2) + M1(0,2) * M2(2,2) + M1(0,3) * M2(3,2) + M1(0,4) * M2(4,2)
MR(2,1) = M1(1,0) * M2(0,2) + M1(1,1) * M2(1,2) + M1(1,2) * M2(2,2) + M1(1,3) * M2(3,2) + M1(1,4) * M2(4,2)
MR(2,2) = M1(2,0) * M2(0,2) + M1(2,1) * M2(1,2) + M1(2,2) * M2(2,2) + M1(2,3) * M2(3,2) + M1(2,4) * M2(4,2)
MR(2,3) = M1(3,0) * M2(0,2) + M1(3,1) * M2(1,2) + M1(3,2) * M2(2,2) + M1(3,3) * M2(3,2) + M1(3,4) * M2(4,2)
MR(2,4) = M1(4,0) * M2(0,2) + M1(4,1) * M2(1,2) + M1(4,2) * M2(2,2) + M1(4,3) * M2(3,2) + M1(4,4) * M2(4,2)
MR(3,0) = M1(0,0) * M2(0,3) + M1(0,1) * M2(1,3) + M1(0,2) * M2(2,3) + M1(0,3) * M2(3,3) + M1(0,4) * M2(4,3)
MR(3,1) = M1(1,0) * M2(0,3) + M1(1,1) * M2(1,3) + M1(1,2) * M2(2,3) + M1(1,3) * M2(3,3) + M1(1,4) * M2(4,3)
MR(3,2) = M1(2,0) * M2(0,3) + M1(2,1) * M2(1,3) + M1(2,2) * M2(2,3) + M1(2,3) * M2(3,3) + M1(2,4) * M2(4,3)
MR(3,3) = M1(3,0) * M2(0,3) + M1(3,1) * M2(1,3) + M1(3,2) * M2(2,3) + M1(3,3) * M2(3,3) + M1(3,4) * M2(4,3)
MR(3,4) = M1(4,0) * M2(0,3) + M1(4,1) * M2(1,3) + M1(4,2) * M2(2,1) + M1(4,3) * M2(3,3) + M1(4,4) * M2(4,3)
MR(4,0) = M1(0,0) * M2(0,4) + M1(0,1) * M2(1,4) + M1(0,2) * M2(2,4) + M1(0,3) * M2(3,4) + M1(0,4) * M2(4,4)
MR(4,1) = M1(1,0) * M2(0,4) + M1(1,1) * M2(1,4) + M1(1,2) * M2(2,4) + M1(1,3) * M2(3,4) + M1(1,4) * M2(4,4)
MR(4,2) = M1(2,0) * M2(0,4) + M1(2,1) * M2(1,4) + M1(2,2) * M2(2,4) + M1(2,3) * M2(3,4) + M1(2,4) * M2(4,4)
MR(4,3) = M1(3,0) * M2(0,4) + M1(3,1) * M2(1,4) + M1(3,2) * M2(2,4) + M1(3,3) * M2(3,4) + M1(3,4) * M2(4,4)
MR(4,4) = M1(4,0) * M2(0,4) + M1(4,1) * M2(1,4) + M1(4,2) * M2(2,4) + M1(4,3) * M2(3,4) + M1(4,4) * M2(4,4)
END SUB
Hi Patrice
I've tested your code in comparison with the pointer version and the FPU assembly code version.
My FPU version is about 10X faster. We can attribute this difference to the movement of data between memory and registers and the calculation of array element addresses, since the maths operations are identical.
There is a method described by Intel using SSE2 SIMD instructions which is twice the speed of the FPU. Unfortunately PB8 does not recognise the instructions for SSE2 so we would have to go down to machine code.
FUNCTION PBMAIN () AS LONG
DIM r(15) AS SINGLE
DIM a(15) AS SINGLE
DIM b(15) AS SINGLE
DIM i AS LONG
DIM t AS SINGLE
t=TIMER
FOR i=1 TO 1e6
' MatMultiply(r(),a(),b()) ' 1.6 seconds using 2d arrays
' MatrixMul4x4(a(),b(),r()) ' 0.39 seconds using 1d pointers
MatrixMul4x4S(a(),b(),r()) ' 0.17 seconds using FPU assembly code
NEXT
MSGBOX STR$(TIMER-t)
END FUNCTION
Charles
I have checked with the pointer version (see below) and I can't see any speed boost by me :(
SUB MatMultiply(MR() AS SINGLE, M1() AS SINGLE, M2() AS SINGLE)
' MR(0,0) = M1(0,0) * M2(0,0) + M1(0,1) * M2(1,0) + M1(0,2) * M2(2,0) + M1(0,3) * M2(3,0) + M1(0,4) * M2(4,0)
' MR(0,1) = M1(1,0) * M2(0,0) + M1(1,1) * M2(1,0) + M1(1,2) * M2(2,0) + M1(1,3) * M2(3,0) + M1(1,4) * M2(4,0)
' MR(0,2) = M1(2,0) * M2(0,0) + M1(2,1) * M2(1,0) + M1(2,2) * M2(2,0) + M1(2,3) * M2(3,0) + M1(2,4) * M2(4,0)
' MR(0,3) = M1(3,0) * M2(0,0) + M1(3,1) * M2(1,0) + M1(3,2) * M2(2,0) + M1(3,3) * M2(3,0) + M1(3,4) * M2(4,0)
' MR(0,4) = M1(4,0) * M2(0,0) + M1(4,1) * M2(1,0) + M1(4,2) * M2(2,0) + M1(4,3) * M2(3,0) + M1(4,4) * M2(4,0)
' MR(1,0) = M1(0,0) * M2(0,1) + M1(0,1) * M2(1,1) + M1(0,2) * M2(2,1) + M1(0,3) * M2(3,1) + M1(0,4) * M2(4,1)
' MR(1,1) = M1(1,0) * M2(0,1) + M1(1,1) * M2(1,1) + M1(1,2) * M2(2,1) + M1(1,3) * M2(3,1) + M1(1,4) * M2(4,1)
' MR(1,2) = M1(2,0) * M2(0,1) + M1(2,1) * M2(1,1) + M1(2,2) * M2(2,1) + M1(2,3) * M2(3,1) + M1(2,4) * M2(4,1)
' MR(1,3) = M1(3,0) * M2(0,1) + M1(3,1) * M2(1,1) + M1(3,2) * M2(2,1) + M1(3,3) * M2(3,1) + M1(3,4) * M2(4,1)
' MR(1,4) = M1(4,0) * M2(0,1) + M1(4,1) * M2(1,1) + M1(4,2) * M2(2,1) + M1(4,3) * M2(3,1) + M1(4,4) * M2(4,1)
' MR(2,0) = M1(0,0) * M2(0,2) + M1(0,1) * M2(1,2) + M1(0,2) * M2(2,2) + M1(0,3) * M2(3,2) + M1(0,4) * M2(4,2)
' MR(2,1) = M1(1,0) * M2(0,2) + M1(1,1) * M2(1,2) + M1(1,2) * M2(2,2) + M1(1,3) * M2(3,2) + M1(1,4) * M2(4,2)
' MR(2,2) = M1(2,0) * M2(0,2) + M1(2,1) * M2(1,2) + M1(2,2) * M2(2,2) + M1(2,3) * M2(3,2) + M1(2,4) * M2(4,2)
' MR(2,3) = M1(3,0) * M2(0,2) + M1(3,1) * M2(1,2) + M1(3,2) * M2(2,2) + M1(3,3) * M2(3,2) + M1(3,4) * M2(4,2)
' MR(2,4) = M1(4,0) * M2(0,2) + M1(4,1) * M2(1,2) + M1(4,2) * M2(2,2) + M1(4,3) * M2(3,2) + M1(4,4) * M2(4,2)
' MR(3,0) = M1(0,0) * M2(0,3) + M1(0,1) * M2(1,3) + M1(0,2) * M2(2,3) + M1(0,3) * M2(3,3) + M1(0,4) * M2(4,3)
' MR(3,1) = M1(1,0) * M2(0,3) + M1(1,1) * M2(1,3) + M1(1,2) * M2(2,3) + M1(1,3) * M2(3,3) + M1(1,4) * M2(4,3)
' MR(3,2) = M1(2,0) * M2(0,3) + M1(2,1) * M2(1,3) + M1(2,2) * M2(2,3) + M1(2,3) * M2(3,3) + M1(2,4) * M2(4,3)
' MR(3,3) = M1(3,0) * M2(0,3) + M1(3,1) * M2(1,3) + M1(3,2) * M2(2,3) + M1(3,3) * M2(3,3) + M1(3,4) * M2(4,3)
' MR(3,4) = M1(4,0) * M2(0,3) + M1(4,1) * M2(1,3) + M1(4,2) * M2(2,1) + M1(4,3) * M2(3,3) + M1(4,4) * M2(4,3)
' MR(4,0) = M1(0,0) * M2(0,4) + M1(0,1) * M2(1,4) + M1(0,2) * M2(2,4) + M1(0,3) * M2(3,4) + M1(0,4) * M2(4,4)
' MR(4,1) = M1(1,0) * M2(0,4) + M1(1,1) * M2(1,4) + M1(1,2) * M2(2,4) + M1(1,3) * M2(3,4) + M1(1,4) * M2(4,4)
' MR(4,2) = M1(2,0) * M2(0,4) + M1(2,1) * M2(1,4) + M1(2,2) * M2(2,4) + M1(2,3) * M2(3,4) + M1(2,4) * M2(4,4)
' MR(4,3) = M1(3,0) * M2(0,4) + M1(3,1) * M2(1,4) + M1(3,2) * M2(2,4) + M1(3,3) * M2(3,4) + M1(3,4) * M2(4,4)
' MR(4,4) = M1(4,0) * M2(0,4) + M1(4,1) * M2(1,4) + M1(4,2) * M2(2,4) + M1(4,3) * M2(3,4) + M1(4,4) * M2(4,4)
DIM M1p AS SINGLE PTR: M1p = VARPTR(M1(0,0))
DIM M2p AS SINGLE PTR: M2p = VARPTR(M2(0,0))
MR(0,0)=@M1p[0]*@M2p[00]+@M1p[5]*@M2p[01]+@M1p[10]*@M2p[02]+@M1p[15]*@M2p[03]+@M1p[20]*@M2p[04]
MR(0,1)=@M1p[1]*@M2p[00]+@M1p[6]*@M2p[01]+@M1p[11]*@M2p[02]+@M1p[16]*@M2p[03]+@M1p[21]*@M2p[04]
MR(0,2)=@M1p[2]*@M2p[00]+@M1p[7]*@M2p[01]+@M1p[12]*@M2p[02]+@M1p[17]*@M2p[03]+@M1p[22]*@M2p[04]
MR(0,3)=@M1p[3]*@M2p[00]+@M1p[8]*@M2p[01]+@M1p[13]*@M2p[02]+@M1p[18]*@M2p[03]+@M1p[23]*@M2p[04]
MR(0,4)=@M1p[4]*@M2p[00]+@M1p[9]*@M2p[01]+@M1p[14]*@M2p[02]+@M1p[19]*@M2p[03]+@M1p[24]*@M2p[04]
MR(1,0)=@M1p[0]*@M2p[05]+@M1p[5]*@M2p[06]+@M1p[10]*@M2p[07]+@M1p[15]*@M2p[08]+@M1p[20]*@M2p[09]
MR(1,1)=@M1p[1]*@M2p[05]+@M1p[6]*@M2p[06]+@M1p[11]*@M2p[07]+@M1p[16]*@M2p[08]+@M1p[21]*@M2p[09]
MR(1,2)=@M1p[2]*@M2p[05]+@M1p[7]*@M2p[06]+@M1p[12]*@M2p[07]+@M1p[17]*@M2p[08]+@M1p[22]*@M2p[09]
MR(1,3)=@M1p[3]*@M2p[05]+@M1p[8]*@M2p[06]+@M1p[13]*@M2p[07]+@M1p[18]*@M2p[08]+@M1p[23]*@M2p[09]
MR(1,4)=@M1p[4]*@M2p[05]+@M1p[9]*@M2p[06]+@M1p[14]*@M2p[07]+@M1p[19]*@M2p[08]+@M1p[24]*@M2p[09]
MR(2,0)=@M1p[0]*@M2p[10]+@M1p[5]*@M2p[11]+@M1p[10]*@M2p[12]+@M1p[15]*@M2p[13]+@M1p[20]*@M2p[14]
MR(2,1)=@M1p[1]*@M2p[10]+@M1p[6]*@M2p[11]+@M1p[11]*@M2p[12]+@M1p[16]*@M2p[13]+@M1p[21]*@M2p[14]
MR(2,2)=@M1p[2]*@M2p[10]+@M1p[7]*@M2p[11]+@M1p[12]*@M2p[12]+@M1p[17]*@M2p[13]+@M1p[22]*@M2p[14]
MR(2,3)=@M1p[3]*@M2p[10]+@M1p[8]*@M2p[11]+@M1p[13]*@M2p[12]+@M1p[18]*@M2p[13]+@M1p[23]*@M2p[14]
MR(2,4)=@M1p[4]*@M2p[10]+@M1p[9]*@M2p[11]+@M1p[14]*@M2p[12]+@M1p[19]*@M2p[13]+@M1p[24]*@M2p[14]
MR(3,0)=@M1p[0]*@M2p[15]+@M1p[5]*@M2p[16]+@M1p[10]*@M2p[17]+@M1p[15]*@M2p[18]+@M1p[20]*@M2p[19]
MR(3,1)=@M1p[1]*@M2p[15]+@M1p[6]*@M2p[16]+@M1p[11]*@M2p[17]+@M1p[16]*@M2p[18]+@M1p[21]*@M2p[19]
MR(3,2)=@M1p[2]*@M2p[15]+@M1p[7]*@M2p[16]+@M1p[12]*@M2p[17]+@M1p[17]*@M2p[18]+@M1p[22]*@M2p[19]
MR(3,3)=@M1p[3]*@M2p[15]+@M1p[8]*@M2p[16]+@M1p[13]*@M2p[17]+@M1p[18]*@M2p[18]+@M1p[23]*@M2p[19]
MR(3,4)=@M1p[4]*@M2p[15]+@M1p[9]*@M2p[16]+@M1p[14]*@M2p[17]+@M1p[19]*@M2p[18]+@M1p[24]*@M2p[19]
MR(4,0)=@M1p[0]*@M2p[20]+@M1p[5]*@M2p[21]+@M1p[10]*@M2p[22]+@M1p[15]*@M2p[23]+@M1p[20]*@M2p[24]
MR(4,1)=@M1p[1]*@M2p[20]+@M1p[6]*@M2p[21]+@M1p[11]*@M2p[22]+@M1p[16]*@M2p[23]+@M1p[21]*@M2p[24]
MR(4,2)=@M1p[2]*@M2p[20]+@M1p[7]*@M2p[21]+@M1p[12]*@M2p[22]+@M1p[17]*@M2p[23]+@M1p[22]*@M2p[24]
MR(4,3)=@M1p[3]*@M2p[20]+@M1p[8]*@M2p[21]+@M1p[13]*@M2p[22]+@M1p[18]*@M2p[23]+@M1p[23]*@M2p[24]
MR(4,4)=@M1p[4]*@M2p[20]+@M1p[9]*@M2p[21]+@M1p[14]*@M2p[22]+@M1p[19]*@M2p[23]+@M1p[24]*@M2p[24]
END SUB
I just checked using GDIPLUS
CALL GdipMultiplyMatrix(BYVAL VARPTR(MR(0,0)), BYVAL VARPTR(M2(0,0)), 0)
it doesn't matches my need...
I think the difference is that you are using 2 dimensional arrays and pointers. - mine is a simple array of 16 elements which are resolved by the compiler directly into address offsets. No address calculations are required.
PS I wonder how fast is GdipMultiplyMatrix.
GdipMultiplyMatrix is not good to manipulate the GDImage color matrix with alpha channel.
After doing several benchmarks, it looks like (in my case) the PowerBASIC built-in MAT would be the best solution in case of cumulative settings.
8)
This is an Intel example of 4x4 matrix multiplication. I translated it into Asmosphere under thinBasic. It is rendered here into machine code for PB.
It is the fastest, taking 0.1 seconds to compute 1,000,000 matrixes. The code is compatible with Pentium III processors (ca 1999) so it should be functional on most PCs.
PS: I don't know how it works :)
Streaming SIMD Extensions -
Matrix Multiplication
http://download.intel.com/design/PentiumIII/sml/24504501.pdf
SUB MatrixMul4x4SSE(BYREF a() AS SINGLE, BYREF b() AS SINGLE, BYREF c() AS SINGLE )
DIM p AS SINGLE PTR
p=VARPTR(a(0))
! mov ecx,eax
p=VARPTR(b(0))
! mov edx,eax
p=VARPTR(a(0))
'.sse_matrix_mul ; .sse_matrix_mul
! db &hf3, &h0f, &h10, &h02 ; movss xmm0, [edx]
! db &h0f, &h10, &h09 ; movups xmm1, [ecx]
! db &h0f, &hC6, &hC0, &h00 ; shufps xmm0, xmm0, 0
! db &hf3, &h0f, &h10, &h52, &h04 ; movss xmm2, [edx+4]
! db &h0f, &h59, &hC1 ; mulps xmm0, xmm1
! db &h0f, &hC6, &hD2, &h00 ; shufps xmm2, xmm2, 0
! db &h0f, &h10, &h59, &h10 ; movups xmm3, [ecx+16]
! db &hf3, &h0f, &h10, &h7A, &h08 ; movss xmm7, [edx+8]
! db &h0f, &h59, &hD3 ; mulps xmm2, xmm3
! db &h0f, &hC6, &hFF, &h00 ; shufps xmm7, xmm7, 0
! db &h0f, &h58, &hC2 ; addps xmm0, xmm2
! db &h0f, &h10, &h61, &h20 ; movups xmm4, [ecx+32]
! db &hf3, &h0f, &h10, &h52, &h0C ; movss xmm2, [edx+12]
! db &h0f, &h59, &hFC ; mulps xmm7, xmm4
! db &h0f, &hC6, &hD2, &h00 ; shufps xmm2, xmm2, 0
! db &h0f, &h58, &hC7 ; addps xmm0, xmm7
! db &h0f, &h10, &h69, &h30 ; movups xmm5, [ecx+48]
! db &hf3, &h0f, &h10, &h72, &h10 ; movss xmm6, [edx+16]
! db &h0f, &h59, &hD5 ; mulps xmm2, xmm5
! db &hf3, &h0f, &h10, &h7A, &h14 ; movss xmm7, [edx+20]
! db &h0f, &hC6, &hF6, &h00 ; shufps xmm6, xmm6, 0
! db &h0f, &h58, &hC2 ; addps xmm0, xmm2
! db &h0f, &hC6, &hFF, &h00 ; shufps xmm7, xmm7, 0
! db &h0f, &h13, &h00 ; movlps [eax], xmm0
! db &h0f, &h17, &h40, &h08 ; movhps [eax+8], xmm0
! db &h0f, &h59, &hFB ; mulps xmm7, xmm3
! db &hf3, &h0f, &h10, &h42, &h18 ; movss xmm0, [edx+24]
! db &h0f, &h59, &hF1 ; mulps xmm6, xmm1
! db &h0f, &hC6, &hC0, &h00 ; shufps xmm0, xmm0, 0
! db &h0f, &h58, &hF7 ; addps xmm6, xmm7
! db &h0f, &h59, &hC4 ; mulps xmm0, xmm4
! db &hf3, &h0f, &h10, &h52, &h24 ; movss xmm2, [edx+36]
! db &h0f, &h58, &hF0 ; addps xmm6, xmm0
! db &hf3, &h0f, &h10, &h42, &h1C ; movss xmm0, [edx+28]
! db &hf3, &h0f, &h10, &h7A, &h20 ; movss xmm7, [edx+32]
! db &h0f, &hC6, &hC0, &h00 ; shufps xmm0, xmm0, 0
! db &h0f, &hC6, &hFF, &h00 ; shufps xmm7, xmm7, 0
! db &h0f, &h59, &hC5 ; mulps xmm0, xmm5
! db &h0f, &h59, &hF9 ; mulps xmm7, xmm1
! db &h0f, &h58, &hF0 ; addps xmm6, xmm0
! db &h0f, &hC6, &hD2, &h00 ; shufps xmm2, xmm2, 0
! db &h0f, &h13, &h70, &h10 ; movlps [eax+16], xmm6
! db &h0f, &h17, &h70, &h18 ; movhps [eax+24], xmm6
! db &h0f, &h59, &hD3 ; mulps xmm2, xmm3
! db &hf3, &h0f, &h10, &h72, &h28 ; movss xmm6, [edx+40]
! db &h0f, &h58, &hFA ; addps xmm7, xmm2
! db &h0f, &hC6, &hF6, &h00 ; shufps xmm6, xmm6, 0
! db &hf3, &h0f, &h10, &h52, &h2C ; movss xmm2, [edx+44]
! db &h0f, &h59, &hF4 ; mulps xmm6, xmm4
! db &h0f, &hC6, &hD2, &h00 ; shufps xmm2, xmm2, 0
! db &h0f, &h58, &hFE ; addps xmm7, xmm6
! db &h0f, &h59, &hD5 ; mulps xmm2, xmm5
! db &hf3, &h0f, &h10, &h42, &h34 ; movss xmm0, [edx+52]
! db &h0f, &h58, &hFA ; addps xmm7, xmm2
! db &h0f, &hC6, &hC0, &h00 ; shufps xmm0, xmm0, 0
! db &h0f, &h13, &h78, &h20 ; movlps [eax+32], xmm7
! db &hf3, &h0f, &h10, &h52, &h30 ; movss xmm2, [edx+48]
! db &h0f, &h17, &h78, &h28 ; movhps [eax+40], xmm7
! db &h0f, &h59, &hC3 ; mulps xmm0, xmm3
! db &h0f, &hC6, &hD2, &h00 ; shufps xmm2, xmm2, 0
! db &hf3, &h0f, &h10, &h72, &h38 ; movss xmm6, [edx+56]
! db &h0f, &h59, &hD1 ; mulps xmm2, xmm1
! db &h0f, &hC6, &hF6, &h00 ; shufps xmm6, xmm6, 0
! db &h0f, &h58, &hD0 ; addps xmm2, xmm0
! db &h0f, &h59, &hF4 ; mulps xmm6, xmm4
! db &hf3, &h0f, &h10, &h7A, &h3C ; movss xmm7, [edx+60]
! db &h0f, &hC6, &hFF, &h00 ; shufps xmm7, xmm7, 0
! db &h0f, &h58, &hD6 ; addps xmm2, xmm6
! db &h0f, &h59, &hFD ; mulps xmm7, xmm5
! db &h0f, &h58, &hD7 ; addps xmm2, xmm7
! db &h0f, &h11, &h50, &h30 ; movups [eax+48], xmm2
END SUB
FUNCTION PBMAIN () AS LONG
DIM r(15) AS SINGLE
DIM a(15) AS SINGLE
DIM b(15) AS SINGLE
DIM i AS LONG
DIM t AS SINGLE
t=TIMER
FOR i=1 TO 1e6
' MatMultiply(r(),a(),b()) ' 1.6 seconds using 2d arrays
' MatMultiplyP(r(),a(),b()) ' 1.5 seconds using 2d arrays and pointers
' MatrixMul4x4(a(),b(),r()) ' 0.39 seconds using 1d pointers
' MatrixMul4x4FPU(a(),b(),r()) ' 0.17 seconds using FPU assembly code
MatrixMul4x4SSE(a(),b(),r()) ' 0.1 seconds using SSE assembly code
NEXT
MSGBOX STR$(TIMER-t)
END FUNCTION
Charles
I am using a color matrix of 5x5 not 4x4 or 3x3 that would be of course much faster.
If you can make it work for 5x5 then i could try it, but i am not an assembly guy myself :'(
Patrice,
This takes 0.27 secs per million calcs for a 5x5 Matrix. I have not functionally tested it but it is a simple expansion of the 4x4 function
Result is returned in mr() matching your example prototypes
SUB MatrixMul5x5FPU(BYREF mr() AS SINGLE, BYREF m1() AS SINGLE, BYREF m2() AS SINGLE )
DIM p AS SINGLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))
block:
! call column
! call column
! call column
! call column
! call column
EXIT SUB
column:
! call cell
! call cell
! call cell
! call cell
! call cell
! add edx,20
! sub ecx,20
! ret
cell: ' row A * column B
! fld dword ptr [ecx ]
! fmul dword ptr [edx ]
! fld dword ptr [ecx+20]
! fmul dword ptr [edx+04]
! fld dword ptr [ecx+40]
! fmul dword ptr [edx+08]
! fld dword ptr [ecx+60]
! fmul dword ptr [edx+12]
! fld dword ptr [ecx+80]
! fmul dword ptr [edx+16]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp dword ptr [eax]
! add eax,4
! add ecx,4
! ret
END SUB
[/cpde]
Charles
Thank you very much!
I did check it over the built-in PB MAT function, and i would say that this confirm that the PB one was already very fast, however yours seems even faster ;)
I shall post the new GDImage.dll in the PhotoSetup thread, for those wanting to try it.
Thanks again.
Glad it was useful Patrice.
Here are 3x3 4x4 and 5x5 matrix multiplications using single and double precision. You can see how the code changes for each matrix type.
Interestingly there seems to be very little difference in speed between single precision and double precision
'================
' SINGLES
'================
SUB MatrixMul3x3SFPU(BYREF mr() AS SINGLE, BYREF m1() AS SINGLE, BYREF m2() AS SINGLE )
DIM p AS SINGLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))
block:
! call column
! call column
! call column
EXIT SUB
column:
! call cell
! call cell
! call cell
! add edx,12
! sub ecx,12
! ret
cell: ' row A * column B
! fld dword ptr [ecx ]
! fmul dword ptr [edx ]
! fld dword ptr [ecx+12]
! fmul dword ptr [edx+04]
! fld dword ptr [ecx+24]
! fmul dword ptr [edx+08]
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp dword ptr [eax]
! add eax,4
! add ecx,4
! ret
END SUB
SUB MatrixMul4x4SFPU(BYREF mr() AS SINGLE, BYREF m1() AS SINGLE, BYREF m2() AS SINGLE )
DIM p AS SINGLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))
block:
! call column
! call column
! call column
! call column
EXIT SUB
column:
! call cell
! call cell
! call cell
! call cell
! add edx,16
! sub ecx,16
! ret
cell: ' row A * column B
! fld dword ptr [ecx ]
! fmul dword ptr [edx ]
! fld dword ptr [ecx+16]
! fmul dword ptr [edx+04]
! fld dword ptr [ecx+32]
! fmul dword ptr [edx+08]
! fld dword ptr [ecx+48]
! fmul dword ptr [edx+12]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp dword ptr [eax]
! add eax,4
! add ecx,4
! ret
END SUB
SUB MatrixMul5x5SFPU(BYREF mr() AS SINGLE, BYREF m1() AS SINGLE, BYREF m2() AS SINGLE )
DIM p AS SINGLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))
block:
! call column
! call column
! call column
! call column
! call column
EXIT SUB
column:
! call cell
! call cell
! call cell
! call cell
! call cell
! add edx,20
! sub ecx,20
! ret
cell: ' row A * column B
! fld dword ptr [ecx ]
! fmul dword ptr [edx ]
! fld dword ptr [ecx+20]
! fmul dword ptr [edx+04]
! fld dword ptr [ecx+40]
! fmul dword ptr [edx+08]
! fld dword ptr [ecx+60]
! fmul dword ptr [edx+12]
! fld dword ptr [ecx+80]
! fmul dword ptr [edx+16]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp dword ptr [eax]
! add eax,4
! add ecx,4
! ret
END SUB
'====================================
' DOUBLES
'====================================
SUB MatrixMul3x3DFPU(BYREF mr() AS DOUBLE, BYREF m1() AS DOUBLE, BYREF m2() AS DOUBLE )
DIM p AS DOUBLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))
block:
! call column
! call column
! call column
EXIT SUB
column:
! call cell
! call cell
! call cell
! add edx,24
! sub ecx,24
! ret
cell: ' row A * column B
! fld qword ptr [ecx ]
! fmul qword ptr [edx ]
! fld qword ptr [ecx+24]
! fmul qword ptr [edx+08]
! fld qword ptr [ecx+48]
! fmul qword ptr [edx+16]
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp qword ptr [eax]
! add eax,8
! add ecx,8
! ret
END SUB
SUB MatrixMul4x4DFPU(BYREF mr() AS DOUBLE, BYREF m1() AS DOUBLE, BYREF m2() AS DOUBLE )
DIM p AS DOUBLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))
block:
! call column
! call column
! call column
! call column
EXIT SUB
column:
! call cell
! call cell
! call cell
! call cell
! add edx,32
! sub ecx,32
! ret
cell: ' row A * column B
! fld qword ptr [ecx ]
! fmul qword ptr [edx ]
! fld qword ptr [ecx+32]
! fmul qword ptr [edx+08]
! fld qword ptr [ecx+64]
! fmul qword ptr [edx+16]
! fld qword ptr [ecx+96]
! fmul qword ptr [edx+24]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp qword ptr [eax]
! add eax,8
! add ecx,8
! ret
END SUB
SUB MatrixMul5x5DFPU(BYREF mr() AS DOUBLE, BYREF m1() AS DOUBLE, BYREF m2() AS DOUBLE )
DIM p AS DOUBLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))
block:
! call column
! call column
! call column
! call column
! call column
EXIT SUB
column:
! call cell
! call cell
! call cell
! call cell
! call cell
! add edx,40
! sub ecx,40
! ret
cell: ' row A * column B
! fld qword ptr [ecx ]
! fmul qword ptr [edx ]
! fld qword ptr [ecx+40]
! fmul qword ptr [edx+08]
! fld qword ptr [ecx+80]
! fmul qword ptr [edx+16]
! fld qword ptr [ecx+120]
! fmul qword ptr [edx+24]
! fld qword ptr [ecx+160]
! fmul qword ptr [edx+32]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp qword ptr [eax]
! add eax,8
! add ecx,8
! ret
END SUB
Correction to strides 28 July 2008
That is perfect Charles,
may I steal your code for TBGL?
Petr
Of course Petr,
Are there any other matrix operations we need to look at. Multiplication seems to be the most important as far as graphics is concerned.
Thanks a lot Charles,
I used multiplication only, as in 4x4 matrix all can be saved - scale, position and rotation.
Petr
The problem i have to deal with, is that modern digital photographier produce so huge image, that wanting to perform cumulative settings in real time even with assembly, seems to have reach a limit, that could not be further optimized without the help of a graphic accelerated hardware.
Thus i think it is better to use a preview using a small size (for example 512x512) to adjust the settings, then apply them to the real photography once all parameters have been set.
Anyway if ever you can push the limits furthermore with these FPU settings i would be glad to try it ;)