Asm: 4*4 Matrix Multiply

Started by Charles Pegge, July 26, 2008, 08:34:47 AM

Charles Pegge


Using FPU - compared with using pointers in PowerBasic



#COMPILE EXE
#DIM ALL

SUB MatrixMul4x4S(BYREF a() AS SINGLE, BYREF b() AS SINGLE, BYREF c() AS SINGLE )
DIM p AS SINGLE PTR

' each p=VARPTR(...) assignment leaves the address in eax,
' where the following inline-asm line can pick it up
p=VARPTR(a(0))
! mov ecx,eax               ' ecx -> a(), matrix A
p=VARPTR(b(0))
! mov edx,eax               ' edx -> b(), matrix B
p=VARPTR(c(0))              ' eax -> c(), the result matrix

block:
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! add edx,16
! sub ecx,16
! ret

cell: ' row A * column B
! fld   dword ptr [ecx   ]
! fmul  dword ptr [edx   ]
! fld   dword ptr [ecx+16]
! fmul  dword ptr [edx+04]
! fld   dword ptr [ecx+32]
! fmul  dword ptr [edx+08]
! fld   dword ptr [ecx+48]
! fmul  dword ptr [edx+12]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  dword ptr [eax]
! add   eax,4
! add   ecx,4
! ret

END SUB
       


SUB MatrixMul4x4(lhs() AS SINGLE, rhs() AS SINGLE, res() AS SINGLE)
     DIM lhsp AS SINGLE PTR:lhsp=VARPTR(lhs(0))
     DIM rhsp AS SINGLE PTR:rhsp=VARPTR(rhs(0))

     ' COLUMN 1

     res(00)=@lhsp[0]*@rhsp[00]+@lhsp[4]*@rhsp[01]+@lhsp[08]*@rhsp[02]+@lhsp[12]*@rhsp[03]
     res(01)=@lhsp[1]*@rhsp[00]+@lhsp[5]*@rhsp[01]+@lhsp[09]*@rhsp[02]+@lhsp[13]*@rhsp[03]
     res(02)=@lhsp[2]*@rhsp[00]+@lhsp[6]*@rhsp[01]+@lhsp[10]*@rhsp[02]+@lhsp[14]*@rhsp[03]
     res(03)=@lhsp[3]*@rhsp[00]+@lhsp[7]*@rhsp[01]+@lhsp[11]*@rhsp[02]+@lhsp[15]*@rhsp[03]

     ' COLUMN 2

     res(04)=@lhsp[0]*@rhsp[04]+@lhsp[4]*@rhsp[05]+@lhsp[08]*@rhsp[06]+@lhsp[12]*@rhsp[07]
     res(05)=@lhsp[1]*@rhsp[04]+@lhsp[5]*@rhsp[05]+@lhsp[09]*@rhsp[06]+@lhsp[13]*@rhsp[07]
     res(06)=@lhsp[2]*@rhsp[04]+@lhsp[6]*@rhsp[05]+@lhsp[10]*@rhsp[06]+@lhsp[14]*@rhsp[07]
     res(07)=@lhsp[3]*@rhsp[04]+@lhsp[7]*@rhsp[05]+@lhsp[11]*@rhsp[06]+@lhsp[15]*@rhsp[07]

     ' COLUMN 3

     res(08)=@lhsp[0]*@rhsp[08]+@lhsp[4]*@rhsp[09]+@lhsp[08]*@rhsp[10]+@lhsp[12]*@rhsp[11]
     res(09)=@lhsp[1]*@rhsp[08]+@lhsp[5]*@rhsp[09]+@lhsp[09]*@rhsp[10]+@lhsp[13]*@rhsp[11]
     res(10)=@lhsp[2]*@rhsp[08]+@lhsp[6]*@rhsp[09]+@lhsp[10]*@rhsp[10]+@lhsp[14]*@rhsp[11]
     res(11)=@lhsp[3]*@rhsp[08]+@lhsp[7]*@rhsp[09]+@lhsp[11]*@rhsp[10]+@lhsp[15]*@rhsp[11]

     ' COLUMN 4

     res(12)=@lhsp[0]*@rhsp[12]+@lhsp[4]*@rhsp[13]+@lhsp[08]*@rhsp[14]+@lhsp[12]*@rhsp[15]
     res(13)=@lhsp[1]*@rhsp[12]+@lhsp[5]*@rhsp[13]+@lhsp[09]*@rhsp[14]+@lhsp[13]*@rhsp[15]
     res(14)=@lhsp[2]*@rhsp[12]+@lhsp[6]*@rhsp[13]+@lhsp[10]*@rhsp[14]+@lhsp[14]*@rhsp[15]
     res(15)=@lhsp[3]*@rhsp[12]+@lhsp[7]*@rhsp[13]+@lhsp[11]*@rhsp[14]+@lhsp[15]*@rhsp[15]
END SUB






SUB getrow4S(BYREF matr AS SINGLE)
    ' test this
    DIM row4f(3) AS GLOBAL SINGLE
    DIM p AS SINGLE PTR
    p=VARPTR(matr)
    row4f(0)=@p[0]
    row4f(1)=@p[4]
    row4f(2)=@p[8]
    row4f(3)=@p[12]
END SUB

 


FUNCTION PBMAIN () AS LONG

   

END FUNCTION

Patrice Terrier

#1
Charles

Then, what do you think would be the fastest ASM solution to perform these (color matrix) operations?


SUB MatAssign(MR() AS SINGLE, MM() AS SINGLE)
    'ARRAY ASSIGN MR() = MM(0, 0), MM(0, 1), MM(0, 2), MM(0, 3), MM(0, 4), _
    '                    MM(1, 0), MM(1, 1), MM(1, 2), MM(1, 3), MM(1, 4), _
    '                    MM(2, 0), MM(2, 1), MM(2, 2), MM(2, 3), MM(2, 4), _
    '                    MM(3, 0), MM(3, 1), MM(3, 2), MM(3, 3), MM(3, 4), _
    '                    MM(4, 0), MM(4, 1), MM(4, 2), MM(4, 3), MM(4, 4)
    POKE$ VARPTR(MR(0,0)), PEEK$(VARPTR(MM(0,0)), 100)   ' copy 100 bytes = 25 SINGLEs, the whole 5x5 matrix
END SUB

SUB MatMultiply(MR() AS SINGLE, M1() AS SINGLE, M2() AS SINGLE)
    MR(0,0) = M1(0,0) * M2(0,0) + M1(0,1) * M2(1,0) + M1(0,2) * M2(2,0) + M1(0,3) * M2(3,0) + M1(0,4) * M2(4,0)
    MR(0,1) = M1(1,0) * M2(0,0) + M1(1,1) * M2(1,0) + M1(1,2) * M2(2,0) + M1(1,3) * M2(3,0) + M1(1,4) * M2(4,0)
    MR(0,2) = M1(2,0) * M2(0,0) + M1(2,1) * M2(1,0) + M1(2,2) * M2(2,0) + M1(2,3) * M2(3,0) + M1(2,4) * M2(4,0)
    MR(0,3) = M1(3,0) * M2(0,0) + M1(3,1) * M2(1,0) + M1(3,2) * M2(2,0) + M1(3,3) * M2(3,0) + M1(3,4) * M2(4,0)
    MR(0,4) = M1(4,0) * M2(0,0) + M1(4,1) * M2(1,0) + M1(4,2) * M2(2,0) + M1(4,3) * M2(3,0) + M1(4,4) * M2(4,0)
    MR(1,0) = M1(0,0) * M2(0,1) + M1(0,1) * M2(1,1) + M1(0,2) * M2(2,1) + M1(0,3) * M2(3,1) + M1(0,4) * M2(4,1)
    MR(1,1) = M1(1,0) * M2(0,1) + M1(1,1) * M2(1,1) + M1(1,2) * M2(2,1) + M1(1,3) * M2(3,1) + M1(1,4) * M2(4,1)
    MR(1,2) = M1(2,0) * M2(0,1) + M1(2,1) * M2(1,1) + M1(2,2) * M2(2,1) + M1(2,3) * M2(3,1) + M1(2,4) * M2(4,1)
    MR(1,3) = M1(3,0) * M2(0,1) + M1(3,1) * M2(1,1) + M1(3,2) * M2(2,1) + M1(3,3) * M2(3,1) + M1(3,4) * M2(4,1)
    MR(1,4) = M1(4,0) * M2(0,1) + M1(4,1) * M2(1,1) + M1(4,2) * M2(2,1) + M1(4,3) * M2(3,1) + M1(4,4) * M2(4,1)
    MR(2,0) = M1(0,0) * M2(0,2) + M1(0,1) * M2(1,2) + M1(0,2) * M2(2,2) + M1(0,3) * M2(3,2) + M1(0,4) * M2(4,2)
    MR(2,1) = M1(1,0) * M2(0,2) + M1(1,1) * M2(1,2) + M1(1,2) * M2(2,2) + M1(1,3) * M2(3,2) + M1(1,4) * M2(4,2)
    MR(2,2) = M1(2,0) * M2(0,2) + M1(2,1) * M2(1,2) + M1(2,2) * M2(2,2) + M1(2,3) * M2(3,2) + M1(2,4) * M2(4,2)
    MR(2,3) = M1(3,0) * M2(0,2) + M1(3,1) * M2(1,2) + M1(3,2) * M2(2,2) + M1(3,3) * M2(3,2) + M1(3,4) * M2(4,2)
    MR(2,4) = M1(4,0) * M2(0,2) + M1(4,1) * M2(1,2) + M1(4,2) * M2(2,2) + M1(4,3) * M2(3,2) + M1(4,4) * M2(4,2)
    MR(3,0) = M1(0,0) * M2(0,3) + M1(0,1) * M2(1,3) + M1(0,2) * M2(2,3) + M1(0,3) * M2(3,3) + M1(0,4) * M2(4,3)
    MR(3,1) = M1(1,0) * M2(0,3) + M1(1,1) * M2(1,3) + M1(1,2) * M2(2,3) + M1(1,3) * M2(3,3) + M1(1,4) * M2(4,3)
    MR(3,2) = M1(2,0) * M2(0,3) + M1(2,1) * M2(1,3) + M1(2,2) * M2(2,3) + M1(2,3) * M2(3,3) + M1(2,4) * M2(4,3)
    MR(3,3) = M1(3,0) * M2(0,3) + M1(3,1) * M2(1,3) + M1(3,2) * M2(2,3) + M1(3,3) * M2(3,3) + M1(3,4) * M2(4,3)
    MR(3,4) = M1(4,0) * M2(0,3) + M1(4,1) * M2(1,3) + M1(4,2) * M2(2,3) + M1(4,3) * M2(3,3) + M1(4,4) * M2(4,3)
    MR(4,0) = M1(0,0) * M2(0,4) + M1(0,1) * M2(1,4) + M1(0,2) * M2(2,4) + M1(0,3) * M2(3,4) + M1(0,4) * M2(4,4)
    MR(4,1) = M1(1,0) * M2(0,4) + M1(1,1) * M2(1,4) + M1(1,2) * M2(2,4) + M1(1,3) * M2(3,4) + M1(1,4) * M2(4,4)
    MR(4,2) = M1(2,0) * M2(0,4) + M1(2,1) * M2(1,4) + M1(2,2) * M2(2,4) + M1(2,3) * M2(3,4) + M1(2,4) * M2(4,4)
    MR(4,3) = M1(3,0) * M2(0,4) + M1(3,1) * M2(1,4) + M1(3,2) * M2(2,4) + M1(3,3) * M2(3,4) + M1(3,4) * M2(4,4)
    MR(4,4) = M1(4,0) * M2(0,4) + M1(4,1) * M2(1,4) + M1(4,2) * M2(2,4) + M1(4,3) * M2(3,4) + M1(4,4) * M2(4,4)
END SUB

Patrice Terrier
GDImage (advanced graphic addon)
http://www.zapsolution.com

Charles Pegge

Hi Patrice

I've tested your code in comparison with the pointer version and the FPU assembly code version.

My FPU version is about 10X faster. We can attribute this difference to the movement of data between memory and registers and to the calculation of array element addresses, since the maths operations are identical.

There is a method described by Intel using SSE2 SIMD instructions which is twice the speed of the FPU. Unfortunately, PB8 does not recognise the SSE2 instructions, so we would have to go down to machine code.
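
For example, an instruction the PB8 inline assembler does not accept can still be emitted as raw opcode bytes, which is the technique used in the machine-code listing later in this thread (a minimal one-line sketch):

! db &h0f, &h58, &hC1                     ;  addps xmm0, xmm1  (hand-encoded SSE instruction)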


FUNCTION PBMAIN () AS LONG

DIM r(15) AS SINGLE
DIM a(15) AS SINGLE
DIM b(15) AS SINGLE

DIM i AS LONG
DIM t AS SINGLE

t=TIMER
FOR i=1 TO 1e6
' MatMultiply(r(),a(),b())   ' 1.6  seconds using 2d arrays
' MatrixMul4x4(a(),b(),r())  ' 0.39 seconds using 1d pointers
  MatrixMul4x4S(a(),b(),r()) ' 0.17 seconds using FPU assembly code
NEXT

MSGBOX STR$(TIMER-t)

END FUNCTION

Patrice Terrier

Charles

I have checked with the pointer version (see below) and I can't see any speed boost on my side  :(


SUB MatMultiply(MR() AS SINGLE, M1() AS SINGLE, M2() AS SINGLE)
'    MR(0,0) = M1(0,0) * M2(0,0) + M1(0,1) * M2(1,0) + M1(0,2) * M2(2,0) + M1(0,3) * M2(3,0) + M1(0,4) * M2(4,0)
'    MR(0,1) = M1(1,0) * M2(0,0) + M1(1,1) * M2(1,0) + M1(1,2) * M2(2,0) + M1(1,3) * M2(3,0) + M1(1,4) * M2(4,0)
'    MR(0,2) = M1(2,0) * M2(0,0) + M1(2,1) * M2(1,0) + M1(2,2) * M2(2,0) + M1(2,3) * M2(3,0) + M1(2,4) * M2(4,0)
'    MR(0,3) = M1(3,0) * M2(0,0) + M1(3,1) * M2(1,0) + M1(3,2) * M2(2,0) + M1(3,3) * M2(3,0) + M1(3,4) * M2(4,0)
'    MR(0,4) = M1(4,0) * M2(0,0) + M1(4,1) * M2(1,0) + M1(4,2) * M2(2,0) + M1(4,3) * M2(3,0) + M1(4,4) * M2(4,0)
'    MR(1,0) = M1(0,0) * M2(0,1) + M1(0,1) * M2(1,1) + M1(0,2) * M2(2,1) + M1(0,3) * M2(3,1) + M1(0,4) * M2(4,1)
'    MR(1,1) = M1(1,0) * M2(0,1) + M1(1,1) * M2(1,1) + M1(1,2) * M2(2,1) + M1(1,3) * M2(3,1) + M1(1,4) * M2(4,1)
'    MR(1,2) = M1(2,0) * M2(0,1) + M1(2,1) * M2(1,1) + M1(2,2) * M2(2,1) + M1(2,3) * M2(3,1) + M1(2,4) * M2(4,1)
'    MR(1,3) = M1(3,0) * M2(0,1) + M1(3,1) * M2(1,1) + M1(3,2) * M2(2,1) + M1(3,3) * M2(3,1) + M1(3,4) * M2(4,1)
'    MR(1,4) = M1(4,0) * M2(0,1) + M1(4,1) * M2(1,1) + M1(4,2) * M2(2,1) + M1(4,3) * M2(3,1) + M1(4,4) * M2(4,1)
'    MR(2,0) = M1(0,0) * M2(0,2) + M1(0,1) * M2(1,2) + M1(0,2) * M2(2,2) + M1(0,3) * M2(3,2) + M1(0,4) * M2(4,2)
'    MR(2,1) = M1(1,0) * M2(0,2) + M1(1,1) * M2(1,2) + M1(1,2) * M2(2,2) + M1(1,3) * M2(3,2) + M1(1,4) * M2(4,2)
'    MR(2,2) = M1(2,0) * M2(0,2) + M1(2,1) * M2(1,2) + M1(2,2) * M2(2,2) + M1(2,3) * M2(3,2) + M1(2,4) * M2(4,2)
'    MR(2,3) = M1(3,0) * M2(0,2) + M1(3,1) * M2(1,2) + M1(3,2) * M2(2,2) + M1(3,3) * M2(3,2) + M1(3,4) * M2(4,2)
'    MR(2,4) = M1(4,0) * M2(0,2) + M1(4,1) * M2(1,2) + M1(4,2) * M2(2,2) + M1(4,3) * M2(3,2) + M1(4,4) * M2(4,2)
'    MR(3,0) = M1(0,0) * M2(0,3) + M1(0,1) * M2(1,3) + M1(0,2) * M2(2,3) + M1(0,3) * M2(3,3) + M1(0,4) * M2(4,3)
'    MR(3,1) = M1(1,0) * M2(0,3) + M1(1,1) * M2(1,3) + M1(1,2) * M2(2,3) + M1(1,3) * M2(3,3) + M1(1,4) * M2(4,3)
'    MR(3,2) = M1(2,0) * M2(0,3) + M1(2,1) * M2(1,3) + M1(2,2) * M2(2,3) + M1(2,3) * M2(3,3) + M1(2,4) * M2(4,3)
'    MR(3,3) = M1(3,0) * M2(0,3) + M1(3,1) * M2(1,3) + M1(3,2) * M2(2,3) + M1(3,3) * M2(3,3) + M1(3,4) * M2(4,3)
'    MR(3,4) = M1(4,0) * M2(0,3) + M1(4,1) * M2(1,3) + M1(4,2) * M2(2,1) + M1(4,3) * M2(3,3) + M1(4,4) * M2(4,3)
'    MR(4,0) = M1(0,0) * M2(0,4) + M1(0,1) * M2(1,4) + M1(0,2) * M2(2,4) + M1(0,3) * M2(3,4) + M1(0,4) * M2(4,4)
'    MR(4,1) = M1(1,0) * M2(0,4) + M1(1,1) * M2(1,4) + M1(1,2) * M2(2,4) + M1(1,3) * M2(3,4) + M1(1,4) * M2(4,4)
'    MR(4,2) = M1(2,0) * M2(0,4) + M1(2,1) * M2(1,4) + M1(2,2) * M2(2,4) + M1(2,3) * M2(3,4) + M1(2,4) * M2(4,4)
'    MR(4,3) = M1(3,0) * M2(0,4) + M1(3,1) * M2(1,4) + M1(3,2) * M2(2,4) + M1(3,3) * M2(3,4) + M1(3,4) * M2(4,4)
'    MR(4,4) = M1(4,0) * M2(0,4) + M1(4,1) * M2(1,4) + M1(4,2) * M2(2,4) + M1(4,3) * M2(3,4) + M1(4,4) * M2(4,4)
   
     DIM M1p AS SINGLE PTR: M1p = VARPTR(M1(0,0))
     DIM M2p AS SINGLE PTR: M2p = VARPTR(M2(0,0))

     MR(0,0)=@M1p[0]*@M2p[00]+@M1p[5]*@M2p[01]+@M1p[10]*@M2p[02]+@M1p[15]*@M2p[03]+@M1p[20]*@M2p[04]
     MR(0,1)=@M1p[1]*@M2p[00]+@M1p[6]*@M2p[01]+@M1p[11]*@M2p[02]+@M1p[16]*@M2p[03]+@M1p[21]*@M2p[04]
     MR(0,2)=@M1p[2]*@M2p[00]+@M1p[7]*@M2p[01]+@M1p[12]*@M2p[02]+@M1p[17]*@M2p[03]+@M1p[22]*@M2p[04]
     MR(0,3)=@M1p[3]*@M2p[00]+@M1p[8]*@M2p[01]+@M1p[13]*@M2p[02]+@M1p[18]*@M2p[03]+@M1p[23]*@M2p[04]
     MR(0,4)=@M1p[4]*@M2p[00]+@M1p[9]*@M2p[01]+@M1p[14]*@M2p[02]+@M1p[19]*@M2p[03]+@M1p[24]*@M2p[04]

     MR(1,0)=@M1p[0]*@M2p[05]+@M1p[5]*@M2p[06]+@M1p[10]*@M2p[07]+@M1p[15]*@M2p[08]+@M1p[20]*@M2p[09]
     MR(1,1)=@M1p[1]*@M2p[05]+@M1p[6]*@M2p[06]+@M1p[11]*@M2p[07]+@M1p[16]*@M2p[08]+@M1p[21]*@M2p[09]
     MR(1,2)=@M1p[2]*@M2p[05]+@M1p[7]*@M2p[06]+@M1p[12]*@M2p[07]+@M1p[17]*@M2p[08]+@M1p[22]*@M2p[09]
     MR(1,3)=@M1p[3]*@M2p[05]+@M1p[8]*@M2p[06]+@M1p[13]*@M2p[07]+@M1p[18]*@M2p[08]+@M1p[23]*@M2p[09]
     MR(1,4)=@M1p[4]*@M2p[05]+@M1p[9]*@M2p[06]+@M1p[14]*@M2p[07]+@M1p[19]*@M2p[08]+@M1p[24]*@M2p[09]

     MR(2,0)=@M1p[0]*@M2p[10]+@M1p[5]*@M2p[11]+@M1p[10]*@M2p[12]+@M1p[15]*@M2p[13]+@M1p[20]*@M2p[14]
     MR(2,1)=@M1p[1]*@M2p[10]+@M1p[6]*@M2p[11]+@M1p[11]*@M2p[12]+@M1p[16]*@M2p[13]+@M1p[21]*@M2p[14]
     MR(2,2)=@M1p[2]*@M2p[10]+@M1p[7]*@M2p[11]+@M1p[12]*@M2p[12]+@M1p[17]*@M2p[13]+@M1p[22]*@M2p[14]
     MR(2,3)=@M1p[3]*@M2p[10]+@M1p[8]*@M2p[11]+@M1p[13]*@M2p[12]+@M1p[18]*@M2p[13]+@M1p[23]*@M2p[14]
     MR(2,4)=@M1p[4]*@M2p[10]+@M1p[9]*@M2p[11]+@M1p[14]*@M2p[12]+@M1p[19]*@M2p[13]+@M1p[24]*@M2p[14]

     MR(3,0)=@M1p[0]*@M2p[15]+@M1p[5]*@M2p[16]+@M1p[10]*@M2p[17]+@M1p[15]*@M2p[18]+@M1p[20]*@M2p[19]
     MR(3,1)=@M1p[1]*@M2p[15]+@M1p[6]*@M2p[16]+@M1p[11]*@M2p[17]+@M1p[16]*@M2p[18]+@M1p[21]*@M2p[19]
     MR(3,2)=@M1p[2]*@M2p[15]+@M1p[7]*@M2p[16]+@M1p[12]*@M2p[17]+@M1p[17]*@M2p[18]+@M1p[22]*@M2p[19]
     MR(3,3)=@M1p[3]*@M2p[15]+@M1p[8]*@M2p[16]+@M1p[13]*@M2p[17]+@M1p[18]*@M2p[18]+@M1p[23]*@M2p[19]
     MR(3,4)=@M1p[4]*@M2p[15]+@M1p[9]*@M2p[16]+@M1p[14]*@M2p[17]+@M1p[19]*@M2p[18]+@M1p[24]*@M2p[19]

     MR(4,0)=@M1p[0]*@M2p[20]+@M1p[5]*@M2p[21]+@M1p[10]*@M2p[22]+@M1p[15]*@M2p[23]+@M1p[20]*@M2p[24]
     MR(4,1)=@M1p[1]*@M2p[20]+@M1p[6]*@M2p[21]+@M1p[11]*@M2p[22]+@M1p[16]*@M2p[23]+@M1p[21]*@M2p[24]
     MR(4,2)=@M1p[2]*@M2p[20]+@M1p[7]*@M2p[21]+@M1p[12]*@M2p[22]+@M1p[17]*@M2p[23]+@M1p[22]*@M2p[24]
     MR(4,3)=@M1p[3]*@M2p[20]+@M1p[8]*@M2p[21]+@M1p[13]*@M2p[22]+@M1p[18]*@M2p[23]+@M1p[23]*@M2p[24]
     MR(4,4)=@M1p[4]*@M2p[20]+@M1p[9]*@M2p[21]+@M1p[14]*@M2p[22]+@M1p[19]*@M2p[23]+@M1p[24]*@M2p[24]
   
END SUB
Patrice Terrier
GDImage (advanced graphic addon)
http://www.zapsolution.com

Patrice Terrier

#4
I just checked using GDIPLUS:
CALL GdipMultiplyMatrix(BYVAL VARPTR(MR(0,0)), BYVAL VARPTR(M2(0,0)), 0)

It doesn't match my needs...
Patrice Terrier
GDImage (advanced graphic addon)
http://www.zapsolution.com

Charles Pegge


I think the difference is that you are using 2-dimensional arrays and pointers; mine is a simple array of 16 elements, which the compiler resolves directly into address offsets. No address calculations are required.
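
To illustrate the layout behind the pointer arithmetic in this thread, a hedged sketch (assuming PB stores a 2-dimensional array with the first subscript varying fastest, which is what the offsets in the 5x5 pointer code above imply): element M(i,j) of a 5x5 SINGLE array sits at flat element index i + 5*j, and a 1-dimensional array indexed with constants resolves straight to those fixed offsets.

' Hypothetical helper, not part of the original code:
FUNCTION Flat5x5(BYVAL i AS LONG, BYVAL j AS LONG) AS LONG
    FUNCTION = i + 5 * j     ' element index; multiply by 4 for the byte offset of a SINGLE
END FUNCTION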

PS: I wonder how fast GdipMultiplyMatrix is.

Patrice Terrier

GdipMultiplyMatrix is not suitable for manipulating the GDImage color matrix with an alpha channel.
Patrice Terrier
GDImage (advanced graphic addon)
http://www.zapsolution.com

Patrice Terrier

After doing several benchmarks, it looks like (in my case) the PowerBASIC built-in MAT statement would be the best solution for cumulative settings.

8)
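
For reference, the built-in approach might look like this (a minimal sketch, assuming PB's MAT statement accepts 5x5 SINGLE arrays in this form; not benchmarked here):

DIM MR(4,4) AS SINGLE, M1(4,4) AS SINGLE, M2(4,4) AS SINGLE
MAT MR() = M1() * M2()      ' built-in matrix multiply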
Patrice Terrier
GDImage (advanced graphic addon)
http://www.zapsolution.com

Charles Pegge

#8
This is an Intel example of 4x4 matrix multiplication. I translated it into Asmosphere under thinBasic, and it is rendered here as machine code for PB.
It is the fastest, taking 0.1 seconds to compute 1,000,000 matrices. The code is compatible with Pentium III processors (circa 1999), so it should be functional on most PCs.

PS: I don't know how it works :)

Streaming SIMD Extensions -
Matrix Multiplication


http://download.intel.com/design/PentiumIII/sml/24504501.pdf



SUB MatrixMul4x4SSE(BYREF a() AS SINGLE, BYREF b() AS SINGLE, BYREF c() AS SINGLE )
DIM p AS SINGLE PTR

p=VARPTR(a(0))
! mov ecx,eax                             ;  ecx -> a(), rows read with movups
p=VARPTR(b(0))
! mov edx,eax                             ;  edx -> b(), scalars read with movss
p=VARPTR(c(0))                            '  eax -> c(), result written with movlps/movhps/movups
'.sse_matrix_mul

! db &hf3, &h0f, &h10, &h02                 ;  movss  xmm0, [edx]
! db &h0f, &h10, &h09                       ;  movups xmm1, [ecx]
! db &h0f, &hC6, &hC0, &h00                 ;  shufps xmm0, xmm0, 0
! db &hf3, &h0f, &h10, &h52, &h04           ;  movss  xmm2, [edx+4]
! db &h0f, &h59, &hC1                       ;  mulps  xmm0, xmm1
! db &h0f, &hC6, &hD2, &h00                 ;  shufps xmm2, xmm2, 0
! db &h0f, &h10, &h59, &h10                 ;  movups xmm3, [ecx+16]
! db &hf3, &h0f, &h10, &h7A, &h08           ;  movss  xmm7, [edx+8]
! db &h0f, &h59, &hD3                       ;  mulps  xmm2, xmm3
! db &h0f, &hC6, &hFF, &h00                 ;  shufps xmm7, xmm7, 0
! db &h0f, &h58, &hC2                       ;  addps  xmm0, xmm2
! db &h0f, &h10, &h61, &h20                 ;  movups xmm4, [ecx+32]
! db &hf3, &h0f, &h10, &h52, &h0C           ;  movss  xmm2, [edx+12]
! db &h0f, &h59, &hFC                       ;  mulps  xmm7, xmm4
! db &h0f, &hC6, &hD2, &h00                 ;  shufps xmm2, xmm2, 0
! db &h0f, &h58, &hC7                       ;  addps  xmm0, xmm7
! db &h0f, &h10, &h69, &h30                 ;  movups xmm5, [ecx+48]
! db &hf3, &h0f, &h10, &h72, &h10           ;  movss  xmm6, [edx+16]
! db &h0f, &h59, &hD5                       ;  mulps  xmm2, xmm5
! db &hf3, &h0f, &h10, &h7A, &h14           ;  movss  xmm7, [edx+20]
! db &h0f, &hC6, &hF6, &h00                 ;  shufps xmm6, xmm6, 0
! db &h0f, &h58, &hC2                       ;  addps  xmm0, xmm2
! db &h0f, &hC6, &hFF, &h00                 ;  shufps xmm7, xmm7, 0
! db &h0f, &h13, &h00                       ;  movlps [eax], xmm0
! db &h0f, &h17, &h40, &h08                 ;  movhps [eax+8], xmm0
! db &h0f, &h59, &hFB                       ;  mulps  xmm7, xmm3
! db &hf3, &h0f, &h10, &h42, &h18           ;  movss  xmm0, [edx+24]
! db &h0f, &h59, &hF1                       ;  mulps  xmm6, xmm1
! db &h0f, &hC6, &hC0, &h00                 ;  shufps xmm0, xmm0, 0
! db &h0f, &h58, &hF7                       ;  addps  xmm6, xmm7
! db &h0f, &h59, &hC4                       ;  mulps  xmm0, xmm4
! db &hf3, &h0f, &h10, &h52, &h24           ;  movss  xmm2, [edx+36]
! db &h0f, &h58, &hF0                       ;  addps  xmm6, xmm0
! db &hf3, &h0f, &h10, &h42, &h1C           ;  movss  xmm0, [edx+28]
! db &hf3, &h0f, &h10, &h7A, &h20           ;  movss  xmm7, [edx+32]
! db &h0f, &hC6, &hC0, &h00                 ;  shufps xmm0, xmm0, 0
! db &h0f, &hC6, &hFF, &h00                 ;  shufps xmm7, xmm7, 0
! db &h0f, &h59, &hC5                       ;  mulps  xmm0, xmm5
! db &h0f, &h59, &hF9                       ;  mulps  xmm7, xmm1
! db &h0f, &h58, &hF0                       ;  addps  xmm6, xmm0
! db &h0f, &hC6, &hD2, &h00                 ;  shufps xmm2, xmm2, 0
! db &h0f, &h13, &h70, &h10                 ;  movlps [eax+16], xmm6
! db &h0f, &h17, &h70, &h18                 ;  movhps [eax+24], xmm6
! db &h0f, &h59, &hD3                       ;  mulps  xmm2, xmm3
! db &hf3, &h0f, &h10, &h72, &h28           ;  movss  xmm6, [edx+40]
! db &h0f, &h58, &hFA                       ;  addps  xmm7, xmm2
! db &h0f, &hC6, &hF6, &h00                 ;  shufps xmm6, xmm6, 0
! db &hf3, &h0f, &h10, &h52, &h2C           ;  movss  xmm2, [edx+44]
! db &h0f, &h59, &hF4                       ;  mulps  xmm6, xmm4
! db &h0f, &hC6, &hD2, &h00                 ;  shufps xmm2, xmm2, 0
! db &h0f, &h58, &hFE                       ;  addps  xmm7, xmm6
! db &h0f, &h59, &hD5                       ;  mulps  xmm2, xmm5
! db &hf3, &h0f, &h10, &h42, &h34           ;  movss  xmm0, [edx+52]
! db &h0f, &h58, &hFA                       ;  addps  xmm7, xmm2
! db &h0f, &hC6, &hC0, &h00                 ;  shufps xmm0, xmm0, 0
! db &h0f, &h13, &h78, &h20                 ;  movlps [eax+32], xmm7
! db &hf3, &h0f, &h10, &h52, &h30           ;  movss  xmm2, [edx+48]
! db &h0f, &h17, &h78, &h28                 ;  movhps [eax+40], xmm7
! db &h0f, &h59, &hC3                       ;  mulps  xmm0, xmm3
! db &h0f, &hC6, &hD2, &h00                 ;  shufps xmm2, xmm2, 0
! db &hf3, &h0f, &h10, &h72, &h38           ;  movss  xmm6, [edx+56]
! db &h0f, &h59, &hD1                       ;  mulps  xmm2, xmm1
! db &h0f, &hC6, &hF6, &h00                 ;  shufps xmm6, xmm6, 0
! db &h0f, &h58, &hD0                       ;  addps  xmm2, xmm0
! db &h0f, &h59, &hF4                       ;  mulps  xmm6, xmm4
! db &hf3, &h0f, &h10, &h7A, &h3C           ;  movss  xmm7, [edx+60]
! db &h0f, &hC6, &hFF, &h00                 ;  shufps xmm7, xmm7, 0
! db &h0f, &h58, &hD6                       ;  addps  xmm2, xmm6
! db &h0f, &h59, &hFD                       ;  mulps  xmm7, xmm5
! db &h0f, &h58, &hD7                       ;  addps  xmm2, xmm7
! db &h0f, &h11, &h50, &h30                 ;  movups [eax+48], xmm2

END SUB


 



FUNCTION PBMAIN () AS LONG

DIM r(15) AS SINGLE
DIM a(15) AS SINGLE
DIM b(15) AS SINGLE

DIM i AS LONG
DIM t AS SINGLE

t=TIMER
FOR i=1 TO 1e6
'  MatMultiply(r(),a(),b())     ' 1.6  seconds using 2d arrays
'  MatMultiplyP(r(),a(),b())    ' 1.5  seconds using 2d arrays and pointers
'  MatrixMul4x4(a(),b(),r())    ' 0.39 seconds using 1d pointers
'  MatrixMul4x4FPU(a(),b(),r()) ' 0.17 seconds using FPU assembly code
   MatrixMul4x4SSE(a(),b(),r()) ' 0.1 seconds using SSE assembly code
NEXT

MSGBOX STR$(TIMER-t)

END FUNCTION
                   

Patrice Terrier

Charles

I am using a 5x5 color matrix, not 4x4 or 3x3, which would of course be much faster.

If you can make it work for 5x5 then I could try it, but I am not an assembly guy myself  :'(
Patrice Terrier
GDImage (advanced graphic addon)
http://www.zapsolution.com

Charles Pegge


Patrice,

This takes 0.27 secs per million calcs for a 5x5 matrix. I have not functionally tested it, but it is a simple expansion of the 4x4 function.

The result is returned in mr(), matching your example prototypes.


SUB MatrixMul5x5FPU(BYREF mr() AS SINGLE, BYREF m1() AS SINGLE, BYREF m2() AS SINGLE )
DIM p AS SINGLE PTR

p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! call cell
! add edx,20
! sub ecx,20
! ret

cell: ' row A * column B
! fld   dword ptr [ecx   ]
! fmul  dword ptr [edx   ]
! fld   dword ptr [ecx+20]
! fmul  dword ptr [edx+04]
! fld   dword ptr [ecx+40]
! fmul  dword ptr [edx+08]
! fld   dword ptr [ecx+60]
! fmul  dword ptr [edx+12]
! fld   dword ptr [ecx+80]
! fmul  dword ptr [edx+16]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  dword ptr [eax]
! add   eax,4
! add   ecx,4
! ret
END SUB
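
A possible calling sketch, with hypothetical array names, using 1-dimensional arrays of 25 SINGLEs as in the earlier benchmark loops:

DIM mr(24) AS SINGLE
DIM m1(24) AS SINGLE
DIM m2(24) AS SINGLE
' ... fill m1() and m2(), then:
MatrixMul5x5FPU(mr(), m1(), m2())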


Patrice Terrier

Charles

Thank you very much!

I did check it against the built-in PB MAT function, and I would say that this confirms the PB one was already very fast; however, yours seems even faster ;)

I shall post the new GDImage.dll in the PhotoSetup thread, for those wanting to try it.

Thanks again.
Patrice Terrier
GDImage (advanced graphic addon)
http://www.zapsolution.com

Charles Pegge

#12
Glad it was useful Patrice.

Here are 3x3, 4x4 and 5x5 matrix multiplications using single and double precision. You can see how the code changes for each matrix size.

Interestingly, there seems to be very little difference in speed between single precision and double precision.
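
Between the variants below, what changes is the number of column/cell calls, the fld offsets, and the stride used to step through the matrices; each stride is simply the row length in bytes:

    stride = N * element size
    3x3 SINGLE: 3*4 = 12    4x4 SINGLE: 4*4 = 16    5x5 SINGLE: 5*4 = 20
    3x3 DOUBLE: 3*8 = 24    4x4 DOUBLE: 4*8 = 32    5x5 DOUBLE: 5*8 = 40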



'================
' SINGLES
'================



SUB MatrixMul3x3SFPU(BYREF mr() AS SINGLE, BYREF m1() AS SINGLE, BYREF m2() AS SINGLE )
DIM p AS SINGLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! add edx,12
! sub ecx,12
! ret

cell: ' row A * column B
! fld   dword ptr [ecx   ]
! fmul  dword ptr [edx   ]
! fld   dword ptr [ecx+12]
! fmul  dword ptr [edx+04]
! fld   dword ptr [ecx+24]
! fmul  dword ptr [edx+08]
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  dword ptr [eax]
! add   eax,4
! add   ecx,4
! ret
END SUB

SUB MatrixMul4x4SFPU(BYREF mr() AS SINGLE, BYREF m1() AS SINGLE, BYREF m2() AS SINGLE )
DIM p AS SINGLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! add edx,16
! sub ecx,16
! ret

cell: ' row A * column B
! fld   dword ptr [ecx   ]
! fmul  dword ptr [edx   ]
! fld   dword ptr [ecx+16]
! fmul  dword ptr [edx+04]
! fld   dword ptr [ecx+32]
! fmul  dword ptr [edx+08]
! fld   dword ptr [ecx+48]
! fmul  dword ptr [edx+12]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  dword ptr [eax]
! add   eax,4
! add   ecx,4
! ret
END SUB

SUB MatrixMul5x5SFPU(BYREF mr() AS SINGLE, BYREF m1() AS SINGLE, BYREF m2() AS SINGLE )
DIM p AS SINGLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! call cell
! add edx,20
! sub ecx,20
! ret

cell: ' row A * column B
! fld   dword ptr [ecx   ]
! fmul  dword ptr [edx   ]
! fld   dword ptr [ecx+20]
! fmul  dword ptr [edx+04]
! fld   dword ptr [ecx+40]
! fmul  dword ptr [edx+08]
! fld   dword ptr [ecx+60]
! fmul  dword ptr [edx+12]
! fld   dword ptr [ecx+80]
! fmul  dword ptr [edx+16]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  dword ptr [eax]
! add   eax,4
! add   ecx,4
! ret
END SUB
 
'====================================
'  DOUBLES
'====================================


SUB MatrixMul3x3DFPU(BYREF mr() AS DOUBLE, BYREF m1() AS DOUBLE, BYREF m2() AS DOUBLE )
DIM p AS DOUBLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! add edx,24
! sub ecx,24
! ret

cell: ' row A * column B
! fld   qword ptr [ecx   ]
! fmul  qword ptr [edx   ]
! fld   qword ptr [ecx+24]
! fmul  qword ptr [edx+08]
! fld   qword ptr [ecx+48]
! fmul  qword ptr [edx+16]
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  qword ptr [eax]
! add   eax,8
! add   ecx,8
! ret
END SUB


SUB MatrixMul4x4DFPU(BYREF mr() AS DOUBLE, BYREF m1() AS DOUBLE, BYREF m2() AS DOUBLE )
DIM p AS DOUBLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! add edx,32
! sub ecx,32
! ret

cell: ' row A * column B
! fld   qword ptr [ecx   ]
! fmul  qword ptr [edx   ]
! fld   qword ptr [ecx+32]
! fmul  qword ptr [edx+08]
! fld   qword ptr [ecx+64]
! fmul  qword ptr [edx+16]
! fld   qword ptr [ecx+96]
! fmul  qword ptr [edx+24]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  qword ptr [eax]
! add   eax,8
! add   ecx,8
! ret
END SUB

SUB MatrixMul5x5DFPU(BYREF mr() AS DOUBLE, BYREF m1() AS DOUBLE, BYREF m2() AS DOUBLE )
DIM p AS DOUBLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! call cell
! add edx,40
! sub ecx,40
! ret

cell: ' row A * column B
! fld   qword ptr [ecx   ]
! fmul  qword ptr [edx   ]
! fld   qword ptr [ecx+40]
! fmul  qword ptr [edx+08]
! fld   qword ptr [ecx+80]
! fmul  qword ptr [edx+16]
! fld   qword ptr [ecx+120]
! fmul  qword ptr [edx+24]
! fld   qword ptr [ecx+160]
! fmul  qword ptr [edx+32]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  qword ptr [eax]
! add   eax,8
! add   ecx,8
! ret
END SUB


                 



Correction to strides 28 July 2008

Petr Schreiber

That is perfect, Charles,

may I steal your code for TBGL?


Petr
AMD Sempron 3400+ | 1GB RAM @ 533MHz | GeForce 6200 / GeForce 9500GT | 32bit Windows XP SP3

psch.thinbasic.com

Charles Pegge

Of course, Petr.

Are there any other matrix operations we need to look at? Multiplication seems to be the most important as far as graphics is concerned.
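
One other operation worth a look for graphics is the 4x4 matrix * vector transform applied to vertices; a hedged sketch in the same pointer style as MatrixMul4x4 above (hypothetical names, untested):

SUB MatVec4(m() AS SINGLE, v() AS SINGLE, r() AS SINGLE)
    ' r = m * v, indexing m() the same way as lhs() in MatrixMul4x4
    DIM mp AS SINGLE PTR : mp = VARPTR(m(0))
    DIM vp AS SINGLE PTR : vp = VARPTR(v(0))
    r(0) = @mp[0]*@vp[0] + @mp[4]*@vp[1] + @mp[8]*@vp[2]  + @mp[12]*@vp[3]
    r(1) = @mp[1]*@vp[0] + @mp[5]*@vp[1] + @mp[9]*@vp[2]  + @mp[13]*@vp[3]
    r(2) = @mp[2]*@vp[0] + @mp[6]*@vp[1] + @mp[10]*@vp[2] + @mp[14]*@vp[3]
    r(3) = @mp[3]*@vp[0] + @mp[7]*@vp[1] + @mp[11]*@vp[2] + @mp[15]*@vp[3]
END SUB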