Asm: 4*4 Matrix Multiply

Started by Charles Pegge, July 26, 2008, 08:34:47 AM

Charles Pegge


Using FPU - compared with using pointers in PowerBasic



#COMPILE EXE
#DIM ALL

SUB MatrixMul4x4S(BYREF a() AS SINGLE, BYREF b() AS SINGLE, BYREF c() AS SINGLE )
DIM p AS SINGLE PTR

' each p=VARPTR(...) assignment leaves the address in eax,
' where the following inline-asm line can pick it up
p=VARPTR(a(0))
! mov ecx,eax               ' ecx -> a(), matrix A
p=VARPTR(b(0))
! mov edx,eax               ' edx -> b(), matrix B
p=VARPTR(c(0))              ' eax -> c(), the result matrix

block:
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! add edx,16
! sub ecx,16
! ret

cell: ' row A * column B
! fld   dword ptr [ecx   ]
! fmul  dword ptr [edx   ]
! fld   dword ptr [ecx+16]
! fmul  dword ptr [edx+04]
! fld   dword ptr [ecx+32]
! fmul  dword ptr [edx+08]
! fld   dword ptr [ecx+48]
! fmul  dword ptr [edx+12]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  dword ptr [eax]
! add   eax,4
! add   ecx,4
! ret

END SUB
       


SUB MatrixMul4x4(lhs() AS SINGLE, rhs() AS SINGLE, res() AS SINGLE)
     DIM lhsp AS SINGLE PTR:lhsp=VARPTR(lhs(0))
     DIM rhsp AS SINGLE PTR:rhsp=VARPTR(rhs(0))

     ' COLUMN 1

     res(00)=@lhsp[0]*@rhsp[00]+@lhsp[4]*@rhsp[01]+@lhsp[08]*@rhsp[02]+@lhsp[12]*@rhsp[03]
     res(01)=@lhsp[1]*@rhsp[00]+@lhsp[5]*@rhsp[01]+@lhsp[09]*@rhsp[02]+@lhsp[13]*@rhsp[03]
     res(02)=@lhsp[2]*@rhsp[00]+@lhsp[6]*@rhsp[01]+@lhsp[10]*@rhsp[02]+@lhsp[14]*@rhsp[03]
     res(03)=@lhsp[3]*@rhsp[00]+@lhsp[7]*@rhsp[01]+@lhsp[11]*@rhsp[02]+@lhsp[15]*@rhsp[03]

     ' COLUMN 2

     res(04)=@lhsp[0]*@rhsp[04]+@lhsp[4]*@rhsp[05]+@lhsp[08]*@rhsp[06]+@lhsp[12]*@rhsp[07]
     res(05)=@lhsp[1]*@rhsp[04]+@lhsp[5]*@rhsp[05]+@lhsp[09]*@rhsp[06]+@lhsp[13]*@rhsp[07]
     res(06)=@lhsp[2]*@rhsp[04]+@lhsp[6]*@rhsp[05]+@lhsp[10]*@rhsp[06]+@lhsp[14]*@rhsp[07]
     res(07)=@lhsp[3]*@rhsp[04]+@lhsp[7]*@rhsp[05]+@lhsp[11]*@rhsp[06]+@lhsp[15]*@rhsp[07]

     ' COLUMN 3

     res(08)=@lhsp[0]*@rhsp[08]+@lhsp[4]*@rhsp[09]+@lhsp[08]*@rhsp[10]+@lhsp[12]*@rhsp[11]
     res(09)=@lhsp[1]*@rhsp[08]+@lhsp[5]*@rhsp[09]+@lhsp[09]*@rhsp[10]+@lhsp[13]*@rhsp[11]
     res(10)=@lhsp[2]*@rhsp[08]+@lhsp[6]*@rhsp[09]+@lhsp[10]*@rhsp[10]+@lhsp[14]*@rhsp[11]
     res(11)=@lhsp[3]*@rhsp[08]+@lhsp[7]*@rhsp[09]+@lhsp[11]*@rhsp[10]+@lhsp[15]*@rhsp[11]

     ' COLUMN 4

     res(12)=@lhsp[0]*@rhsp[12]+@lhsp[4]*@rhsp[13]+@lhsp[08]*@rhsp[14]+@lhsp[12]*@rhsp[15]
     res(13)=@lhsp[1]*@rhsp[12]+@lhsp[5]*@rhsp[13]+@lhsp[09]*@rhsp[14]+@lhsp[13]*@rhsp[15]
     res(14)=@lhsp[2]*@rhsp[12]+@lhsp[6]*@rhsp[13]+@lhsp[10]*@rhsp[14]+@lhsp[14]*@rhsp[15]
     res(15)=@lhsp[3]*@rhsp[12]+@lhsp[7]*@rhsp[13]+@lhsp[11]*@rhsp[14]+@lhsp[15]*@rhsp[15]
END SUB






SUB getrow4S(BYREF matr AS SINGLE)
    ' test this
    DIM row4f(3) AS GLOBAL SINGLE
    DIM p AS SINGLE PTR
    p=VARPTR(matr)
    row4f(0)=@p[0]
    row4f(1)=@p[4]
    row4f(2)=@p[8]
    row4f(3)=@p[12]
END SUB

 


FUNCTION PBMAIN () AS LONG

   

END FUNCTION

Patrice Terrier

#1
Charles

Then, what do you think would be the fastest ASM solution to perform these (color matrix) operations?


SUB MatAssign(MR() AS SINGLE, MM() AS SINGLE)
    'ARRAY ASSIGN MR() = MM(0, 0), MM(0, 1), MM(0, 2), MM(0, 3), MM(0, 4), _
    '                    MM(1, 0), MM(1, 1), MM(1, 2), MM(1, 3), MM(1, 4), _
    '                    MM(2, 0), MM(2, 1), MM(2, 2), MM(2, 3), MM(2, 4), _
    '                    MM(3, 0), MM(3, 1), MM(3, 2), MM(3, 3), MM(3, 4), _
    '                    MM(4, 0), MM(4, 1), MM(4, 2), MM(4, 3), MM(4, 4)
    POKE$ VARPTR(MR(0,0)), PEEK$(VARPTR(MM(0,0)), 100)   ' copy 100 bytes = 25 SINGLEs, the whole 5x5 matrix
END SUB

SUB MatMultiply(MR() AS SINGLE, M1() AS SINGLE, M2() AS SINGLE)
    MR(0,0) = M1(0,0) * M2(0,0) + M1(0,1) * M2(1,0) + M1(0,2) * M2(2,0) + M1(0,3) * M2(3,0) + M1(0,4) * M2(4,0)
    MR(0,1) = M1(1,0) * M2(0,0) + M1(1,1) * M2(1,0) + M1(1,2) * M2(2,0) + M1(1,3) * M2(3,0) + M1(1,4) * M2(4,0)
    MR(0,2) = M1(2,0) * M2(0,0) + M1(2,1) * M2(1,0) + M1(2,2) * M2(2,0) + M1(2,3) * M2(3,0) + M1(2,4) * M2(4,0)
    MR(0,3) = M1(3,0) * M2(0,0) + M1(3,1) * M2(1,0) + M1(3,2) * M2(2,0) + M1(3,3) * M2(3,0) + M1(3,4) * M2(4,0)
    MR(0,4) = M1(4,0) * M2(0,0) + M1(4,1) * M2(1,0) + M1(4,2) * M2(2,0) + M1(4,3) * M2(3,0) + M1(4,4) * M2(4,0)
    MR(1,0) = M1(0,0) * M2(0,1) + M1(0,1) * M2(1,1) + M1(0,2) * M2(2,1) + M1(0,3) * M2(3,1) + M1(0,4) * M2(4,1)
    MR(1,1) = M1(1,0) * M2(0,1) + M1(1,1) * M2(1,1) + M1(1,2) * M2(2,1) + M1(1,3) * M2(3,1) + M1(1,4) * M2(4,1)
    MR(1,2) = M1(2,0) * M2(0,1) + M1(2,1) * M2(1,1) + M1(2,2) * M2(2,1) + M1(2,3) * M2(3,1) + M1(2,4) * M2(4,1)
    MR(1,3) = M1(3,0) * M2(0,1) + M1(3,1) * M2(1,1) + M1(3,2) * M2(2,1) + M1(3,3) * M2(3,1) + M1(3,4) * M2(4,1)
    MR(1,4) = M1(4,0) * M2(0,1) + M1(4,1) * M2(1,1) + M1(4,2) * M2(2,1) + M1(4,3) * M2(3,1) + M1(4,4) * M2(4,1)
    MR(2,0) = M1(0,0) * M2(0,2) + M1(0,1) * M2(1,2) + M1(0,2) * M2(2,2) + M1(0,3) * M2(3,2) + M1(0,4) * M2(4,2)
    MR(2,1) = M1(1,0) * M2(0,2) + M1(1,1) * M2(1,2) + M1(1,2) * M2(2,2) + M1(1,3) * M2(3,2) + M1(1,4) * M2(4,2)
    MR(2,2) = M1(2,0) * M2(0,2) + M1(2,1) * M2(1,2) + M1(2,2) * M2(2,2) + M1(2,3) * M2(3,2) + M1(2,4) * M2(4,2)
    MR(2,3) = M1(3,0) * M2(0,2) + M1(3,1) * M2(1,2) + M1(3,2) * M2(2,2) + M1(3,3) * M2(3,2) + M1(3,4) * M2(4,2)
    MR(2,4) = M1(4,0) * M2(0,2) + M1(4,1) * M2(1,2) + M1(4,2) * M2(2,2) + M1(4,3) * M2(3,2) + M1(4,4) * M2(4,2)
    MR(3,0) = M1(0,0) * M2(0,3) + M1(0,1) * M2(1,3) + M1(0,2) * M2(2,3) + M1(0,3) * M2(3,3) + M1(0,4) * M2(4,3)
    MR(3,1) = M1(1,0) * M2(0,3) + M1(1,1) * M2(1,3) + M1(1,2) * M2(2,3) + M1(1,3) * M2(3,3) + M1(1,4) * M2(4,3)
    MR(3,2) = M1(2,0) * M2(0,3) + M1(2,1) * M2(1,3) + M1(2,2) * M2(2,3) + M1(2,3) * M2(3,3) + M1(2,4) * M2(4,3)
    MR(3,3) = M1(3,0) * M2(0,3) + M1(3,1) * M2(1,3) + M1(3,2) * M2(2,3) + M1(3,3) * M2(3,3) + M1(3,4) * M2(4,3)
    MR(3,4) = M1(4,0) * M2(0,3) + M1(4,1) * M2(1,3) + M1(4,2) * M2(2,3) + M1(4,3) * M2(3,3) + M1(4,4) * M2(4,3)
    MR(4,0) = M1(0,0) * M2(0,4) + M1(0,1) * M2(1,4) + M1(0,2) * M2(2,4) + M1(0,3) * M2(3,4) + M1(0,4) * M2(4,4)
    MR(4,1) = M1(1,0) * M2(0,4) + M1(1,1) * M2(1,4) + M1(1,2) * M2(2,4) + M1(1,3) * M2(3,4) + M1(1,4) * M2(4,4)
    MR(4,2) = M1(2,0) * M2(0,4) + M1(2,1) * M2(1,4) + M1(2,2) * M2(2,4) + M1(2,3) * M2(3,4) + M1(2,4) * M2(4,4)
    MR(4,3) = M1(3,0) * M2(0,4) + M1(3,1) * M2(1,4) + M1(3,2) * M2(2,4) + M1(3,3) * M2(3,4) + M1(3,4) * M2(4,4)
    MR(4,4) = M1(4,0) * M2(0,4) + M1(4,1) * M2(1,4) + M1(4,2) * M2(2,4) + M1(4,3) * M2(3,4) + M1(4,4) * M2(4,4)
END SUB

Patrice Terrier
GDImage (advanced graphic addon)
http://www.zapsolution.com

Charles Pegge

Hi Patrice

I've tested your code in comparison with the pointer version and the FPU assembly code version.

My FPU version is about 10X faster. We can attribute this difference to the movement of data between memory and registers and to the calculation of array element addresses, since the maths operations are identical.

There is a method described by Intel using SSE2 SIMD instructions which is twice the speed of the FPU. Unfortunately, PB8 does not recognise the SSE2 instructions, so we would have to go down to machine code.
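
For example, an instruction the PB8 inline assembler does not accept can still be emitted as raw opcode bytes, which is the technique used in the machine-code listing later in this thread (a minimal one-line sketch):

! db &h0f, &h58, &hC1                     ;  addps xmm0, xmm1  (hand-encoded SSE instruction)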


FUNCTION PBMAIN () AS LONG

DIM r(15) AS SINGLE
DIM a(15) AS SINGLE
DIM b(15) AS SINGLE

DIM i AS LONG
DIM t AS SINGLE

t=TIMER
FOR i=1 TO 1e6
' MatMultiply(r(),a(),b())   ' 1.6  seconds using 2d arrays
' MatrixMul4x4(a(),b(),r())  ' 0.39 seconds using 1d pointers
  MatrixMul4x4S(a(),b(),r()) ' 0.17 seconds using FPU assembly code
NEXT

MSGBOX STR$(TIMER-t)

END FUNCTION

Patrice Terrier

Charles

I have checked with the pointer version (see below) and I can't see any speed boost on my side  :(


SUB MatMultiply(MR() AS SINGLE, M1() AS SINGLE, M2() AS SINGLE)
'    MR(0,0) = M1(0,0) * M2(0,0) + M1(0,1) * M2(1,0) + M1(0,2) * M2(2,0) + M1(0,3) * M2(3,0) + M1(0,4) * M2(4,0)
'    MR(0,1) = M1(1,0) * M2(0,0) + M1(1,1) * M2(1,0) + M1(1,2) * M2(2,0) + M1(1,3) * M2(3,0) + M1(1,4) * M2(4,0)
'    MR(0,2) = M1(2,0) * M2(0,0) + M1(2,1) * M2(1,0) + M1(2,2) * M2(2,0) + M1(2,3) * M2(3,0) + M1(2,4) * M2(4,0)
'    MR(0,3) = M1(3,0) * M2(0,0) + M1(3,1) * M2(1,0) + M1(3,2) * M2(2,0) + M1(3,3) * M2(3,0) + M1(3,4) * M2(4,0)
'    MR(0,4) = M1(4,0) * M2(0,0) + M1(4,1) * M2(1,0) + M1(4,2) * M2(2,0) + M1(4,3) * M2(3,0) + M1(4,4) * M2(4,0)
'    MR(1,0) = M1(0,0) * M2(0,1) + M1(0,1) * M2(1,1) + M1(0,2) * M2(2,1) + M1(0,3) * M2(3,1) + M1(0,4) * M2(4,1)
'    MR(1,1) = M1(1,0) * M2(0,1) + M1(1,1) * M2(1,1) + M1(1,2) * M2(2,1) + M1(1,3) * M2(3,1) + M1(1,4) * M2(4,1)
'    MR(1,2) = M1(2,0) * M2(0,1) + M1(2,1) * M2(1,1) + M1(2,2) * M2(2,1) + M1(2,3) * M2(3,1) + M1(2,4) * M2(4,1)
'    MR(1,3) = M1(3,0) * M2(0,1) + M1(3,1) * M2(1,1) + M1(3,2) * M2(2,1) + M1(3,3) * M2(3,1) + M1(3,4) * M2(4,1)
'    MR(1,4) = M1(4,0) * M2(0,1) + M1(4,1) * M2(1,1) + M1(4,2) * M2(2,1) + M1(4,3) * M2(3,1) + M1(4,4) * M2(4,1)
'    MR(2,0) = M1(0,0) * M2(0,2) + M1(0,1) * M2(1,2) + M1(0,2) * M2(2,2) + M1(0,3) * M2(3,2) + M1(0,4) * M2(4,2)
'    MR(2,1) = M1(1,0) * M2(0,2) + M1(1,1) * M2(1,2) + M1(1,2) * M2(2,2) + M1(1,3) * M2(3,2) + M1(1,4) * M2(4,2)
'    MR(2,2) = M1(2,0) * M2(0,2) + M1(2,1) * M2(1,2) + M1(2,2) * M2(2,2) + M1(2,3) * M2(3,2) + M1(2,4) * M2(4,2)
'    MR(2,3) = M1(3,0) * M2(0,2) + M1(3,1) * M2(1,2) + M1(3,2) * M2(2,2) + M1(3,3) * M2(3,2) + M1(3,4) * M2(4,2)
'    MR(2,4) = M1(4,0) * M2(0,2) + M1(4,1) * M2(1,2) + M1(4,2) * M2(2,2) + M1(4,3) * M2(3,2) + M1(4,4) * M2(4,2)
'    MR(3,0) = M1(0,0) * M2(0,3) + M1(0,1) * M2(1,3) + M1(0,2) * M2(2,3) + M1(0,3) * M2(3,3) + M1(0,4) * M2(4,3)
'    MR(3,1) = M1(1,0) * M2(0,3) + M1(1,1) * M2(1,3) + M1(1,2) * M2(2,3) + M1(1,3) * M2(3,3) + M1(1,4) * M2(4,3)
'    MR(3,2) = M1(2,0) * M2(0,3) + M1(2,1) * M2(1,3) + M1(2,2) * M2(2,3) + M1(2,3) * M2(3,3) + M1(2,4) * M2(4,3)
'    MR(3,3) = M1(3,0) * M2(0,3) + M1(3,1) * M2(1,3) + M1(3,2) * M2(2,3) + M1(3,3) * M2(3,3) + M1(3,4) * M2(4,3)
'    MR(3,4) = M1(4,0) * M2(0,3) + M1(4,1) * M2(1,3) + M1(4,2) * M2(2,1) + M1(4,3) * M2(3,3) + M1(4,4) * M2(4,3)
'    MR(4,0) = M1(0,0) * M2(0,4) + M1(0,1) * M2(1,4) + M1(0,2) * M2(2,4) + M1(0,3) * M2(3,4) + M1(0,4) * M2(4,4)
'    MR(4,1) = M1(1,0) * M2(0,4) + M1(1,1) * M2(1,4) + M1(1,2) * M2(2,4) + M1(1,3) * M2(3,4) + M1(1,4) * M2(4,4)
'    MR(4,2) = M1(2,0) * M2(0,4) + M1(2,1) * M2(1,4) + M1(2,2) * M2(2,4) + M1(2,3) * M2(3,4) + M1(2,4) * M2(4,4)
'    MR(4,3) = M1(3,0) * M2(0,4) + M1(3,1) * M2(1,4) + M1(3,2) * M2(2,4) + M1(3,3) * M2(3,4) + M1(3,4) * M2(4,4)
'    MR(4,4) = M1(4,0) * M2(0,4) + M1(4,1) * M2(1,4) + M1(4,2) * M2(2,4) + M1(4,3) * M2(3,4) + M1(4,4) * M2(4,4)
   
     DIM M1p AS SINGLE PTR: M1p = VARPTR(M1(0,0))
     DIM M2p AS SINGLE PTR: M2p = VARPTR(M2(0,0))

     MR(0,0)=@M1p[0]*@M2p[00]+@M1p[5]*@M2p[01]+@M1p[10]*@M2p[02]+@M1p[15]*@M2p[03]+@M1p[20]*@M2p[04]
     MR(0,1)=@M1p[1]*@M2p[00]+@M1p[6]*@M2p[01]+@M1p[11]*@M2p[02]+@M1p[16]*@M2p[03]+@M1p[21]*@M2p[04]
     MR(0,2)=@M1p[2]*@M2p[00]+@M1p[7]*@M2p[01]+@M1p[12]*@M2p[02]+@M1p[17]*@M2p[03]+@M1p[22]*@M2p[04]
     MR(0,3)=@M1p[3]*@M2p[00]+@M1p[8]*@M2p[01]+@M1p[13]*@M2p[02]+@M1p[18]*@M2p[03]+@M1p[23]*@M2p[04]
     MR(0,4)=@M1p[4]*@M2p[00]+@M1p[9]*@M2p[01]+@M1p[14]*@M2p[02]+@M1p[19]*@M2p[03]+@M1p[24]*@M2p[04]

     MR(1,0)=@M1p[0]*@M2p[05]+@M1p[5]*@M2p[06]+@M1p[10]*@M2p[07]+@M1p[15]*@M2p[08]+@M1p[20]*@M2p[09]
     MR(1,1)=@M1p[1]*@M2p[05]+@M1p[6]*@M2p[06]+@M1p[11]*@M2p[07]+@M1p[16]*@M2p[08]+@M1p[21]*@M2p[09]
     MR(1,2)=@M1p[2]*@M2p[05]+@M1p[7]*@M2p[06]+@M1p[12]*@M2p[07]+@M1p[17]*@M2p[08]+@M1p[22]*@M2p[09]
     MR(1,3)=@M1p[3]*@M2p[05]+@M1p[8]*@M2p[06]+@M1p[13]*@M2p[07]+@M1p[18]*@M2p[08]+@M1p[23]*@M2p[09]
     MR(1,4)=@M1p[4]*@M2p[05]+@M1p[9]*@M2p[06]+@M1p[14]*@M2p[07]+@M1p[19]*@M2p[08]+@M1p[24]*@M2p[09]

     MR(2,0)=@M1p[0]*@M2p[10]+@M1p[5]*@M2p[11]+@M1p[10]*@M2p[12]+@M1p[15]*@M2p[13]+@M1p[20]*@M2p[14]
     MR(2,1)=@M1p[1]*@M2p[10]+@M1p[6]*@M2p[11]+@M1p[11]*@M2p[12]+@M1p[16]*@M2p[13]+@M1p[21]*@M2p[14]
     MR(2,2)=@M1p[2]*@M2p[10]+@M1p[7]*@M2p[11]+@M1p[12]*@M2p[12]+@M1p[17]*@M2p[13]+@M1p[22]*@M2p[14]
     MR(2,3)=@M1p[3]*@M2p[10]+@M1p[8]*@M2p[11]+@M1p[13]*@M2p[12]+@M1p[18]*@M2p[13]+@M1p[23]*@M2p[14]
     MR(2,4)=@M1p[4]*@M2p[10]+@M1p[9]*@M2p[11]+@M1p[14]*@M2p[12]+@M1p[19]*@M2p[13]+@M1p[24]*@M2p[14]

     MR(3,0)=@M1p[0]*@M2p[15]+@M1p[5]*@M2p[16]+@M1p[10]*@M2p[17]+@M1p[15]*@M2p[18]+@M1p[20]*@M2p[19]
     MR(3,1)=@M1p[1]*@M2p[15]+@M1p[6]*@M2p[16]+@M1p[11]*@M2p[17]+@M1p[16]*@M2p[18]+@M1p[21]*@M2p[19]
     MR(3,2)=@M1p[2]*@M2p[15]+@M1p[7]*@M2p[16]+@M1p[12]*@M2p[17]+@M1p[17]*@M2p[18]+@M1p[22]*@M2p[19]
     MR(3,3)=@M1p[3]*@M2p[15]+@M1p[8]*@M2p[16]+@M1p[13]*@M2p[17]+@M1p[18]*@M2p[18]+@M1p[23]*@M2p[19]
     MR(3,4)=@M1p[4]*@M2p[15]+@M1p[9]*@M2p[16]+@M1p[14]*@M2p[17]+@M1p[19]*@M2p[18]+@M1p[24]*@M2p[19]

     MR(4,0)=@M1p[0]*@M2p[20]+@M1p[5]*@M2p[21]+@M1p[10]*@M2p[22]+@M1p[15]*@M2p[23]+@M1p[20]*@M2p[24]
     MR(4,1)=@M1p[1]*@M2p[20]+@M1p[6]*@M2p[21]+@M1p[11]*@M2p[22]+@M1p[16]*@M2p[23]+@M1p[21]*@M2p[24]
     MR(4,2)=@M1p[2]*@M2p[20]+@M1p[7]*@M2p[21]+@M1p[12]*@M2p[22]+@M1p[17]*@M2p[23]+@M1p[22]*@M2p[24]
     MR(4,3)=@M1p[3]*@M2p[20]+@M1p[8]*@M2p[21]+@M1p[13]*@M2p[22]+@M1p[18]*@M2p[23]+@M1p[23]*@M2p[24]
     MR(4,4)=@M1p[4]*@M2p[20]+@M1p[9]*@M2p[21]+@M1p[14]*@M2p[22]+@M1p[19]*@M2p[23]+@M1p[24]*@M2p[24]
   
END SUB
Patrice Terrier
GDImage (advanced graphic addon)
http://www.zapsolution.com

Patrice Terrier

#4
I just checked using GDIPLUS:
CALL GdipMultiplyMatrix(BYVAL VARPTR(MR(0,0)), BYVAL VARPTR(M2(0,0)), 0)

It doesn't match my needs...
Patrice Terrier
GDImage (advanced graphic addon)
http://www.zapsolution.com

Charles Pegge


I think the difference is that you are using 2-dimensional arrays and pointers; mine is a simple array of 16 elements, which the compiler resolves directly into address offsets. No address calculations are required.
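
To illustrate the layout behind the pointer arithmetic in this thread, a hedged sketch (assuming PB stores a 2-dimensional array with the first subscript varying fastest, which is what the offsets in the 5x5 pointer code above imply): element M(i,j) of a 5x5 SINGLE array sits at flat element index i + 5*j, and a 1-dimensional array indexed with constants resolves straight to those fixed offsets.

' Hypothetical helper, not part of the original code:
FUNCTION Flat5x5(BYVAL i AS LONG, BYVAL j AS LONG) AS LONG
    FUNCTION = i + 5 * j     ' element index; multiply by 4 for the byte offset of a SINGLE
END FUNCTION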

PS: I wonder how fast GdipMultiplyMatrix is.

Patrice Terrier

GdipMultiplyMatrix is not suitable for manipulating the GDImage color matrix with an alpha channel.
Patrice Terrier
GDImage (advanced graphic addon)
http://www.zapsolution.com

Patrice Terrier

After doing several benchmarks, it looks like (in my case) the PowerBASIC built-in MAT statement would be the best solution for cumulative settings.

8)
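
For reference, the built-in approach might look like this (a minimal sketch, assuming PB's MAT statement accepts 5x5 SINGLE arrays in this form; not benchmarked here):

DIM MR(4,4) AS SINGLE, M1(4,4) AS SINGLE, M2(4,4) AS SINGLE
MAT MR() = M1() * M2()      ' built-in matrix multiply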
Patrice Terrier
GDImage (advanced graphic addon)
http://www.zapsolution.com

Charles Pegge

#8
This is an Intel example of 4x4 matrix multiplication. I translated it into Asmosphere under thinBasic, and it is rendered here as machine code for PB.
It is the fastest, taking 0.1 seconds to compute 1,000,000 matrices. The code is compatible with Pentium III processors (circa 1999), so it should be functional on most PCs.

PS: I don't know how it works :)

Streaming SIMD Extensions -
Matrix Multiplication


http://download.intel.com/design/PentiumIII/sml/24504501.pdf



SUB MatrixMul4x4SSE(BYREF a() AS SINGLE, BYREF b() AS SINGLE, BYREF c() AS SINGLE )
DIM p AS SINGLE PTR

p=VARPTR(a(0))
! mov ecx,eax                             ;  ecx -> a(), rows read with movups
p=VARPTR(b(0))
! mov edx,eax                             ;  edx -> b(), scalars read with movss
p=VARPTR(c(0))                            '  eax -> c(), result written with movlps/movhps/movups
'.sse_matrix_mul

! db &hf3, &h0f, &h10, &h02                 ;  movss  xmm0, [edx]
! db &h0f, &h10, &h09                       ;  movups xmm1, [ecx]
! db &h0f, &hC6, &hC0, &h00                 ;  shufps xmm0, xmm0, 0
! db &hf3, &h0f, &h10, &h52, &h04           ;  movss  xmm2, [edx+4]
! db &h0f, &h59, &hC1                       ;  mulps  xmm0, xmm1
! db &h0f, &hC6, &hD2, &h00                 ;  shufps xmm2, xmm2, 0
! db &h0f, &h10, &h59, &h10                 ;  movups xmm3, [ecx+16]
! db &hf3, &h0f, &h10, &h7A, &h08           ;  movss  xmm7, [edx+8]
! db &h0f, &h59, &hD3                       ;  mulps  xmm2, xmm3
! db &h0f, &hC6, &hFF, &h00                 ;  shufps xmm7, xmm7, 0
! db &h0f, &h58, &hC2                       ;  addps  xmm0, xmm2
! db &h0f, &h10, &h61, &h20                 ;  movups xmm4, [ecx+32]
! db &hf3, &h0f, &h10, &h52, &h0C           ;  movss  xmm2, [edx+12]
! db &h0f, &h59, &hFC                       ;  mulps  xmm7, xmm4
! db &h0f, &hC6, &hD2, &h00                 ;  shufps xmm2, xmm2, 0
! db &h0f, &h58, &hC7                       ;  addps  xmm0, xmm7
! db &h0f, &h10, &h69, &h30                 ;  movups xmm5, [ecx+48]
! db &hf3, &h0f, &h10, &h72, &h10           ;  movss  xmm6, [edx+16]
! db &h0f, &h59, &hD5                       ;  mulps  xmm2, xmm5
! db &hf3, &h0f, &h10, &h7A, &h14           ;  movss  xmm7, [edx+20]
! db &h0f, &hC6, &hF6, &h00                 ;  shufps xmm6, xmm6, 0
! db &h0f, &h58, &hC2                       ;  addps  xmm0, xmm2
! db &h0f, &hC6, &hFF, &h00                 ;  shufps xmm7, xmm7, 0
! db &h0f, &h13, &h00                       ;  movlps [eax], xmm0
! db &h0f, &h17, &h40, &h08                 ;  movhps [eax+8], xmm0
! db &h0f, &h59, &hFB                       ;  mulps  xmm7, xmm3
! db &hf3, &h0f, &h10, &h42, &h18           ;  movss  xmm0, [edx+24]
! db &h0f, &h59, &hF1                       ;  mulps  xmm6, xmm1
! db &h0f, &hC6, &hC0, &h00                 ;  shufps xmm0, xmm0, 0
! db &h0f, &h58, &hF7                       ;  addps  xmm6, xmm7
! db &h0f, &h59, &hC4                       ;  mulps  xmm0, xmm4
! db &hf3, &h0f, &h10, &h52, &h24           ;  movss  xmm2, [edx+36]
! db &h0f, &h58, &hF0                       ;  addps  xmm6, xmm0
! db &hf3, &h0f, &h10, &h42, &h1C           ;  movss  xmm0, [edx+28]
! db &hf3, &h0f, &h10, &h7A, &h20           ;  movss  xmm7, [edx+32]
! db &h0f, &hC6, &hC0, &h00                 ;  shufps xmm0, xmm0, 0
! db &h0f, &hC6, &hFF, &h00                 ;  shufps xmm7, xmm7, 0
! db &h0f, &h59, &hC5                       ;  mulps  xmm0, xmm5
! db &h0f, &h59, &hF9                       ;  mulps  xmm7, xmm1
! db &h0f, &h58, &hF0                       ;  addps  xmm6, xmm0
! db &h0f, &hC6, &hD2, &h00                 ;  shufps xmm2, xmm2, 0
! db &h0f, &h13, &h70, &h10                 ;  movlps [eax+16], xmm6
! db &h0f, &h17, &h70, &h18                 ;  movhps [eax+24], xmm6
! db &h0f, &h59, &hD3                       ;  mulps  xmm2, xmm3
! db &hf3, &h0f, &h10, &h72, &h28           ;  movss  xmm6, [edx+40]
! db &h0f, &h58, &hFA                       ;  addps  xmm7, xmm2
! db &h0f, &hC6, &hF6, &h00                 ;  shufps xmm6, xmm6, 0
! db &hf3, &h0f, &h10, &h52, &h2C           ;  movss  xmm2, [edx+44]
! db &h0f, &h59, &hF4                       ;  mulps  xmm6, xmm4
! db &h0f, &hC6, &hD2, &h00                 ;  shufps xmm2, xmm2, 0
! db &h0f, &h58, &hFE                       ;  addps  xmm7, xmm6
! db &h0f, &h59, &hD5                       ;  mulps  xmm2, xmm5
! db &hf3, &h0f, &h10, &h42, &h34           ;  movss  xmm0, [edx+52]
! db &h0f, &h58, &hFA                       ;  addps  xmm7, xmm2
! db &h0f, &hC6, &hC0, &h00                 ;  shufps xmm0, xmm0, 0
! db &h0f, &h13, &h78, &h20                 ;  movlps [eax+32], xmm7
! db &hf3, &h0f, &h10, &h52, &h30           ;  movss  xmm2, [edx+48]
! db &h0f, &h17, &h78, &h28                 ;  movhps [eax+40], xmm7
! db &h0f, &h59, &hC3                       ;  mulps  xmm0, xmm3
! db &h0f, &hC6, &hD2, &h00                 ;  shufps xmm2, xmm2, 0
! db &hf3, &h0f, &h10, &h72, &h38           ;  movss  xmm6, [edx+56]
! db &h0f, &h59, &hD1                       ;  mulps  xmm2, xmm1
! db &h0f, &hC6, &hF6, &h00                 ;  shufps xmm6, xmm6, 0
! db &h0f, &h58, &hD0                       ;  addps  xmm2, xmm0
! db &h0f, &h59, &hF4                       ;  mulps  xmm6, xmm4
! db &hf3, &h0f, &h10, &h7A, &h3C           ;  movss  xmm7, [edx+60]
! db &h0f, &hC6, &hFF, &h00                 ;  shufps xmm7, xmm7, 0
! db &h0f, &h58, &hD6                       ;  addps  xmm2, xmm6
! db &h0f, &h59, &hFD                       ;  mulps  xmm7, xmm5
! db &h0f, &h58, &hD7                       ;  addps  xmm2, xmm7
! db &h0f, &h11, &h50, &h30                 ;  movups [eax+48], xmm2

END SUB


 



FUNCTION PBMAIN () AS LONG

DIM r(15) AS SINGLE
DIM a(15) AS SINGLE
DIM b(15) AS SINGLE

DIM i AS LONG
DIM t AS SINGLE

t=TIMER
FOR i=1 TO 1e6
'  MatMultiply(r(),a(),b())     ' 1.6  seconds using 2d arrays
'  MatMultiplyP(r(),a(),b())    ' 1.5  seconds using 2d arrays and pointers
'  MatrixMul4x4(a(),b(),r())    ' 0.39 seconds using 1d pointers
'  MatrixMul4x4FPU(a(),b(),r()) ' 0.17 seconds using FPU assembly code
   MatrixMul4x4SSE(a(),b(),r()) ' 0.1 seconds using SSE assembly code
NEXT

MSGBOX STR$(TIMER-t)

END FUNCTION
                   

Patrice Terrier

Charles

I am using a 5x5 color matrix, not 4x4 or 3x3, which would of course be much faster.

If you can make it work for 5x5 then I could try it, but I am not an assembly guy myself  :'(
Patrice Terrier
GDImage (advanced graphic addon)
http://www.zapsolution.com

Charles Pegge


Patrice,

This takes 0.27 secs per million calcs for a 5x5 matrix. I have not functionally tested it, but it is a simple expansion of the 4x4 function.

The result is returned in mr(), matching your example prototypes.


SUB MatrixMul5x5FPU(BYREF mr() AS SINGLE, BYREF m1() AS SINGLE, BYREF m2() AS SINGLE )
DIM p AS SINGLE PTR

p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! call cell
! add edx,20
! sub ecx,20
! ret

cell: ' row A * column B
! fld   dword ptr [ecx   ]
! fmul  dword ptr [edx   ]
! fld   dword ptr [ecx+20]
! fmul  dword ptr [edx+04]
! fld   dword ptr [ecx+40]
! fmul  dword ptr [edx+08]
! fld   dword ptr [ecx+60]
! fmul  dword ptr [edx+12]
! fld   dword ptr [ecx+80]
! fmul  dword ptr [edx+16]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  dword ptr [eax]
! add   eax,4
! add   ecx,4
! ret
END SUB
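
A possible calling sketch, with hypothetical array names, using 1-dimensional arrays of 25 SINGLEs as in the earlier benchmark loops:

DIM mr(24) AS SINGLE
DIM m1(24) AS SINGLE
DIM m2(24) AS SINGLE
' ... fill m1() and m2(), then:
MatrixMul5x5FPU(mr(), m1(), m2())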


Patrice Terrier

Charles

Thank you very much!

I did check it against the built-in PB MAT function, and I would say that this confirms the PB one was already very fast; however, yours seems even faster ;)

I shall post the new GDImage.dll in the PhotoSetup thread, for those wanting to try it.

Thanks again.
Patrice Terrier
GDImage (advanced graphic addon)
http://www.zapsolution.com

Charles Pegge

#12
Glad it was useful Patrice.

Here are 3x3, 4x4 and 5x5 matrix multiplications using single and double precision. You can see how the code changes for each matrix size.

Interestingly, there seems to be very little difference in speed between single precision and double precision.
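
Between the variants below, what changes is the number of column/cell calls, the fld offsets, and the stride used to step through the matrices; each stride is simply the row length in bytes:

    stride = N * element size
    3x3 SINGLE: 3*4 = 12    4x4 SINGLE: 4*4 = 16    5x5 SINGLE: 5*4 = 20
    3x3 DOUBLE: 3*8 = 24    4x4 DOUBLE: 4*8 = 32    5x5 DOUBLE: 5*8 = 40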



'================
' SINGLES
'================



SUB MatrixMul3x3SFPU(BYREF mr() AS SINGLE, BYREF m1() AS SINGLE, BYREF m2() AS SINGLE )
DIM p AS SINGLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! add edx,12
! sub ecx,12
! ret

cell: ' row A * column B
! fld   dword ptr [ecx   ]
! fmul  dword ptr [edx   ]
! fld   dword ptr [ecx+12]
! fmul  dword ptr [edx+04]
! fld   dword ptr [ecx+24]
! fmul  dword ptr [edx+08]
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  dword ptr [eax]
! add   eax,4
! add   ecx,4
! ret
END SUB

SUB MatrixMul4x4SFPU(BYREF mr() AS SINGLE, BYREF m1() AS SINGLE, BYREF m2() AS SINGLE )
DIM p AS SINGLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! add edx,16
! sub ecx,16
! ret

cell: ' row A * column B
! fld   dword ptr [ecx   ]
! fmul  dword ptr [edx   ]
! fld   dword ptr [ecx+16]
! fmul  dword ptr [edx+04]
! fld   dword ptr [ecx+32]
! fmul  dword ptr [edx+08]
! fld   dword ptr [ecx+48]
! fmul  dword ptr [edx+12]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  dword ptr [eax]
! add   eax,4
! add   ecx,4
! ret
END SUB

SUB MatrixMul5x5SFPU(BYREF mr() AS SINGLE, BYREF m1() AS SINGLE, BYREF m2() AS SINGLE )
DIM p AS SINGLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! call cell
! add edx,20
! sub ecx,20
! ret

cell: ' row A * column B
! fld   dword ptr [ecx   ]
! fmul  dword ptr [edx   ]
! fld   dword ptr [ecx+20]
! fmul  dword ptr [edx+04]
! fld   dword ptr [ecx+40]
! fmul  dword ptr [edx+08]
! fld   dword ptr [ecx+60]
! fmul  dword ptr [edx+12]
! fld   dword ptr [ecx+80]
! fmul  dword ptr [edx+16]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  dword ptr [eax]
! add   eax,4
! add   ecx,4
! ret
END SUB
 
'====================================
'  DOUBLES
'====================================


SUB MatrixMul3x3DFPU(BYREF mr() AS DOUBLE, BYREF m1() AS DOUBLE, BYREF m2() AS DOUBLE )
DIM p AS DOUBLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! add edx,24
! sub ecx,24
! ret

cell: ' row A * column B
! fld   qword ptr [ecx   ]
! fmul  qword ptr [edx   ]
! fld   qword ptr [ecx+24]
! fmul  qword ptr [edx+08]
! fld   qword ptr [ecx+48]
! fmul  qword ptr [edx+16]
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  qword ptr [eax]
! add   eax,8
! add   ecx,8
! ret
END SUB


SUB MatrixMul4x4DFPU(BYREF mr() AS DOUBLE, BYREF m1() AS DOUBLE, BYREF m2() AS DOUBLE )
DIM p AS DOUBLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! add edx,32
! sub ecx,32
! ret

cell: ' row A * column B
! fld   qword ptr [ecx   ]
! fmul  qword ptr [edx   ]
! fld   qword ptr [ecx+32]
! fmul  qword ptr [edx+08]
! fld   qword ptr [ecx+64]
! fmul  qword ptr [edx+16]
! fld   qword ptr [ecx+96]
! fmul  qword ptr [edx+24]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  qword ptr [eax]
! add   eax,8
! add   ecx,8
! ret
END SUB

SUB MatrixMul5x5DFPU(BYREF mr() AS DOUBLE, BYREF m1() AS DOUBLE, BYREF m2() AS DOUBLE )
DIM p AS DOUBLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! call cell
! add edx,40
! sub ecx,40
! ret

cell: ' row A * column B
! fld   qword ptr [ecx   ]
! fmul  qword ptr [edx   ]
! fld   qword ptr [ecx+40]
! fmul  qword ptr [edx+08]
! fld   qword ptr [ecx+80]
! fmul  qword ptr [edx+16]
! fld   qword ptr [ecx+120]
! fmul  qword ptr [edx+24]
! fld   qword ptr [ecx+160]
! fmul  qword ptr [edx+32]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  qword ptr [eax]
! add   eax,8
! add   ecx,8
! ret
END SUB


                 



Correction to strides 28 July 2008

Petr Schreiber

That is perfect, Charles,

may I steal your code for TBGL?


Petr
AMD Sempron 3400+ | 1GB RAM @ 533MHz | GeForce 6200 / GeForce 9500GT | 32bit Windows XP SP3

psch.thinbasic.com

Charles Pegge

Of course, Petr.

Are there any other matrix operations we need to look at? Multiplication seems to be the most important as far as graphics is concerned.
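
One other operation worth a look for graphics is the 4x4 matrix * vector transform applied to vertices; a hedged sketch in the same pointer style as MatrixMul4x4 above (hypothetical names, untested):

SUB MatVec4(m() AS SINGLE, v() AS SINGLE, r() AS SINGLE)
    ' r = m * v, indexing m() the same way as lhs() in MatrixMul4x4
    DIM mp AS SINGLE PTR : mp = VARPTR(m(0))
    DIM vp AS SINGLE PTR : vp = VARPTR(v(0))
    r(0) = @mp[0]*@vp[0] + @mp[4]*@vp[1] + @mp[8]*@vp[2]  + @mp[12]*@vp[3]
    r(1) = @mp[1]*@vp[0] + @mp[5]*@vp[1] + @mp[9]*@vp[2]  + @mp[13]*@vp[3]
    r(2) = @mp[2]*@vp[0] + @mp[6]*@vp[1] + @mp[10]*@vp[2] + @mp[14]*@vp[3]
    r(3) = @mp[3]*@vp[0] + @mp[7]*@vp[1] + @mp[11]*@vp[2] + @mp[15]*@vp[3]
END SUB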