Print Page - Asm: 4*4 Matrix Multiply

Title: Asm: 4*4 Matrix Multiply
Post by: Charles Pegge on July 26, 2008, 08:34:47 AM

Using FPU - compared with using pointers in PowerBasic


#COMPILE EXE
#DIM ALL

SUB MatrixMul4x4S(BYREF a() AS SINGLE, BYREF b() AS SINGLE, BYREF c() AS SINGLE )
DIM p AS SINGLE PTR

p=VARPTR(a(0))
! mov ecx,eax
p=VARPTR(b(0))
! mov edx,eax
p=VARPTR(c(0))

block:
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! add edx,16
! sub ecx,16
! ret

cell: ' row A * column B
! fld   dword ptr [ecx   ]
! fmul  dword ptr [edx   ]
! fld   dword ptr [ecx+16]
! fmul  dword ptr [edx+04]
! fld   dword ptr [ecx+32]
! fmul  dword ptr [edx+08]
! fld   dword ptr [ecx+48]
! fmul  dword ptr [edx+12]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  dword ptr [eax]
! add   eax,4
! add   ecx,4
! ret

END SUB
       


 SUB MatrixMul4x4(lhs() AS SINGLE, rhs() AS SINGLE, res() AS SINGLE)
     DIM lhsp AS SINGLE PTR:lhsp=VARPTR(lhs(0))
     DIM rhsp AS SINGLE PTR:rhsp=VARPTR(rhs(0))

     ' COLUMN 1

     res(00)=@lhsp[0]*@rhsp[00]+@lhsp[4]*@rhsp[01]+@lhsp[08]*@rhsp[02]+@lhsp[12]*@rhsp[03]
     res(01)=@lhsp[1]*@rhsp[00]+@lhsp[5]*@rhsp[01]+@lhsp[09]*@rhsp[02]+@lhsp[13]*@rhsp[03]
     res(02)=@lhsp[2]*@rhsp[00]+@lhsp[6]*@rhsp[01]+@lhsp[10]*@rhsp[02]+@lhsp[14]*@rhsp[03]
     res(03)=@lhsp[3]*@rhsp[00]+@lhsp[7]*@rhsp[01]+@lhsp[11]*@rhsp[02]+@lhsp[15]*@rhsp[03]

     ' COLUMN 1

     res(04)=@lhsp[0]*@rhsp[04]+@lhsp[4]*@rhsp[05]+@lhsp[08]*@rhsp[06]+@lhsp[12]*@rhsp[07]
     res(05)=@lhsp[1]*@rhsp[04]+@lhsp[5]*@rhsp[05]+@lhsp[09]*@rhsp[06]+@lhsp[13]*@rhsp[07]
     res(06)=@lhsp[2]*@rhsp[04]+@lhsp[6]*@rhsp[05]+@lhsp[10]*@rhsp[06]+@lhsp[14]*@rhsp[07]
     res(07)=@lhsp[3]*@rhsp[04]+@lhsp[7]*@rhsp[05]+@lhsp[11]*@rhsp[06]+@lhsp[15]*@rhsp[07]

     ' COLIMN 3

     res(08)=@lhsp[0]*@rhsp[08]+@lhsp[4]*@rhsp[09]+@lhsp[08]*@rhsp[10]+@lhsp[12]*@rhsp[11]
     res(09)=@lhsp[1]*@rhsp[08]+@lhsp[5]*@rhsp[09]+@lhsp[09]*@rhsp[10]+@lhsp[13]*@rhsp[11]
     res(10)=@lhsp[2]*@rhsp[08]+@lhsp[6]*@rhsp[09]+@lhsp[10]*@rhsp[10]+@lhsp[14]*@rhsp[11]
     res(11)=@lhsp[3]*@rhsp[08]+@lhsp[7]*@rhsp[09]+@lhsp[11]*@rhsp[10]+@lhsp[15]*@rhsp[11]

     ' COLUMN 4

     res(12)=@lhsp[0]*@rhsp[12]+@lhsp[4]*@rhsp[13]+@lhsp[08]*@rhsp[14]+@lhsp[12]*@rhsp[15]
     res(13)=@lhsp[1]*@rhsp[12]+@lhsp[5]*@rhsp[13]+@lhsp[09]*@rhsp[14]+@lhsp[13]*@rhsp[15]
     res(14)=@lhsp[2]*@rhsp[12]+@lhsp[6]*@rhsp[13]+@lhsp[10]*@rhsp[14]+@lhsp[14]*@rhsp[15]
     res(15)=@lhsp[3]*@rhsp[12]+@lhsp[7]*@rhsp[13]+@lhsp[11]*@rhsp[14]+@lhsp[15]*@rhsp[15]
 END SUB






SUB getrow4S(BYREF matr AS SINGLE)
    ' test this
    DIM row4f(3) AS GLOBAL SINGLE
    DIM p AS SINGLE PTR
    p=VARPTR(matr)
    row4f(0)=@p[0]
    row4f(1)=@p[4]
    row4f(2)=@p[8]
    row4f(3)=@p[12]
END SUB

  


FUNCTION PBMAIN () AS LONG

    

END FUNCTION

Title: Re: Asm: 4*4 Matrix Multiply
Post by: Patrice Terrier on July 26, 2008, 11:26:52 AM

Charles

Then, what do you think would be the fastest ASM solution to perform these (color matrix)

Code Select


SUB MatAssign(MR() AS SINGLE, MM() AS SINGLE)
    'ARRAY ASSIGN MR() = MM(0, 0), MM(0, 1), MM(0, 2), MM(0, 3), MM(0, 4), _
    '                    MM(1, 0), MM(1, 1), MM(1, 2), MM(1, 3), MM(1, 4), _
    '                    MM(2, 0), MM(2, 1), MM(2, 2), MM(2, 3), MM(2, 4), _
    '                    MM(3, 0), MM(3, 1), MM(3, 2), MM(3, 3), MM(3, 4), _
    '                    MM(4, 0), MM(4, 1), MM(4, 2), MM(4, 3), MM(4, 4)
    POKE$ VARPTR(MR(0,0)), PEEK$(VARPTR(MM(0,0)), 100)
END SUB

SUB MatMultiply(MR() AS SINGLE, M1() AS SINGLE, M2() AS SINGLE)
    MR(0,0) = M1(0,0) * M2(0,0) + M1(0,1) * M2(1,0) + M1(0,2) * M2(2,0) + M1(0,3) * M2(3,0) + M1(0,4) * M2(4,0)
    MR(0,1) = M1(1,0) * M2(0,0) + M1(1,1) * M2(1,0) + M1(1,2) * M2(2,0) + M1(1,3) * M2(3,0) + M1(1,4) * M2(4,0)
    MR(0,2) = M1(2,0) * M2(0,0) + M1(2,1) * M2(1,0) + M1(2,2) * M2(2,0) + M1(2,3) * M2(3,0) + M1(2,4) * M2(4,0)
    MR(0,3) = M1(3,0) * M2(0,0) + M1(3,1) * M2(1,0) + M1(3,2) * M2(2,0) + M1(3,3) * M2(3,0) + M1(3,4) * M2(4,0)
    MR(0,4) = M1(4,0) * M2(0,0) + M1(4,1) * M2(1,0) + M1(4,2) * M2(2,0) + M1(4,3) * M2(3,0) + M1(4,4) * M2(4,0)
    MR(1,0) = M1(0,0) * M2(0,1) + M1(0,1) * M2(1,1) + M1(0,2) * M2(2,1) + M1(0,3) * M2(3,1) + M1(0,4) * M2(4,1)
    MR(1,1) = M1(1,0) * M2(0,1) + M1(1,1) * M2(1,1) + M1(1,2) * M2(2,1) + M1(1,3) * M2(3,1) + M1(1,4) * M2(4,1)
    MR(1,2) = M1(2,0) * M2(0,1) + M1(2,1) * M2(1,1) + M1(2,2) * M2(2,1) + M1(2,3) * M2(3,1) + M1(2,4) * M2(4,1)
    MR(1,3) = M1(3,0) * M2(0,1) + M1(3,1) * M2(1,1) + M1(3,2) * M2(2,1) + M1(3,3) * M2(3,1) + M1(3,4) * M2(4,1)
    MR(1,4) = M1(4,0) * M2(0,1) + M1(4,1) * M2(1,1) + M1(4,2) * M2(2,1) + M1(4,3) * M2(3,1) + M1(4,4) * M2(4,1)
    MR(2,0) = M1(0,0) * M2(0,2) + M1(0,1) * M2(1,2) + M1(0,2) * M2(2,2) + M1(0,3) * M2(3,2) + M1(0,4) * M2(4,2)
    MR(2,1) = M1(1,0) * M2(0,2) + M1(1,1) * M2(1,2) + M1(1,2) * M2(2,2) + M1(1,3) * M2(3,2) + M1(1,4) * M2(4,2)
    MR(2,2) = M1(2,0) * M2(0,2) + M1(2,1) * M2(1,2) + M1(2,2) * M2(2,2) + M1(2,3) * M2(3,2) + M1(2,4) * M2(4,2)
    MR(2,3) = M1(3,0) * M2(0,2) + M1(3,1) * M2(1,2) + M1(3,2) * M2(2,2) + M1(3,3) * M2(3,2) + M1(3,4) * M2(4,2)
    MR(2,4) = M1(4,0) * M2(0,2) + M1(4,1) * M2(1,2) + M1(4,2) * M2(2,2) + M1(4,3) * M2(3,2) + M1(4,4) * M2(4,2)
    MR(3,0) = M1(0,0) * M2(0,3) + M1(0,1) * M2(1,3) + M1(0,2) * M2(2,3) + M1(0,3) * M2(3,3) + M1(0,4) * M2(4,3)
    MR(3,1) = M1(1,0) * M2(0,3) + M1(1,1) * M2(1,3) + M1(1,2) * M2(2,3) + M1(1,3) * M2(3,3) + M1(1,4) * M2(4,3)
    MR(3,2) = M1(2,0) * M2(0,3) + M1(2,1) * M2(1,3) + M1(2,2) * M2(2,3) + M1(2,3) * M2(3,3) + M1(2,4) * M2(4,3)
    MR(3,3) = M1(3,0) * M2(0,3) + M1(3,1) * M2(1,3) + M1(3,2) * M2(2,3) + M1(3,3) * M2(3,3) + M1(3,4) * M2(4,3)
    MR(3,4) = M1(4,0) * M2(0,3) + M1(4,1) * M2(1,3) + M1(4,2) * M2(2,1) + M1(4,3) * M2(3,3) + M1(4,4) * M2(4,3)
    MR(4,0) = M1(0,0) * M2(0,4) + M1(0,1) * M2(1,4) + M1(0,2) * M2(2,4) + M1(0,3) * M2(3,4) + M1(0,4) * M2(4,4)
    MR(4,1) = M1(1,0) * M2(0,4) + M1(1,1) * M2(1,4) + M1(1,2) * M2(2,4) + M1(1,3) * M2(3,4) + M1(1,4) * M2(4,4)
    MR(4,2) = M1(2,0) * M2(0,4) + M1(2,1) * M2(1,4) + M1(2,2) * M2(2,4) + M1(2,3) * M2(3,4) + M1(2,4) * M2(4,4)
    MR(4,3) = M1(3,0) * M2(0,4) + M1(3,1) * M2(1,4) + M1(3,2) * M2(2,4) + M1(3,3) * M2(3,4) + M1(3,4) * M2(4,4)
    MR(4,4) = M1(4,0) * M2(0,4) + M1(4,1) * M2(1,4) + M1(4,2) * M2(2,4) + M1(4,3) * M2(3,4) + M1(4,4) * M2(4,4)
END SUB

Title: Re: Asm: 4*4 Matrix Multiply
Post by: Charles Pegge on July 26, 2008, 12:17:01 PM

Hi Patrice

I've tested your code in comparison with the pointer version and the FPU assembly code version.

My FPU version is about 10X faster. We can attribute this difference to the movement of data between memory and registers and the calculation of array element addresses, since the maths operations are identical.

There is a method described by Intel using SSE2 SIMD instructions which is twice the speed of the FPU. Unfortunately PB8 does not recognise the instructions for SSE2 so we would have to go down to machine code.

Code Select


FUNCTION PBMAIN () AS LONG

DIM r(15) AS SINGLE
DIM a(15) AS SINGLE
DIM b(15) AS SINGLE

DIM i AS LONG
DIM t AS SINGLE

t=TIMER
FOR i=1 TO 1e6
' MatMultiply(r(),a(),b())   ' 1.6  seconds using 2d arrays
' MatrixMul4x4(a(),b(),r())  ' 0.39 seconds using 1d pointers
  MatrixMul4x4S(a(),b(),r()) ' 0.17 seconds using FPU assembly code
NEXT

MSGBOX STR$(TIMER-t)

END FUNCTION

Title: Re: Asm: 4*4 Matrix Multiply
Post by: Patrice Terrier on July 26, 2008, 12:37:13 PM

Charles

I have checked with the pointer version (see below) and I can't see any speed boost by me :(

Code Select


SUB MatMultiply(MR() AS SINGLE, M1() AS SINGLE, M2() AS SINGLE)
'    MR(0,0) = M1(0,0) * M2(0,0) + M1(0,1) * M2(1,0) + M1(0,2) * M2(2,0) + M1(0,3) * M2(3,0) + M1(0,4) * M2(4,0)
'    MR(0,1) = M1(1,0) * M2(0,0) + M1(1,1) * M2(1,0) + M1(1,2) * M2(2,0) + M1(1,3) * M2(3,0) + M1(1,4) * M2(4,0)
'    MR(0,2) = M1(2,0) * M2(0,0) + M1(2,1) * M2(1,0) + M1(2,2) * M2(2,0) + M1(2,3) * M2(3,0) + M1(2,4) * M2(4,0)
'    MR(0,3) = M1(3,0) * M2(0,0) + M1(3,1) * M2(1,0) + M1(3,2) * M2(2,0) + M1(3,3) * M2(3,0) + M1(3,4) * M2(4,0)
'    MR(0,4) = M1(4,0) * M2(0,0) + M1(4,1) * M2(1,0) + M1(4,2) * M2(2,0) + M1(4,3) * M2(3,0) + M1(4,4) * M2(4,0)
'    MR(1,0) = M1(0,0) * M2(0,1) + M1(0,1) * M2(1,1) + M1(0,2) * M2(2,1) + M1(0,3) * M2(3,1) + M1(0,4) * M2(4,1)
'    MR(1,1) = M1(1,0) * M2(0,1) + M1(1,1) * M2(1,1) + M1(1,2) * M2(2,1) + M1(1,3) * M2(3,1) + M1(1,4) * M2(4,1)
'    MR(1,2) = M1(2,0) * M2(0,1) + M1(2,1) * M2(1,1) + M1(2,2) * M2(2,1) + M1(2,3) * M2(3,1) + M1(2,4) * M2(4,1)
'    MR(1,3) = M1(3,0) * M2(0,1) + M1(3,1) * M2(1,1) + M1(3,2) * M2(2,1) + M1(3,3) * M2(3,1) + M1(3,4) * M2(4,1)
'    MR(1,4) = M1(4,0) * M2(0,1) + M1(4,1) * M2(1,1) + M1(4,2) * M2(2,1) + M1(4,3) * M2(3,1) + M1(4,4) * M2(4,1)
'    MR(2,0) = M1(0,0) * M2(0,2) + M1(0,1) * M2(1,2) + M1(0,2) * M2(2,2) + M1(0,3) * M2(3,2) + M1(0,4) * M2(4,2)
'    MR(2,1) = M1(1,0) * M2(0,2) + M1(1,1) * M2(1,2) + M1(1,2) * M2(2,2) + M1(1,3) * M2(3,2) + M1(1,4) * M2(4,2)
'    MR(2,2) = M1(2,0) * M2(0,2) + M1(2,1) * M2(1,2) + M1(2,2) * M2(2,2) + M1(2,3) * M2(3,2) + M1(2,4) * M2(4,2)
'    MR(2,3) = M1(3,0) * M2(0,2) + M1(3,1) * M2(1,2) + M1(3,2) * M2(2,2) + M1(3,3) * M2(3,2) + M1(3,4) * M2(4,2)
'    MR(2,4) = M1(4,0) * M2(0,2) + M1(4,1) * M2(1,2) + M1(4,2) * M2(2,2) + M1(4,3) * M2(3,2) + M1(4,4) * M2(4,2)
'    MR(3,0) = M1(0,0) * M2(0,3) + M1(0,1) * M2(1,3) + M1(0,2) * M2(2,3) + M1(0,3) * M2(3,3) + M1(0,4) * M2(4,3)
'    MR(3,1) = M1(1,0) * M2(0,3) + M1(1,1) * M2(1,3) + M1(1,2) * M2(2,3) + M1(1,3) * M2(3,3) + M1(1,4) * M2(4,3)
'    MR(3,2) = M1(2,0) * M2(0,3) + M1(2,1) * M2(1,3) + M1(2,2) * M2(2,3) + M1(2,3) * M2(3,3) + M1(2,4) * M2(4,3)
'    MR(3,3) = M1(3,0) * M2(0,3) + M1(3,1) * M2(1,3) + M1(3,2) * M2(2,3) + M1(3,3) * M2(3,3) + M1(3,4) * M2(4,3)
'    MR(3,4) = M1(4,0) * M2(0,3) + M1(4,1) * M2(1,3) + M1(4,2) * M2(2,1) + M1(4,3) * M2(3,3) + M1(4,4) * M2(4,3)
'    MR(4,0) = M1(0,0) * M2(0,4) + M1(0,1) * M2(1,4) + M1(0,2) * M2(2,4) + M1(0,3) * M2(3,4) + M1(0,4) * M2(4,4)
'    MR(4,1) = M1(1,0) * M2(0,4) + M1(1,1) * M2(1,4) + M1(1,2) * M2(2,4) + M1(1,3) * M2(3,4) + M1(1,4) * M2(4,4)
'    MR(4,2) = M1(2,0) * M2(0,4) + M1(2,1) * M2(1,4) + M1(2,2) * M2(2,4) + M1(2,3) * M2(3,4) + M1(2,4) * M2(4,4)
'    MR(4,3) = M1(3,0) * M2(0,4) + M1(3,1) * M2(1,4) + M1(3,2) * M2(2,4) + M1(3,3) * M2(3,4) + M1(3,4) * M2(4,4)
'    MR(4,4) = M1(4,0) * M2(0,4) + M1(4,1) * M2(1,4) + M1(4,2) * M2(2,4) + M1(4,3) * M2(3,4) + M1(4,4) * M2(4,4)
    
     DIM M1p AS SINGLE PTR: M1p = VARPTR(M1(0,0))
     DIM M2p AS SINGLE PTR: M2p = VARPTR(M2(0,0))

     MR(0,0)=@M1p[0]*@M2p[00]+@M1p[5]*@M2p[01]+@M1p[10]*@M2p[02]+@M1p[15]*@M2p[03]+@M1p[20]*@M2p[04]
     MR(0,1)=@M1p[1]*@M2p[00]+@M1p[6]*@M2p[01]+@M1p[11]*@M2p[02]+@M1p[16]*@M2p[03]+@M1p[21]*@M2p[04]
     MR(0,2)=@M1p[2]*@M2p[00]+@M1p[7]*@M2p[01]+@M1p[12]*@M2p[02]+@M1p[17]*@M2p[03]+@M1p[22]*@M2p[04]
     MR(0,3)=@M1p[3]*@M2p[00]+@M1p[8]*@M2p[01]+@M1p[13]*@M2p[02]+@M1p[18]*@M2p[03]+@M1p[23]*@M2p[04]
     MR(0,4)=@M1p[4]*@M2p[00]+@M1p[9]*@M2p[01]+@M1p[14]*@M2p[02]+@M1p[19]*@M2p[03]+@M1p[24]*@M2p[04]

     MR(1,0)=@M1p[0]*@M2p[05]+@M1p[5]*@M2p[06]+@M1p[10]*@M2p[07]+@M1p[15]*@M2p[08]+@M1p[20]*@M2p[09]
     MR(1,1)=@M1p[1]*@M2p[05]+@M1p[6]*@M2p[06]+@M1p[11]*@M2p[07]+@M1p[16]*@M2p[08]+@M1p[21]*@M2p[09]
     MR(1,2)=@M1p[2]*@M2p[05]+@M1p[7]*@M2p[06]+@M1p[12]*@M2p[07]+@M1p[17]*@M2p[08]+@M1p[22]*@M2p[09]
     MR(1,3)=@M1p[3]*@M2p[05]+@M1p[8]*@M2p[06]+@M1p[13]*@M2p[07]+@M1p[18]*@M2p[08]+@M1p[23]*@M2p[09]
     MR(1,4)=@M1p[4]*@M2p[05]+@M1p[9]*@M2p[06]+@M1p[14]*@M2p[07]+@M1p[19]*@M2p[08]+@M1p[24]*@M2p[09]

     MR(2,0)=@M1p[0]*@M2p[10]+@M1p[5]*@M2p[11]+@M1p[10]*@M2p[12]+@M1p[15]*@M2p[13]+@M1p[20]*@M2p[14]
     MR(2,1)=@M1p[1]*@M2p[10]+@M1p[6]*@M2p[11]+@M1p[11]*@M2p[12]+@M1p[16]*@M2p[13]+@M1p[21]*@M2p[14]
     MR(2,2)=@M1p[2]*@M2p[10]+@M1p[7]*@M2p[11]+@M1p[12]*@M2p[12]+@M1p[17]*@M2p[13]+@M1p[22]*@M2p[14]
     MR(2,3)=@M1p[3]*@M2p[10]+@M1p[8]*@M2p[11]+@M1p[13]*@M2p[12]+@M1p[18]*@M2p[13]+@M1p[23]*@M2p[14]
     MR(2,4)=@M1p[4]*@M2p[10]+@M1p[9]*@M2p[11]+@M1p[14]*@M2p[12]+@M1p[19]*@M2p[13]+@M1p[24]*@M2p[14]

     MR(3,0)=@M1p[0]*@M2p[15]+@M1p[5]*@M2p[16]+@M1p[10]*@M2p[17]+@M1p[15]*@M2p[18]+@M1p[20]*@M2p[19]
     MR(3,1)=@M1p[1]*@M2p[15]+@M1p[6]*@M2p[16]+@M1p[11]*@M2p[17]+@M1p[16]*@M2p[18]+@M1p[21]*@M2p[19]
     MR(3,2)=@M1p[2]*@M2p[15]+@M1p[7]*@M2p[16]+@M1p[12]*@M2p[17]+@M1p[17]*@M2p[18]+@M1p[22]*@M2p[19]
     MR(3,3)=@M1p[3]*@M2p[15]+@M1p[8]*@M2p[16]+@M1p[13]*@M2p[17]+@M1p[18]*@M2p[18]+@M1p[23]*@M2p[19]
     MR(3,4)=@M1p[4]*@M2p[15]+@M1p[9]*@M2p[16]+@M1p[14]*@M2p[17]+@M1p[19]*@M2p[18]+@M1p[24]*@M2p[19]

     MR(4,0)=@M1p[0]*@M2p[20]+@M1p[5]*@M2p[21]+@M1p[10]*@M2p[22]+@M1p[15]*@M2p[23]+@M1p[20]*@M2p[24]
     MR(4,1)=@M1p[1]*@M2p[20]+@M1p[6]*@M2p[21]+@M1p[11]*@M2p[22]+@M1p[16]*@M2p[23]+@M1p[21]*@M2p[24]
     MR(4,2)=@M1p[2]*@M2p[20]+@M1p[7]*@M2p[21]+@M1p[12]*@M2p[22]+@M1p[17]*@M2p[23]+@M1p[22]*@M2p[24]
     MR(4,3)=@M1p[3]*@M2p[20]+@M1p[8]*@M2p[21]+@M1p[13]*@M2p[22]+@M1p[18]*@M2p[23]+@M1p[23]*@M2p[24]
     MR(4,4)=@M1p[4]*@M2p[20]+@M1p[9]*@M2p[21]+@M1p[14]*@M2p[22]+@M1p[19]*@M2p[23]+@M1p[24]*@M2p[24]
    
END SUB

Title: Re: Asm: 4*4 Matrix Multiply
Post by: Patrice Terrier on July 26, 2008, 01:04:12 PM

I just checked using GDIPLUS
CALL GdipMultiplyMatrix(BYVAL VARPTR(MR(0,0)), BYVAL VARPTR(M2(0,0)), 0)

it doesn't matches my need...

Title: Re: Asm: 4*4 Matrix Multiply
Post by: Charles Pegge on July 26, 2008, 01:14:26 PM

I think the difference is that you are using 2 dimensional arrays and pointers. - mine is a simple array of 16 elements which are resolved by the compiler directly into address offsets. No address calculations are required.

PS I wonder how fast is GdipMultiplyMatrix.

Title: Re: Asm: 4*4 Matrix Multiply
Post by: Patrice Terrier on July 26, 2008, 02:25:38 PM

GdipMultiplyMatrix is not good to manipulate the GDImage color matrix with alpha channel.

Title: Re: Asm: 4*4 Matrix Multiply
Post by: Patrice Terrier on July 26, 2008, 04:14:06 PM

After doing several benchmarks, it looks like (in my case) the PowerBASIC built-in MAT would be the best solution in case of cumulative settings.

8)

Title: Re: Asm: 4*4 Matrix Multiply
Post by: Charles Pegge on July 26, 2008, 06:44:46 PM

This is an Intel example of 4x4 matrix multiplication. I translated it into Asmosphere under thinBasic. It is rendered here into machine code for PB.
It is the fastest, taking 0.1 seconds to compute 1,000,000 matrixes. The code is compatible with Pentium III processors (ca 1999) so it should be functional on most PCs.

PS: I don't know how it works :)

Streaming SIMD Extensions -
Matrix Multiplication

http://download.intel.com/design/PentiumIII/sml/24504501.pdf

Code Select


SUB MatrixMul4x4SSE(BYREF a() AS SINGLE, BYREF b() AS SINGLE, BYREF c() AS SINGLE )
DIM p AS SINGLE PTR

p=VARPTR(a(0))
! mov ecx,eax
p=VARPTR(b(0))
! mov edx,eax
p=VARPTR(a(0))
 '.sse_matrix_mul               ;  .sse_matrix_mul

 ! db &hf3, &h0f, &h10, &h02                 ;  movss  xmm0, [edx]
 ! db &h0f, &h10, &h09                       ;  movups xmm1, [ecx]
 ! db &h0f, &hC6, &hC0, &h00                 ;  shufps xmm0, xmm0, 0
 ! db &hf3, &h0f, &h10, &h52, &h04           ;  movss  xmm2, [edx+4]
 ! db &h0f, &h59, &hC1                       ;  mulps  xmm0, xmm1
 ! db &h0f, &hC6, &hD2, &h00                 ;  shufps xmm2, xmm2, 0
 ! db &h0f, &h10, &h59, &h10                 ;  movups xmm3, [ecx+16]
 ! db &hf3, &h0f, &h10, &h7A, &h08           ;  movss  xmm7, [edx+8]
 ! db &h0f, &h59, &hD3                       ;  mulps  xmm2, xmm3
 ! db &h0f, &hC6, &hFF, &h00                 ;  shufps xmm7, xmm7, 0
 ! db &h0f, &h58, &hC2                       ;  addps  xmm0, xmm2
 ! db &h0f, &h10, &h61, &h20                 ;  movups xmm4, [ecx+32]
 ! db &hf3, &h0f, &h10, &h52, &h0C           ;  movss  xmm2, [edx+12]
 ! db &h0f, &h59, &hFC                       ;  mulps  xmm7, xmm4
 ! db &h0f, &hC6, &hD2, &h00                 ;  shufps xmm2, xmm2, 0
 ! db &h0f, &h58, &hC7                       ;  addps  xmm0, xmm7
 ! db &h0f, &h10, &h69, &h30                 ;  movups xmm5, [ecx+48]
 ! db &hf3, &h0f, &h10, &h72, &h10           ;  movss  xmm6, [edx+16]
 ! db &h0f, &h59, &hD5                       ;  mulps  xmm2, xmm5
 ! db &hf3, &h0f, &h10, &h7A, &h14           ;  movss  xmm7, [edx+20]
 ! db &h0f, &hC6, &hF6, &h00                 ;  shufps xmm6, xmm6, 0
 ! db &h0f, &h58, &hC2                       ;  addps  xmm0, xmm2
 ! db &h0f, &hC6, &hFF, &h00                 ;  shufps xmm7, xmm7, 0
 ! db &h0f, &h13, &h00                       ;  movlps [eax], xmm0
 ! db &h0f, &h17, &h40, &h08                 ;  movhps [eax+8], xmm0
 ! db &h0f, &h59, &hFB                       ;  mulps  xmm7, xmm3
 ! db &hf3, &h0f, &h10, &h42, &h18           ;  movss  xmm0, [edx+24]
 ! db &h0f, &h59, &hF1                       ;  mulps  xmm6, xmm1
 ! db &h0f, &hC6, &hC0, &h00                 ;  shufps xmm0, xmm0, 0
 ! db &h0f, &h58, &hF7                       ;  addps  xmm6, xmm7
 ! db &h0f, &h59, &hC4                       ;  mulps  xmm0, xmm4
 ! db &hf3, &h0f, &h10, &h52, &h24           ;  movss  xmm2, [edx+36]
 ! db &h0f, &h58, &hF0                       ;  addps  xmm6, xmm0
 ! db &hf3, &h0f, &h10, &h42, &h1C           ;  movss  xmm0, [edx+28]
 ! db &hf3, &h0f, &h10, &h7A, &h20           ;  movss  xmm7, [edx+32]
 ! db &h0f, &hC6, &hC0, &h00                 ;  shufps xmm0, xmm0, 0
 ! db &h0f, &hC6, &hFF, &h00                 ;  shufps xmm7, xmm7, 0
 ! db &h0f, &h59, &hC5                       ;  mulps  xmm0, xmm5
 ! db &h0f, &h59, &hF9                       ;  mulps  xmm7, xmm1
 ! db &h0f, &h58, &hF0                       ;  addps  xmm6, xmm0
 ! db &h0f, &hC6, &hD2, &h00                 ;  shufps xmm2, xmm2, 0
 ! db &h0f, &h13, &h70, &h10                 ;  movlps [eax+16], xmm6
 ! db &h0f, &h17, &h70, &h18                 ;  movhps [eax+24], xmm6
 ! db &h0f, &h59, &hD3                       ;  mulps  xmm2, xmm3
 ! db &hf3, &h0f, &h10, &h72, &h28           ;  movss  xmm6, [edx+40]
 ! db &h0f, &h58, &hFA                       ;  addps  xmm7, xmm2
 ! db &h0f, &hC6, &hF6, &h00                 ;  shufps xmm6, xmm6, 0
 ! db &hf3, &h0f, &h10, &h52, &h2C           ;  movss  xmm2, [edx+44]
 ! db &h0f, &h59, &hF4                       ;  mulps  xmm6, xmm4
 ! db &h0f, &hC6, &hD2, &h00                 ;  shufps xmm2, xmm2, 0
 ! db &h0f, &h58, &hFE                       ;  addps  xmm7, xmm6
 ! db &h0f, &h59, &hD5                       ;  mulps  xmm2, xmm5
 ! db &hf3, &h0f, &h10, &h42, &h34           ;  movss  xmm0, [edx+52]
 ! db &h0f, &h58, &hFA                       ;  addps  xmm7, xmm2
 ! db &h0f, &hC6, &hC0, &h00                 ;  shufps xmm0, xmm0, 0
 ! db &h0f, &h13, &h78, &h20                 ;  movlps [eax+32], xmm7
 ! db &hf3, &h0f, &h10, &h52, &h30           ;  movss  xmm2, [edx+48]
 ! db &h0f, &h17, &h78, &h28                 ;  movhps [eax+40], xmm7
 ! db &h0f, &h59, &hC3                       ;  mulps  xmm0, xmm3
 ! db &h0f, &hC6, &hD2, &h00                 ;  shufps xmm2, xmm2, 0
 ! db &hf3, &h0f, &h10, &h72, &h38           ;  movss  xmm6, [edx+56]
 ! db &h0f, &h59, &hD1                       ;  mulps  xmm2, xmm1
 ! db &h0f, &hC6, &hF6, &h00                 ;  shufps xmm6, xmm6, 0
 ! db &h0f, &h58, &hD0                       ;  addps  xmm2, xmm0
 ! db &h0f, &h59, &hF4                       ;  mulps  xmm6, xmm4
 ! db &hf3, &h0f, &h10, &h7A, &h3C           ;  movss  xmm7, [edx+60]
 ! db &h0f, &hC6, &hFF, &h00                 ;  shufps xmm7, xmm7, 0
 ! db &h0f, &h58, &hD6                       ;  addps  xmm2, xmm6
 ! db &h0f, &h59, &hFD                       ;  mulps  xmm7, xmm5
 ! db &h0f, &h58, &hD7                       ;  addps  xmm2, xmm7
 ! db &h0f, &h11, &h50, &h30                 ;  movups [eax+48], xmm2

END SUB

Code Select


FUNCTION PBMAIN () AS LONG

DIM r(15) AS SINGLE
DIM a(15) AS SINGLE
DIM b(15) AS SINGLE

DIM i AS LONG
DIM t AS SINGLE

t=TIMER
FOR i=1 TO 1e6
'  MatMultiply(r(),a(),b())     ' 1.6  seconds using 2d arrays
'  MatMultiplyP(r(),a(),b())    ' 1.5  seconds using 2d arrays and pointers
'  MatrixMul4x4(a(),b(),r())    ' 0.39 seconds using 1d pointers
'  MatrixMul4x4FPU(a(),b(),r()) ' 0.17 seconds using FPU assembly code
   MatrixMul4x4SSE(a(),b(),r()) ' 0.1 seconds using SSE assembly code
NEXT

MSGBOX STR$(TIMER-t)

END FUNCTION

Title: Re: Asm: 4*4 Matrix Multiply
Post by: Patrice Terrier on July 26, 2008, 07:28:03 PM

Charles

I am using a color matrix of 5x5 not 4x4 or 3x3 that would be of course much faster.

If you can make it work for 5x5 then i could try it, but i am not an assembly guy myself :'(

Title: Re: Asm: 4*4 Matrix Multiply
Post by: Charles Pegge on July 26, 2008, 08:21:29 PM

Patrice,

This takes 0.27 secs per million calcs for a 5x5 Matrix. I have not functionally tested it but it is a simple expansion of the 4x4 function

Result is returned in mr() matching your example prototypes

Code Select


SUB MatrixMul5x5FPU(BYREF mr() AS SINGLE, BYREF m1() AS SINGLE, BYREF m2() AS SINGLE )
DIM p AS SINGLE PTR

p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! call cell
! add edx,20
! sub ecx,20
! ret

cell: ' row A * column B
! fld   dword ptr [ecx   ]
! fmul  dword ptr [edx   ]
! fld   dword ptr [ecx+20]
! fmul  dword ptr [edx+04]
! fld   dword ptr [ecx+40]
! fmul  dword ptr [edx+08]
! fld   dword ptr [ecx+60]
! fmul  dword ptr [edx+12]
! fld   dword ptr [ecx+80]
! fmul  dword ptr [edx+16]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  dword ptr [eax]
! add   eax,4
! add   ecx,4
! ret
END SUB

[/cpde]

Title: Re: Asm: 4*4 Matrix Multiply
Post by: Patrice Terrier on July 26, 2008, 08:39:19 PM

Charles

Thank you very much!

I did check it over the built-in PB MAT function, and i would say that this confirm that the PB one was already very fast, however yours seems even faster ;)

I shall post the new GDImage.dll in the PhotoSetup thread, for those wanting to try it.

Thanks again.

Title: Re: Asm: 4*4 Matrix Multiply and others
Post by: Charles Pegge on July 27, 2008, 12:00:32 PM

Glad it was useful Patrice.

Here are 3x3 4x4 and 5x5 matrix multiplications using single and double precision. You can see how the code changes for each matrix type.

Interestingly there seems to be very little difference in speed between single precision and double precision

Code Select



'================
' SINGLES
'================



SUB MatrixMul3x3SFPU(BYREF mr() AS SINGLE, BYREF m1() AS SINGLE, BYREF m2() AS SINGLE )
DIM p AS SINGLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! add edx,12
! sub ecx,12
! ret

cell: ' row A * column B
! fld   dword ptr [ecx   ]
! fmul  dword ptr [edx   ]
! fld   dword ptr [ecx+12]
! fmul  dword ptr [edx+04]
! fld   dword ptr [ecx+24]
! fmul  dword ptr [edx+08]
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  dword ptr [eax]
! add   eax,4
! add   ecx,4
! ret
END SUB

SUB MatrixMul4x4SFPU(BYREF mr() AS SINGLE, BYREF m1() AS SINGLE, BYREF m2() AS SINGLE )
DIM p AS SINGLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! add edx,16
! sub ecx,16
! ret

cell: ' row A * column B
! fld   dword ptr [ecx   ]
! fmul  dword ptr [edx   ]
! fld   dword ptr [ecx+16]
! fmul  dword ptr [edx+04]
! fld   dword ptr [ecx+32]
! fmul  dword ptr [edx+08]
! fld   dword ptr [ecx+48]
! fmul  dword ptr [edx+12]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  dword ptr [eax]
! add   eax,4
! add   ecx,4
! ret
END SUB

SUB MatrixMul5x5SFPU(BYREF mr() AS SINGLE, BYREF m1() AS SINGLE, BYREF m2() AS SINGLE )
DIM p AS SINGLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! call cell
! add edx,20
! sub ecx,20
! ret

cell: ' row A * column B
! fld   dword ptr [ecx   ]
! fmul  dword ptr [edx   ]
! fld   dword ptr [ecx+20]
! fmul  dword ptr [edx+04]
! fld   dword ptr [ecx+40]
! fmul  dword ptr [edx+08]
! fld   dword ptr [ecx+60]
! fmul  dword ptr [edx+12]
! fld   dword ptr [ecx+80]
! fmul  dword ptr [edx+16]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  dword ptr [eax]
! add   eax,4
! add   ecx,4
! ret
END SUB
  
'====================================
'  DOUBLES
'====================================


SUB MatrixMul3x3DFPU(BYREF mr() AS DOUBLE, BYREF m1() AS DOUBLE, BYREF m2() AS DOUBLE )
DIM p AS DOUBLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! add edx,24
! sub ecx,24
! ret

cell: ' row A * column B
! fld   qword ptr [ecx   ]
! fmul  qword ptr [edx   ]
! fld   qword ptr [ecx+24]
! fmul  qword ptr [edx+08]
! fld   qword ptr [ecx+48]
! fmul  qword ptr [edx+16]
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  qword ptr [eax]
! add   eax,8
! add   ecx,8
! ret
END SUB


SUB MatrixMul4x4DFPU(BYREF mr() AS DOUBLE, BYREF m1() AS DOUBLE, BYREF m2() AS DOUBLE )
DIM p AS DOUBLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! add edx,32
! sub ecx,32
! ret

cell: ' row A * column B
! fld   qword ptr [ecx   ]
! fmul  qword ptr [edx   ]
! fld   qword ptr [ecx+32]
! fmul  qword ptr [edx+08]
! fld   qword ptr [ecx+64]
! fmul  qword ptr [edx+16]
! fld   qword ptr [ecx+96]
! fmul  qword ptr [edx+24]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  qword ptr [eax]
! add   eax,8
! add   ecx,8
! ret
END SUB

SUB MatrixMul5x5DFPU(BYREF mr() AS DOUBLE, BYREF m1() AS DOUBLE, BYREF m2() AS DOUBLE )
DIM p AS DOUBLE PTR
p=VARPTR(m1(0))
! mov ecx,eax
p=VARPTR(m2(0))
! mov edx,eax
p=VARPTR(mr(0))

block:
! call column
! call column
! call column
! call column
! call column
EXIT SUB

column:
! call cell
! call cell
! call cell
! call cell
! call cell
! add edx,40
! sub ecx,40
! ret

cell: ' row A * column B
! fld   qword ptr [ecx   ]
! fmul  qword ptr [edx   ]
! fld   qword ptr [ecx+40]
! fmul  qword ptr [edx+08]
! fld   qword ptr [ecx+80]
! fmul  qword ptr [edx+16]
! fld   qword ptr [ecx+120]
! fmul  qword ptr [edx+24]
! fld   qword ptr [ecx+160]
! fmul  qword ptr [edx+32]
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! faddp st(1),st(0)
! fstp  qword ptr [eax]
! add   eax,8
! add   ecx,8
! ret
END SUB

Correction to strides 28 July 2008

Title: Re: Asm: 4*4 Matrix Multiply
Post by: Petr Schreiber on July 27, 2008, 12:50:34 PM

That is perfect Charles,

may I steal your code for TBGL?

Petr

Title: Re: Asm: 4*4 Matrix Multiply
Post by: Charles Pegge on July 27, 2008, 12:59:52 PM

Of course Petr,

Are there any other matrix operations we need to look at. Multiplication seems to be the most important as far as graphics is concerned.

Title: Re: Asm: 4*4 Matrix Multiply
Post by: Petr Schreiber on July 27, 2008, 01:15:27 PM

Thanks a lot Charles,

I used multiplication only, as in 4x4 matrix all can be saved - scale, position and rotation.

Petr

Title: Re: Asm: 4*4 Matrix Multiply
Post by: Patrice Terrier on July 27, 2008, 01:45:55 PM

The problem i have to deal with, is that modern digital photographier produce so huge image, that wanting to perform cumulative settings in real time even with assembly, seems to have reach a limit, that could not be further optimized without the help of a graphic accelerated hardware.

Thus i think it is better to use a preview using a small size (for example 512x512) to adjust the settings, then apply them to the real photography once all parameters have been set.

Anyway if ever you can push the limits furthermore with these FPU settings i would be glad to try it ;)

Powerbasic Museum 2020-B

IT-Consultant: Charles Pegge => OxygenBasic Examples => Topic started by: Charles Pegge on July 26, 2008, 08:34:47 AM