Port `bones.gc` math to GOAL (#3425)

Reverse engineer the skinning matrix calculation and port to GOAL. This
is about 3x faster than the MIPS2c version.

As usual, there is a `*use-new-bones*` flag to go back to the old
version.

Fix for a bug in the compiler's `.div.vf` implementation (only happens
if src/dst are the same), and fix for a typo in the register allocator
that would sometimes cause it not to consider xmm8-xmm15.
pull/3426/head
water111 2024-03-15 20:31:11 -04:00 committed by GitHub
parent 5a8b4e81f9
commit 82fb2cc26a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 892 additions and 39 deletions

View File

@ -0,0 +1,500 @@
The `bones.gc` file computes skinning matrices for foreground rendering.
Arguments:
- `a0`: output matrix area, each matrix is 7x quadwords contaning the vertex and normal transformation matrices.
- `a1`: input joint array. The joints contain the inverse bind pose.
- `a2`: input bones array. The world space bone transforms
- `a3`: num bones
- `vf28, vf29, vf30, vf31` the camera matrix
- `vf25, vf26, vf27` the camera matrix again
```asm
daddiu sp, sp, -96
sd ra, 0(sp)
sq s2, 16(sp)
sq s3, 32(sp)
sq s4, 48(sp)
sq s5, 64(sp)
sq gp, 80(sp)
lui v1, 4096
lui t0, 4096
ori v1, v1, 54272 ;; v1 = DMA reg addr
ori t0, t0, 53248 ;; t0 = DMA reg addr
lui t2, 32767 ;; 0x7fff....
daddiu t1, a3, -16 ;; bone count - 16 (maybe we do 16 bones at a time?)
ori t2, t2, 65535 ;; 0x7fff'ffff
lui at, 28672 ;; scratchpad addr
addiu t4, r0, 64 ;; t4 = 64 (= 16 bones * 4)
addiu t5, r0, 1280 ;; t5 = 1280 (= 16 bones * 80)
bgez t1, L17 ;; more than 16 bones?
addiu t3, r0, 16 ;; t3 = 16
;; if first run is under 16 bones, adjust counts
B1:
or t3, a3, r0 ;; t3 = num bones
sll r0, r0, 0
dsll t4, t3, 2 ;; t4 = num bones * 4
dsll a3, t3, 4 ;; a3 = num bones * 16
dsll t1, t3, 6 ;; t1 = num bones * 64
sll r0, r0, 0
daddu t5, t1, a3 ;; t5 = num bones * 80
addiu t1, r0, 0 ;; t1 = 0 (remaining bones count)
B2:
L17:
addiu a3, r0, 0 ;; a3 = 0
addiu t6, r0, 1 ;; t6 = 1
and a1, a1, t2 ;; mask off upper bits of address (not sure why, but they do this sometimes)
sll r0, r0, 0
daddiu a1, a1, 12 ;; adjustment of joint pointer for the strided dma stuff.
or a0, a0, r0
daddiu a1, a1, -80
sll r0, r0, 0
;; wait for DMA to be free...
<snip>
B5:
L19:
addiu t6, r0, 80
addiu t7, r0, 264
sw t6, 128(v1) ;; addr in spad = 80 for joints
sw a1, 16(v1)
sw t4, 32(v1) ;; size: num bones * 4
sw t7, 0(v1)
daddu a1, a1, t5
;; wait for dma to complete
<snip>
B8:
L21:
and a2, a2, t2 ;; clean up bones addr
sll r0, r0, 0
dsll t2, t3, 2 ;; t2 = bones * 4
addiu t4, r0, 256 ;; t4 = 256
daddu t2, t2, t3 ;; t2 = bones * 5 (size of the bone)
addiu t6, r0, 1104 ;; addr in spad = 1104 for bones.
dsll t5, t2, 4
sw t6, 128(v1)
addiu t8, r0, 0
sw a2, 16(v1)
daddu a2, a2, t5
sw t2, 32(v1)
addiu t2, r0, 1
sw t4, 0(v1)
;; wait for dma
;; <snip>
B11:
L23:
dsll t5, t8, 2 ;; ?? not sure what this is, but always zero?
daddu t9, t5, at ;; ptr to bone-work
sll r0, r0, 0
lwu t5, 0(t9) ;; t5 = (-> bone-layout joint)
or t6, t3, r0
lwu t7, 8(t9) ;; t7 = (-> bone-layout bone)
or ra, t3, r0
lwu t3, 16(t9) ;; t3 = (-> bone-layout output)
sll r0, r0, 0
sw ra, 44(at) ;; stash sp-size
beq ra, r0, L36
sw t8, 48(at) ;; stash sp-bufnum
B12:
daddiu t1, t1, -16 ;; decrement bones count
addiu t9, r0, 1280 ;; next DMA math stuff
bgez t1, L24 ;; check if partial bone buffer
addiu t8, r0, 16 ;; ....
B13:
daddiu t8, t1, 16
addiu t1, r0, 0
dsll t9, t8, 4
dsll ra, t8, 6
beq t8, r0, L25
daddu t9, ra, t9
B14:
L24:
dsll t4, t8, 2
dsll ra, t2, 2
daddu gp, ra, at
sw a1, 16(v1)
addiu ra, r0, 264
lwu gp, 0(gp)
andi gp, gp, 16383
sw t4, 32(v1)
daddu a1, a1, t9
sw gp, 128(v1)
addiu t4, r0, 0
sw ra, 0(v1)
;; and now, for the actual bones.
B15:
L25:
sll r0, r0, 0
sw t8, 40(at) ;; in-count
sll r0, r0, 0
lqc2 vf1, 0(t5) ;; vf1, vf2, vf3, vf4 = inverse bind pose
sll r0, r0, 0
lqc2 vf2, 16(t5)
sll r0, r0, 0
lqc2 vf3, 32(t5)
sll r0, r0, 0
lqc2 vf4, 48(t5)
sll r0, r0, 0
lqc2 vf5, 0(t7) ;; vf5, vf6, vf7, vf8 = input bone matrix.
sll r0, r0, 0
lqc2 vf6, 16(t7)
sll r0, r0, 0
lqc2 vf7, 32(t7)
sll r0, r0, 0
lqc2 vf8, 48(t7)
vcallms 0 ;; run bone program
sll r0, r0, 0
B16:
L26:
sll r0, r0, 0
sll r0, r0, 0
daddiu t5, t5, 64 ;; advance joint
sll r0, r0, 0
daddiu t7, t7, 80 ;; advance bone.
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
lq t8, 0(t5) ;; load next joint
sll r0, r0, 0
lq t9, 16(t5)
sll r0, r0, 0
lq ra, 32(t5)
sll r0, r0, 0
lq gp, 48(t5)
sll r0, r0, 0
lq s5, 0(t7) ;; load next bone
sll r0, r0, 0
lq s4, 16(t7)
sll r0, r0, 0
lq s3, 32(t7)
sll r0, r0, 0
lq s2, 48(t7)
sll r0, r0, 0
qmtc2.ni vf1, t8 ;; swap in new inputs
sll r0, r0, 0
qmtc2.ni vf2, t9
sll r0, r0, 0
qmtc2.ni vf3, ra
sll r0, r0, 0
qmtc2.ni vf4, gp
sll r0, r0, 0
qmtc2.ni vf5, s5
sll r0, r0, 0
qmtc2.ni vf6, s4
sll r0, r0, 0
qmtc2.ni vf7, s3
sll r0, r0, 0
qmtc2.ni vf8, s2
sll r0, r0, 0
qmfc2.i t8, vf13 ;; swap out result in (vf13, vf14, vf15, vf16) and (vf9, vf10, vf11)
sll r0, r0, 0
qmfc2.ni t9, vf14
sll r0, r0, 0
qmfc2.ni ra, vf15
sll r0, r0, 0
qmfc2.ni gp, vf16
sll r0, r0, 0
qmfc2.ni s5, vf9
sll r0, r0, 0
qmfc2.ni s4, vf10
sll r0, r0, 0
qmfc2.ni s3, vf11
vcallms 0
sll r0, r0, 0
sll r0, r0, 0
sq t8, 0(t3)
sll r0, r0, 0
sq t9, 16(t3)
sll r0, r0, 0
sq ra, 32(t3)
sll r0, r0, 0
sq gp, 48(t3)
sll r0, r0, 0
sq s5, 64(t3)
sll r0, r0, 0
sq s4, 80(t3)
sll r0, r0, 0
sq s3, 96(t3)
sll r0, r0, 0
sq r0, 112(t3)
daddiu t3, t3, 128
daddiu t6, t6, -1
bgtz t6, L26
sll r0, r0, 0
B17:
sll r0, r0, 0
lw t3, 40(at)
beq t3, r0, L29
sll r0, r0, 0
B18:
L27:
lw t4, 0(v1)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi t4, t4, 256
sll r0, r0, 0
beq t4, r0, L28
sll r0, r0, 0
B19:
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
beq r0, r0, L27
sll r0, r0, 0
B20:
L28:
dsll t5, t2, 2
sll r0, r0, 0
addiu t4, r0, 1
daddu t5, t5, at
sll r0, r0, 0
lwu t6, 8(t5)
dsll t5, t3, 2
andi t6, t6, 16383
daddu t5, t5, t3
sw t6, 128(v1)
dsll t6, t5, 4
sw a2, 16(v1)
addiu t7, r0, 256
sw t5, 32(v1)
daddu a2, a2, t6
sw t7, 0(v1)
B21:
L29:
sll r0, r0, 0
lw t5, 48(at)
sll r0, r0, 0
lw t6, 44(at)
B22:
L30:
lw t7, 0(t0)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi t7, t7, 256
sll r0, r0, 0
beq t7, r0, L31
sll r0, r0, 0
B23:
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
beq r0, r0, L30
sll r0, r0, 0
B24:
L31:
beq t6, r0, L32
sll r0, r0, 0
B25:
dsll t7, t5, 2
lui t8, 28672
daddu t7, t7, t8
lwu t7, 16(t7)
andi t7, t7, 16383
sw t7, 128(t0)
sw a0, 16(t0)
dsll t7, t6, 3
sw t7, 32(t0)
addiu t7, r0, 256
sw t7, 0(t0)
dsll t6, t6, 7
daddu a0, a0, t6
B26:
L32:
beq t3, r0, L35
sll r0, r0, 0
B27:
bne t4, r0, L35
sll r0, r0, 0
B28:
L33:
lw t6, 0(v1)
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
andi t6, t6, 256
sll r0, r0, 0
beq t6, r0, L34
sll r0, r0, 0
B29:
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
sll r0, r0, 0
beq r0, r0, L33
sll r0, r0, 0
B30:
L34:
dsll t6, t2, 2
lui t7, 28672
daddu t6, t6, t7
lwu t6, 8(t6)
andi t6, t6, 16383
sw t6, 128(v1)
sw a2, 16(v1)
addiu t6, r0, 5
mult3 t6, t6, t3
sw t6, 32(v1)
addiu t6, r0, 256
sw t6, 0(v1)
addiu t6, r0, 80
mult3 t6, t6, t3
daddu a2, a2, t6
B31:
L35:
or t8, t2, r0
bne t1, r0, L22
or t2, t5, r0
B32:
beq a3, r0, L22
addiu a3, r0, 1
B33:
L36:
or v0, r0, r0
ld ra, 0(sp)
lq gp, 80(sp)
lq s5, 64(sp)
lq s4, 48(sp)
lq s3, 32(sp)
lq s2, 16(sp)
jr ra
daddiu sp, sp, 96
```
# VU0 micoprogram
- vf1, vf2, vf3, vf4 = inverse bind pose
- vf5, vf6, vf7, vf8 = input bone matrix.
- `vf28, vf29, vf30, vf31` the camera matrix
- `vf25, vf26, vf27` the camera matrix again
- (vf13, vf14, vf15, vf16) output point transformation
- (vf09, vf10, vf11) output normal transformation
```
First: multiply bone and bind pose: (vf13, vf14, vf15, vf16) = (vf5, vf6, vf7, vf8) * (vf1, vf2, vf3, vf4).
This is doing a true matrix multiplication.
nop | mulax.xyzw ACC, vf05, vf01
nop | madday.xyzw ACC, vf06, vf01
nop | maddaz.xyzw ACC, vf07, vf01
nop | maddw.xyzw vf13, vf08, vf01
nop | mulax.xyzw ACC, vf05, vf02
nop | madday.xyzw ACC, vf06, vf02
nop | maddaz.xyzw ACC, vf07, vf02
nop | maddw.xyzw vf14, vf08, vf02
nop | mulax.xyzw ACC, vf05, vf03
nop | madday.xyzw ACC, vf06, vf03
nop | maddaz.xyzw ACC, vf07, vf03
nop | maddw.xyzw vf15, vf08, vf03
nop | mulax.xyzw ACC, vf05, vf04
nop | madday.xyzw ACC, vf06, vf04
nop | maddaz.xyzw ACC, vf07, vf04
nop | maddw.xyzw vf16, vf08, vf04
;; vf09 = cross(y, z)
nop | opmula.xyz ACC, vf14, vf15
nop | opmsub.xyz vf09, vf15, vf14
;; vf10 = cross(z, x)
nop | opmula.xyz ACC, vf15, vf13
nop | opmsub.xyz vf10, vf13, vf15
;; vf11 = cross(x, y)
nop | opmula.xyz ACC, vf13, vf14
nop | opmsub.xyz vf11, vf14, vf13
;; vf12 = cross (y, z) * x
nop | mul.xyz vf12, vf13, vf09
;; second multiply: doing (vf13....) = cam * (vf5, vf6, vf7, vf8) * (vf1, vf2, vf3, vf4)
nop | mulax.xyzw ACC, vf28, vf13
nop | madday.xyzw ACC, vf29, vf13
nop | maddaz.xyzw ACC, vf30, vf13
nop | maddw.xyzw vf13, vf31, vf13
nop | mulax.w ACC, vf00, vf12
nop | madday.w ACC, vf00, vf12
nop | maddz.w vf12, vf00, vf12
vf12.w = dot (cross(y, z), x) [before the second multiply]
nop | mulax.xyzw ACC, vf28, vf14
nop | madday.xyzw ACC, vf29, vf14
nop | maddaz.xyzw ACC, vf30, vf14
div Q, vf00.w, vf12.w | maddw.xyzw vf14, vf31, vf14 ;; divide
nop | mulax.xyzw ACC, vf28, vf15
nop | madday.xyzw ACC, vf29, vf15
nop | maddaz.xyzw ACC, vf30, vf15
nop | maddw.xyzw vf15, vf31, vf15
nop | mulax.xyzw ACC, vf28, vf16
nop | madday.xyzw ACC, vf29, vf16
nop | maddaz.xyzw ACC, vf30, vf16
nop | maddw.xyzw vf16, vf31, vf16
;; normal scale
nop | mul.xyzw vf09, vf09, Q
nop | mul.xyzw vf10, vf10, Q
nop | mul.xyzw vf11, vf11, Q
;; apply cam to normal matrix too
nop | mulax.xyzw ACC, vf25, vf09
nop | madday.xyzw ACC, vf26, vf09
nop | maddz.xyzw vf09, vf27, vf09
nop | mulax.xyzw ACC, vf25, vf10
nop | madday.xyzw ACC, vf26, vf10
nop | maddz.xyzw vf10, vf27, vf10
nop | mulax.xyzw ACC, vf25, vf11
nop | madday.xyzw ACC, vf26, vf11 :e
nop | maddz.xyzw vf11, vf27, vf11
```

View File

@ -422,6 +422,141 @@
(def-mips2c bones-mtx-calc (function int pointer pointer int object none))
(defmacro .cross.vf (out a b)
`(begin
(.outer.product.a.vf acc ,a ,b)
(.outer.product.b.vf ,out ,b ,a acc)
)
)
(defun new-bones-mtx-calc-asm ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int))
"Compute skinning matrices."
;; (declare (print-asm))
(dotimes (i (- count 1))
(let ((b (-> bones (+ i 1) transform))
(j (-> joints i bind-pose))
(out (-> output (+ i 1)))
)
(rlet (
(tmat0 :class vf)
(tmat1 :class vf)
(tmat2 :class vf)
(tmat3 :class vf)
(nmat0 :class vf)
(nmat1 :class vf)
(nmat2 :class vf)
(nmat3 :class vf)
(acc :class vf )
(vf0 :class vf )
(cam0 :class vf )
(cam1 :class vf )
(cam2 :class vf )
(cam3 :class vf )
)
(init-vf0-vector)
;; load bind-pose to tmat:
(.lvf tmat0 (&-> j quad 0))
(.lvf tmat1 (&-> j quad 1))
(.lvf tmat2 (&-> j quad 2))
(.lvf tmat3 (&-> j quad 3))
;; load bone to nmat
(.lvf nmat0 (&-> b quad 0))
(.lvf nmat1 (&-> b quad 1))
(.lvf nmat2 (&-> b quad 2))
(.lvf nmat3 (&-> b quad 3))
;; multiply bone and bind pose, store in tmat
(.mul.x.vf acc nmat0 tmat0)
(.add.mul.y.vf acc nmat1 tmat0 acc)
(.add.mul.z.vf acc nmat2 tmat0 acc)
(.add.mul.w.vf tmat0 nmat3 tmat0 acc)
(.mul.x.vf acc nmat0 tmat1)
(.add.mul.y.vf acc nmat1 tmat1 acc)
(.add.mul.z.vf acc nmat2 tmat1 acc)
(.add.mul.w.vf tmat1 nmat3 tmat1 acc)
(.mul.x.vf acc nmat0 tmat2)
(.add.mul.y.vf acc nmat1 tmat2 acc)
(.add.mul.z.vf acc nmat2 tmat2 acc)
(.add.mul.w.vf tmat2 nmat3 tmat2 acc)
(.mul.x.vf acc nmat0 tmat3)
(.add.mul.y.vf acc nmat1 tmat3 acc)
(.add.mul.z.vf acc nmat2 tmat3 acc)
(.add.mul.w.vf tmat3 nmat3 tmat3 acc)
;; compute inverse transpose, storing in nmat.
(.cross.vf nmat0 tmat1 tmat2)
(.cross.vf nmat1 tmat2 tmat0)
(.cross.vf nmat2 tmat0 tmat1)
;; dot nmat0 and tmat0
(.mul.vf acc nmat0 tmat0)
(.add.y.vf acc acc acc :mask #b1)
(.add.z.vf acc acc acc :mask #b1)
;; divide!
(.div.vf acc vf0 acc :fsf #b11 :ftf #b0)
;; scale nmat:
(.mul.x.vf nmat0 nmat0 acc)
(.mul.x.vf nmat1 nmat1 acc)
(.mul.x.vf nmat2 nmat2 acc)
;; load camera
(.lvf cam0 (&-> cam quad 0))
(.lvf cam1 (&-> cam quad 1))
(.lvf cam2 (&-> cam quad 2))
(.lvf cam3 (&-> cam quad 3))
;; multiply tmat by camera
(.mul.x.vf acc cam0 tmat0)
(.add.mul.y.vf acc cam1 tmat0 acc)
(.add.mul.z.vf acc cam2 tmat0 acc)
(.add.mul.w.vf tmat0 cam3 tmat0 acc)
(.mul.x.vf acc cam0 tmat1)
(.add.mul.y.vf acc cam1 tmat1 acc)
(.add.mul.z.vf acc cam2 tmat1 acc)
(.add.mul.w.vf tmat1 cam3 tmat1 acc)
(.mul.x.vf acc cam0 tmat2)
(.add.mul.y.vf acc cam1 tmat2 acc)
(.add.mul.z.vf acc cam2 tmat2 acc)
(.add.mul.w.vf tmat2 cam3 tmat2 acc)
(.mul.x.vf acc cam0 tmat3)
(.add.mul.y.vf acc cam1 tmat3 acc)
(.add.mul.z.vf acc cam2 tmat3 acc)
(.add.mul.w.vf tmat3 cam3 tmat3 acc)
;; store tmat
(.svf (&-> out t-mtx quad 0) tmat0)
(.svf (&-> out t-mtx quad 1) tmat1)
(.svf (&-> out t-mtx quad 2) tmat2)
(.svf (&-> out t-mtx quad 3) tmat3)
;; multiply nmat
(.mul.x.vf acc cam0 nmat0)
(.add.mul.y.vf acc cam1 nmat0 acc)
(.add.mul.z.vf nmat0 cam2 nmat0 acc)
(.mul.x.vf acc cam0 nmat1)
(.add.mul.y.vf acc cam1 nmat1 acc)
(.add.mul.z.vf nmat1 cam2 nmat1 acc)
(.mul.x.vf acc cam0 nmat2)
(.add.mul.y.vf acc cam1 nmat2 acc)
(.add.mul.z.vf nmat2 cam2 nmat2 acc)
;; store nmat
(.svf (&-> out n-mtx quad 0) nmat0)
(.svf (&-> out n-mtx quad 1) nmat1)
(.svf (&-> out n-mtx quad 2) nmat2)
)
)
)
(none)
)
(define *use-new-bones* #t)
(defun bones-mtx-calc-execute ()
"Do all pending bone calculations"
(local-vars (v1-14 float))
@ -481,13 +616,20 @@
(.mov v1-14 vf27)
;; hack??
(bones-mtx-calc
(the-as int (-> s4-0 matrix-area))
(the-as pointer (-> s4-0 joints))
(the-as pointer (-> s4-0 bones))
(the-as int (-> s4-0 num-bones))
v1-13 ;; hack, added
(if *use-new-bones*
(new-bones-mtx-calc-asm
(the (inline-array pris-mtx) (-> s4-0 matrix-area))
(-> s4-0 joints)
(-> s4-0 bones)
v1-13
(the int (-> s4-0 num-bones)))
(bones-mtx-calc
(the-as int (-> s4-0 matrix-area))
(the-as pointer (-> s4-0 joints))
(the-as pointer (-> s4-0 bones))
(the-as int (-> s4-0 num-bones))
v1-13 ;; hack, added
)
)
)
(when (logtest? (-> s4-0 flags) (bone-calc-flags bncfl00))

View File

@ -120,16 +120,224 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
;; see the C++ code for more details.
(def-mips2c bones-mtx-calc (function (inline-array pris-mtx) (inline-array joint) (inline-array bone) uint object none))
(defun matrix-*float! ((output matrix3) (input matrix3) (x float))
(dotimes (i 12)
(set! (-> output data i) (* x (-> input data i)))
)
output
)
(defun matrix*!-first-three ((arg0 matrix3) (arg1 matrix3) (arg2 matrix))
"Set dst = src1 * src2. It is okay for any arguments to be the same data.
This is a moderately efficient implementation."
(rlet ((acc :class vf)
(vf10 :class vf)
(vf11 :class vf)
(vf12 :class vf)
(vf14 :class vf)
(vf15 :class vf)
(vf16 :class vf)
(vf18 :class vf)
(vf19 :class vf)
(vf20 :class vf)
)
(.lvf vf10 (&-> arg1 quad 0))
(.lvf vf14 (&-> arg2 quad 0))
(.lvf vf15 (&-> arg2 quad 1))
(.lvf vf16 (&-> arg2 quad 2))
(.lvf vf11 (&-> arg1 quad 1))
(.lvf vf12 (&-> arg1 quad 2))
(.mul.x.vf acc vf14 vf10)
(.add.mul.y.vf acc vf15 vf10 acc)
(.add.mul.z.vf vf18 vf16 vf10 acc)
(.mul.x.vf acc vf14 vf11)
(.add.mul.y.vf acc vf15 vf11 acc)
(.add.mul.z.vf vf19 vf16 vf11 acc)
(.mul.x.vf acc vf14 vf12)
(.add.mul.y.vf acc vf15 vf12 acc)
(.add.mul.z.vf vf20 vf16 vf12 acc)
(.svf (&-> arg0 quad 0) vf18)
(.svf (&-> arg0 quad 1) vf19)
(.svf (&-> arg0 quad 2) vf20)
arg0
)
)
(defun new-bones-mtx-calc ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int))
(dotimes (i (- count 1))
(let ((b (-> bones (+ i 1) transform))
(j (-> joints i bind-pose))
(out (-> output (+ i 1)))
)
;; multiply by bind pose
;; mult swaps the args
(matrix*! (-> out t-mtx) j b)
;; clever way to compute inverse transpose of a 3x3:
(vector-cross! (-> out n-mtx vector 0)
(-> out t-mtx vector 1)
(-> out t-mtx vector 2)
)
(vector-cross! (-> out n-mtx vector 1)
(-> out t-mtx vector 2)
(-> out t-mtx vector 0)
)
(vector-cross! (-> out n-mtx vector 2)
(-> out t-mtx vector 0)
(-> out t-mtx vector 1)
)
(let ((scale (/ 1. (vector-dot (-> out n-mtx vector 0) (-> out t-mtx vector 0)))))
(matrix-*float! (-> out n-mtx) (-> out n-mtx) scale)
)
;; multiply by camera
(matrix*! (-> out t-mtx) (-> out t-mtx) cam)
(matrix*!-first-three (-> out n-mtx) (-> out n-mtx) cam) ;; WRONG!!
)
)
)
(defmacro .cross.vf (out a b)
`(begin
(.outer.product.a.vf acc ,a ,b)
(.outer.product.b.vf ,out ,b ,a acc)
)
)
(defun new-bones-mtx-calc-asm ((output (inline-array pris-mtx)) (joints (inline-array joint)) (bones (inline-array bone)) (cam matrix) (count int))
;; (declare (print-asm))
(dotimes (i (- count 1))
(let ((b (-> bones (+ i 1) transform))
(j (-> joints i bind-pose))
(out (-> output (+ i 1)))
)
(rlet (
(tmat0 :class vf)
(tmat1 :class vf)
(tmat2 :class vf)
(tmat3 :class vf)
(nmat0 :class vf)
(nmat1 :class vf)
(nmat2 :class vf)
(nmat3 :class vf)
(acc :class vf )
(vf0 :class vf )
(cam0 :class vf )
(cam1 :class vf )
(cam2 :class vf )
(cam3 :class vf )
)
(init-vf0-vector)
;; load bind-pose to tmat:
(.lvf tmat0 (&-> j quad 0))
(.lvf tmat1 (&-> j quad 1))
(.lvf tmat2 (&-> j quad 2))
(.lvf tmat3 (&-> j quad 3))
;; load bone to nmat
(.lvf nmat0 (&-> b quad 0))
(.lvf nmat1 (&-> b quad 1))
(.lvf nmat2 (&-> b quad 2))
(.lvf nmat3 (&-> b quad 3))
;; multiply, store in tmat
(.mul.x.vf acc nmat0 tmat0)
(.add.mul.y.vf acc nmat1 tmat0 acc)
(.add.mul.z.vf acc nmat2 tmat0 acc)
(.add.mul.w.vf tmat0 nmat3 tmat0 acc)
(.mul.x.vf acc nmat0 tmat1)
(.add.mul.y.vf acc nmat1 tmat1 acc)
(.add.mul.z.vf acc nmat2 tmat1 acc)
(.add.mul.w.vf tmat1 nmat3 tmat1 acc)
(.mul.x.vf acc nmat0 tmat2)
(.add.mul.y.vf acc nmat1 tmat2 acc)
(.add.mul.z.vf acc nmat2 tmat2 acc)
(.add.mul.w.vf tmat2 nmat3 tmat2 acc)
(.mul.x.vf acc nmat0 tmat3)
(.add.mul.y.vf acc nmat1 tmat3 acc)
(.add.mul.z.vf acc nmat2 tmat3 acc)
(.add.mul.w.vf tmat3 nmat3 tmat3 acc)
;; compute inverse transpose, storing in nmat
(.cross.vf nmat0 tmat1 tmat2)
(.cross.vf nmat1 tmat2 tmat0)
(.cross.vf nmat2 tmat0 tmat1)
;; dot nmat0 and tmat0
(.mul.vf acc nmat0 tmat0)
(.add.y.vf acc acc acc :mask #b1)
(.add.z.vf acc acc acc :mask #b1)
;; divide!
(.div.vf acc vf0 acc :fsf #b11 :ftf #b0)
;; scale nmat:
(.mul.x.vf nmat0 nmat0 acc)
(.mul.x.vf nmat1 nmat1 acc)
(.mul.x.vf nmat2 nmat2 acc)
;; load camera
(.lvf cam0 (&-> cam quad 0))
(.lvf cam1 (&-> cam quad 1))
(.lvf cam2 (&-> cam quad 2))
(.lvf cam3 (&-> cam quad 3))
;; multiply tmat by camera
(.mul.x.vf acc cam0 tmat0)
(.add.mul.y.vf acc cam1 tmat0 acc)
(.add.mul.z.vf acc cam2 tmat0 acc)
(.add.mul.w.vf tmat0 cam3 tmat0 acc)
(.mul.x.vf acc cam0 tmat1)
(.add.mul.y.vf acc cam1 tmat1 acc)
(.add.mul.z.vf acc cam2 tmat1 acc)
(.add.mul.w.vf tmat1 cam3 tmat1 acc)
(.mul.x.vf acc cam0 tmat2)
(.add.mul.y.vf acc cam1 tmat2 acc)
(.add.mul.z.vf acc cam2 tmat2 acc)
(.add.mul.w.vf tmat2 cam3 tmat2 acc)
(.mul.x.vf acc cam0 tmat3)
(.add.mul.y.vf acc cam1 tmat3 acc)
(.add.mul.z.vf acc cam2 tmat3 acc)
(.add.mul.w.vf tmat3 cam3 tmat3 acc)
;; store tmat
(.svf (&-> out t-mtx quad 0) tmat0)
(.svf (&-> out t-mtx quad 1) tmat1)
(.svf (&-> out t-mtx quad 2) tmat2)
(.svf (&-> out t-mtx quad 3) tmat3)
;; multiply nmat
(.mul.x.vf acc cam0 nmat0)
(.add.mul.y.vf acc cam1 nmat0 acc)
(.add.mul.z.vf nmat0 cam2 nmat0 acc)
(.mul.x.vf acc cam0 nmat1)
(.add.mul.y.vf acc cam1 nmat1 acc)
(.add.mul.z.vf nmat1 cam2 nmat1 acc)
(.mul.x.vf acc cam0 nmat2)
(.add.mul.y.vf acc cam1 nmat2 acc)
(.add.mul.z.vf nmat2 cam2 nmat2 acc)
;; store nmat
(.svf (&-> out n-mtx quad 0) nmat0)
(.svf (&-> out n-mtx quad 1) nmat1)
(.svf (&-> out n-mtx quad 2) nmat2)
)
)
)
(none)
)
(define *use-new-bones* #t)
(define *display-bone-stats* #f)
(define *num-bones* 0)
(defun bones-mtx-calc-execute ()
"Execute all bone matrix calculations."
(rlet ((vf1 :class vf)
(vf25 :class vf)
(vf26 :class vf)
(vf27 :class vf)
(vf28 :class vf)
(vf29 :class vf)
(vf30 :class vf)
(vf31 :class vf)
(vf4 :class vf)
(vf5 :class vf)
(vf6 :class vf)
@ -157,6 +365,7 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
(s5-1 (-> *math-camera* camera-rot))
(s4-1 (-> v1-26 first))
)
(set! *num-bones* 0)
(while (nonzero? s4-1) ;; loop
;; pick correct matrix. The pris-mtx includes camera rotation, so all that's needed in the final
@ -167,23 +376,19 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
)
)
)
(.lvf vf28 (&-> v1-29 quad 0))
(.lvf vf29 (&-> v1-29 quad 1))
(.lvf vf30 (&-> v1-29 quad 2))
(.lvf vf31 (&-> v1-29 trans quad))
(.lvf vf25 (&-> v1-29 quad 0))
(.lvf vf26 (&-> v1-29 quad 1))
(.lvf vf27 (&-> v1-29 quad 2))
;; calculate!
(bones-mtx-calc
(-> s4-1 matrix-area)
(-> s4-1 joints)
(-> s4-1 bones)
(-> s4-1 num-bones)
v1-29 ;; hack, to pass matrix to bones-mtx-calc in a better way.
)
(if *use-new-bones*
(new-bones-mtx-calc-asm (-> s4-1 matrix-area) (-> s4-1 joints) (-> s4-1 bones) v1-29 (the int (-> s4-1 num-bones)))
(bones-mtx-calc
(-> s4-1 matrix-area)
(-> s4-1 joints)
(-> s4-1 bones)
(-> s4-1 num-bones)
v1-29 ;; hack, to pass matrix to bones-mtx-calc in a better way.
)
)
)
(+! *num-bones* (-> s4-1 num-bones))
;; there is an optional post-processing step for ripple.
(when (logtest? (-> s4-1 flags) (bone-calc-flags write-ripple-data))
@ -228,6 +433,10 @@ this is done by a linked list of "bone calculations", which is stashed in the dm
)
)
(when *display-bone-stats*
(format *stdcon* "num-bones: ~D~%" *num-bones*)
)
;; reset sqwc
;; (set! (-> (the-as dma-bank-control #x1000e000) sqwc) (new 'static 'dma-sqwc :sqwc #x1 :tqwc #x1))

View File

@ -1233,17 +1233,17 @@ Val* Compiler::compile_asm_div_vf(const goos::Object& form, const goos::Object&
// Why do we even bother using VDIVPS instead of FDIV? Because otherwise in x86, you have to use
// the FPU stack Registers are nicer.
// Save one temp reg, use the destination as one
auto temp_reg = env->make_vfr(dest->type());
auto temp_reg1 = env->make_vfr(dest->type());
auto temp_reg2 = env->make_vfr(dest->type());
// Splat src1's value into the dest reg, keep it simple, this way no matter which vector component
// Splat src1's value into a temp reg, keep it simple, this way no matter which vector component
// is accessed from the final result will be the correct answer
env->emit_ir<IR_SplatVF>(form, color, dest, src1, ftf_fsf_to_vector_element(fsf));
env->emit_ir<IR_SplatVF>(form, color, temp_reg1, src1, ftf_fsf_to_vector_element(fsf));
// Splat src1's value into the the temp reg
env->emit_ir<IR_SplatVF>(form, color, temp_reg, src2, ftf_fsf_to_vector_element(ftf));
env->emit_ir<IR_SplatVF>(form, color, temp_reg2, src2, ftf_fsf_to_vector_element(ftf));
// Perform the Division
env->emit_ir<IR_VFMath3Asm>(form, color, dest, dest, temp_reg, IR_VFMath3Asm::Kind::DIV);
env->emit_ir<IR_VFMath3Asm>(form, color, dest, temp_reg1, temp_reg2, IR_VFMath3Asm::Kind::DIV);
return get_none();
}

View File

@ -332,7 +332,7 @@ const std::vector<emitter::Register>& get_alloc_order(int var_idx,
if (is_gpr) {
return REG_temp_first_order.gprs;
} else {
return REG_temp_only_order.xmms;
return REG_temp_first_order.xmms;
}
}
}
@ -1060,8 +1060,10 @@ bool run_assignment_on_var(const AllocationInput& input,
for (auto& reg : assign_order) {
bool worked = check_register_assign(input, *cache, var_idx, reg);
if (trace) {
lg::print("m2 trying var {} in {}: {}\n", cache->iregs.at(var_idx).to_string(),
reg.print(), worked);
const auto& this_var = cache->vars.at(var_idx);
lg::print("m2 trying var {} in {} (live {} to {}): {}\n",
cache->iregs.at(var_idx).to_string(), reg.print(), this_var.first_live(),
this_var.last_live(), worked);
}
if (worked) {
var.assign_to_register(reg);

View File

@ -24,7 +24,7 @@ void print_allocate_input(const AllocationInput& in) {
}
} else {
for (const auto& instruction : in.instructions) {
lg::print(" [{:3d}] {}\n", instruction.print());
lg::print(" {}\n", instruction.print());
}
}
lg::print("[RegAlloc] Debug Input Constraints:\n");