From d2c04c5c511cca3daeed292a397a5ae8ae2aa6a7 Mon Sep 17 00:00:00 2001
From: RSDuck <rsduck@users.noreply.github.com>
Date: Tue, 3 Nov 2020 21:13:49 +0100
Subject: GX: add fastpath for single parameter cmds

---
 src/GPU3D.cpp | 677 +++++++++++++++++++++++++++++++---------------------------
 1 file changed, 357 insertions(+), 320 deletions(-)

diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp
index 7b30426..64e67e7 100644
--- a/src/GPU3D.cpp
+++ b/src/GPU3D.cpp
@@ -99,7 +99,7 @@
 namespace GPU3D
 {
 
-const u32 CmdNumParams[256] =
+const u8 CmdNumParams[256] =
 {
     // 0x00
     0,
@@ -1798,7 +1798,37 @@ CmdFIFOEntry CmdFIFORead()
     return ret;
 }
 
+inline void VertexPipelineSubmitCmd()
+{
+    // vertex commands 0x23-0x28: call NextVertexSlot() when bit 0 of VertexSlotsFree is clear, otherwise charge one cycle
+    if (!(VertexSlotsFree & 0x1)) NextVertexSlot();
+    else                          AddCycles(1);
+    NormalPipeline = 0;
+}
+
+inline void VertexPipelineCmdDelayed6()
+{
+    // entry delay for commands 0x20, 0x30, 0x31, 0x72, which can run 6 cycles after a vertex; NormalPipeline is reset afterwards
+    if (VertexPipeline > 2) AddCycles((VertexPipeline - 2) + 1);
+    else                    AddCycles(NormalPipeline + 1);
+    NormalPipeline = 0;
+}
+
+inline void VertexPipelineCmdDelayed8()
+{
+    // entry delay for commands 0x29, 0x2A, 0x2B, 0x33, 0x34, 0x41, 0x60, 0x71, which can run 8 cycles after a vertex; NormalPipeline is reset afterwards
+    if (VertexPipeline > 0) AddCycles(VertexPipeline + 1);
+    else                    AddCycles(NormalPipeline + 1);
+    NormalPipeline = 0;
+}
 
+inline void VertexPipelineCmdDelayed4()
+{
+    // entry delay for all other commands, which can run 4 cycles after a vertex;
+    // that is the minimum, so only NormalPipeline + 1 cycles are charged before NormalPipeline is reset
+    AddCycles(NormalPipeline + 1);
+    NormalPipeline = 0;
+}
 
 void ExecuteCommand()
 {
@@ -1809,81 +1839,23 @@ void ExecuteCommand()
     // each FIFO entry takes 1 cycle to be processed
     // commands (presumably) run when all the needed parameters have been read
     // which is where we add the remaining cycles if any
-    if (ExecParamCount == 0)
-    {
-        // delay the first command entry as needed
-        switch (entry.Command)
-        {
-        // commands that stall the polygon pipeline
-        case 0x32: StallPolygonPipeline(8 + 1,  2); break; // 32 can run 6 cycles after a vertex
-        case 0x40: StallPolygonPipeline(1,      0); break;
-        case 0x70: StallPolygonPipeline(10 + 1, 0); break;
-
-        case 0x23:
-        case 0x24:
-        case 0x25:
-        case 0x26:
-        case 0x27:
-        case 0x28:
-            // vertex
-            if (!(VertexSlotsFree & 0x1)) NextVertexSlot();
-            else                          AddCycles(1);
-            NormalPipeline = 0;
-            break;
-
-        case 0x20:
-        case 0x30:
-        case 0x31:
-        case 0x72:
-            // commands that can run 6 cycles after a vertex
-            if (VertexPipeline > 2) AddCycles((VertexPipeline - 2) + 1);
-            else                    AddCycles(NormalPipeline + 1);
-            NormalPipeline = 0;
-            break;
 
-        case 0x29:
-        case 0x2A:
-        case 0x2B:
-        case 0x33:
-        case 0x34:
-        case 0x41:
-        case 0x60:
-        case 0x71:
-            // command that can run 8 cycles after a vertex
-            if (VertexPipeline > 0) AddCycles(VertexPipeline + 1);
-            else                    AddCycles(NormalPipeline + 1);
-            NormalPipeline = 0;
-            break;
-
-        default:
-            // all other commands can run 4 cycles after a vertex
-            // no need to do much here since that is the minimum
-            AddCycles(NormalPipeline + 1);
-            NormalPipeline = 0;
-            break;
-        }
-    }
-    else
-        AddCycles(1);
-
-    ExecParams[ExecParamCount] = entry.Param;
-    ExecParamCount++;
-
-    if (ExecParamCount >= CmdNumParams[entry.Command])
+    u32 paramsRequiredCount = CmdNumParams[entry.Command];
+    if (paramsRequiredCount <= 1)
     {
-        /*printf("[GXS:%08X] 0x%02X,  ", GXStat, entry.Command);
-        for (int k = 0; k < ExecParamCount; k++) printf("0x%08X, ", ExecParams[k]);
-        printf("\n");*/
+        // fast path for commands which take at most one parameter
 
-        ExecParamCount = 0;
+        /*printf("[GXS:%08X] 0x%02X,  0x%08X", GXStat, entry.Command, entry.Param);*/
 
         switch (entry.Command)
         {
         case 0x10: // matrix mode
-            MatrixMode = ExecParams[0] & 0x3;
+            VertexPipelineCmdDelayed4();
+            MatrixMode = entry.Param & 0x3;
             break;
 
         case 0x11: // push matrix
+            VertexPipelineCmdDelayed4();
             NumPushPopCommands--;
             if (MatrixMode == 0)
             {
@@ -1914,6 +1886,7 @@ void ExecuteCommand()
             break;
 
         case 0x12: // pop matrix
+            VertexPipelineCmdDelayed4();
             NumPushPopCommands--;
             if (MatrixMode == 0)
             {
@@ -1936,7 +1909,7 @@ void ExecuteCommand()
             }
             else
             {
-                s32 offset = (s32)(ExecParams[0] << 26) >> 26;
+                s32 offset = (s32)(entry.Param << 26) >> 26;
                 PosMatrixStackPointer -= offset;
                 PosMatrixStackPointer &= 0x3F;
 
@@ -1950,6 +1923,7 @@ void ExecuteCommand()
             break;
 
         case 0x13: // store matrix
+            VertexPipelineCmdDelayed4();
             if (MatrixMode == 0)
             {
                 memcpy(ProjMatrixStack, ProjMatrix, 16*4);
@@ -1960,7 +1934,7 @@ void ExecuteCommand()
             }
             else
             {
-                u32 addr = ExecParams[0] & 0x1F;
+                u32 addr = entry.Param & 0x1F;
                 if (addr > 30) GXStat |= (1<<15);
 
                 memcpy(PosMatrixStack[addr], PosMatrix, 16*4);
@@ -1970,6 +1944,7 @@ void ExecuteCommand()
             break;
 
         case 0x14: // restore matrix
+            VertexPipelineCmdDelayed4();
             if (MatrixMode == 0)
             {
                 memcpy(ProjMatrix, ProjMatrixStack, 16*4);
@@ -1983,7 +1958,7 @@ void ExecuteCommand()
             }
             else
             {
-                u32 addr = ExecParams[0] & 0x1F;
+                u32 addr = entry.Param & 0x1F;
                 if (addr > 30) GXStat |= (1<<15);
 
                 memcpy(PosMatrix, PosMatrixStack[addr], 16*4);
@@ -1994,6 +1969,7 @@ void ExecuteCommand()
             break;
 
         case 0x15: // identity
+            VertexPipelineCmdDelayed4();
             if (MatrixMode == 0)
             {
                 MatrixLoadIdentity(ProjMatrix);
@@ -2012,173 +1988,10 @@ void ExecuteCommand()
             }
             break;
 
-        case 0x16: // load 4x4
-            if (MatrixMode == 0)
-            {
-                MatrixLoad4x4(ProjMatrix, (s32*)ExecParams);
-                ClipMatrixDirty = true;
-                AddCycles(18);
-            }
-            else if (MatrixMode == 3)
-            {
-                MatrixLoad4x4(TexMatrix, (s32*)ExecParams);
-                AddCycles(10);
-            }
-            else
-            {
-                MatrixLoad4x4(PosMatrix, (s32*)ExecParams);
-                if (MatrixMode == 2)
-                    MatrixLoad4x4(VecMatrix, (s32*)ExecParams);
-                ClipMatrixDirty = true;
-                AddCycles(18);
-            }
-            break;
-
-        case 0x17: // load 4x3
-            if (MatrixMode == 0)
-            {
-                MatrixLoad4x3(ProjMatrix, (s32*)ExecParams);
-                ClipMatrixDirty = true;
-                AddCycles(18);
-            }
-            else if (MatrixMode == 3)
-            {
-                MatrixLoad4x3(TexMatrix, (s32*)ExecParams);
-                AddCycles(7);
-            }
-            else
-            {
-                MatrixLoad4x3(PosMatrix, (s32*)ExecParams);
-                if (MatrixMode == 2)
-                    MatrixLoad4x3(VecMatrix, (s32*)ExecParams);
-                ClipMatrixDirty = true;
-                AddCycles(18);
-            }
-            break;
-
-        case 0x18: // mult 4x4
-            if (MatrixMode == 0)
-            {
-                MatrixMult4x4(ProjMatrix, (s32*)ExecParams);
-                ClipMatrixDirty = true;
-                AddCycles(35 - 16);
-            }
-            else if (MatrixMode == 3)
-            {
-                MatrixMult4x4(TexMatrix, (s32*)ExecParams);
-                AddCycles(33 - 16);
-            }
-            else
-            {
-                MatrixMult4x4(PosMatrix, (s32*)ExecParams);
-                if (MatrixMode == 2)
-                {
-                    MatrixMult4x4(VecMatrix, (s32*)ExecParams);
-                    AddCycles(35 + 30 - 16);
-                }
-                else AddCycles(35 - 16);
-                ClipMatrixDirty = true;
-            }
-            break;
-
-        case 0x19: // mult 4x3
-            if (MatrixMode == 0)
-            {
-                MatrixMult4x3(ProjMatrix, (s32*)ExecParams);
-                ClipMatrixDirty = true;
-                AddCycles(35 - 12);
-            }
-            else if (MatrixMode == 3)
-            {
-                MatrixMult4x3(TexMatrix, (s32*)ExecParams);
-                AddCycles(33 - 12);
-            }
-            else
-            {
-                MatrixMult4x3(PosMatrix, (s32*)ExecParams);
-                if (MatrixMode == 2)
-                {
-                    MatrixMult4x3(VecMatrix, (s32*)ExecParams);
-                    AddCycles(35 + 30 - 12);
-                }
-                else AddCycles(35 - 12);
-                ClipMatrixDirty = true;
-            }
-            break;
-
-        case 0x1A: // mult 3x3
-            if (MatrixMode == 0)
-            {
-                MatrixMult3x3(ProjMatrix, (s32*)ExecParams);
-                ClipMatrixDirty = true;
-                AddCycles(35 - 9);
-            }
-            else if (MatrixMode == 3)
-            {
-                MatrixMult3x3(TexMatrix, (s32*)ExecParams);
-                AddCycles(33 - 9);
-            }
-            else
-            {
-                MatrixMult3x3(PosMatrix, (s32*)ExecParams);
-                if (MatrixMode == 2)
-                {
-                    MatrixMult3x3(VecMatrix, (s32*)ExecParams);
-                    AddCycles(35 + 30 - 9);
-                }
-                else AddCycles(35 - 9);
-                ClipMatrixDirty = true;
-            }
-            break;
-
-        case 0x1B: // scale
-            if (MatrixMode == 0)
-            {
-                MatrixScale(ProjMatrix, (s32*)ExecParams);
-                ClipMatrixDirty = true;
-                AddCycles(35 - 3);
-            }
-            else if (MatrixMode == 3)
-            {
-                MatrixScale(TexMatrix, (s32*)ExecParams);
-                AddCycles(33 - 3);
-            }
-            else
-            {
-                MatrixScale(PosMatrix, (s32*)ExecParams);
-                ClipMatrixDirty = true;
-                AddCycles(35 - 3);
-            }
-            break;
-
-        case 0x1C: // translate
-            if (MatrixMode == 0)
-            {
-                MatrixTranslate(ProjMatrix, (s32*)ExecParams);
-                ClipMatrixDirty = true;
-                AddCycles(35 - 3);
-            }
-            else if (MatrixMode == 3)
-            {
-                MatrixTranslate(TexMatrix, (s32*)ExecParams);
-                AddCycles(33 - 3);
-            }
-            else
-            {
-                MatrixTranslate(PosMatrix, (s32*)ExecParams);
-                if (MatrixMode == 2)
-                {
-                    MatrixTranslate(VecMatrix, (s32*)ExecParams);
-                    AddCycles(35 + 30 - 3);
-                }
-                else AddCycles(35 - 3);
-                ClipMatrixDirty = true;
-            }
-            break;
-
         case 0x20: // vertex color
+            VertexPipelineCmdDelayed6();
             {
-                u32 c = ExecParams[0];
+                u32 c = entry.Param;
                 u32 r = c & 0x1F;
                 u32 g = (c >> 5) & 0x1F;
                 u32 b = (c >> 10) & 0x1F;
@@ -2189,15 +2002,17 @@ void ExecuteCommand()
             break;
 
         case 0x21: // normal
-            Normal[0] = (s16)((ExecParams[0] & 0x000003FF) << 6) >> 6;
-            Normal[1] = (s16)((ExecParams[0] & 0x000FFC00) >> 4) >> 6;
-            Normal[2] = (s16)((ExecParams[0] & 0x3FF00000) >> 14) >> 6;
+            VertexPipelineCmdDelayed4();
+            Normal[0] = (s16)((entry.Param & 0x000003FF) << 6) >> 6;
+            Normal[1] = (s16)((entry.Param & 0x000FFC00) >> 4) >> 6;
+            Normal[2] = (s16)((entry.Param & 0x3FF00000) >> 14) >> 6;
             CalculateLighting();
             break;
 
         case 0x22: // texcoord
-            RawTexCoords[0] = ExecParams[0] & 0xFFFF;
-            RawTexCoords[1] = ExecParams[0] >> 16;
+            VertexPipelineCmdDelayed4();
+            RawTexCoords[0] = entry.Param & 0xFFFF;
+            RawTexCoords[1] = entry.Param >> 16;
             if ((TexParam >> 30) == 1)
             {
                 TexCoords[0] = (RawTexCoords[0]*TexMatrix[0] + RawTexCoords[1]*TexMatrix[4] + TexMatrix[8] + TexMatrix[12]) >> 12;
@@ -2210,65 +2025,67 @@ void ExecuteCommand()
             }
             break;
 
-        case 0x23: // full vertex
-            CurVertex[0] = ExecParams[0] & 0xFFFF;
-            CurVertex[1] = ExecParams[0] >> 16;
-            CurVertex[2] = ExecParams[1] & 0xFFFF;
-            SubmitVertex();
-            break;
-
         case 0x24: // 10-bit vertex
-            CurVertex[0] = (ExecParams[0] & 0x000003FF) << 6;
-            CurVertex[1] = (ExecParams[0] & 0x000FFC00) >> 4;
-            CurVertex[2] = (ExecParams[0] & 0x3FF00000) >> 14;
+            VertexPipelineSubmitCmd();
+            CurVertex[0] = (entry.Param & 0x000003FF) << 6;
+            CurVertex[1] = (entry.Param & 0x000FFC00) >> 4;
+            CurVertex[2] = (entry.Param & 0x3FF00000) >> 14;
             SubmitVertex();
             break;
 
         case 0x25: // vertex XY
-            CurVertex[0] = ExecParams[0] & 0xFFFF;
-            CurVertex[1] = ExecParams[0] >> 16;
+            VertexPipelineSubmitCmd();
+            CurVertex[0] = entry.Param & 0xFFFF;
+            CurVertex[1] = entry.Param >> 16;
             SubmitVertex();
             break;
 
         case 0x26: // vertex XZ
-            CurVertex[0] = ExecParams[0] & 0xFFFF;
-            CurVertex[2] = ExecParams[0] >> 16;
+            VertexPipelineSubmitCmd();
+            CurVertex[0] = entry.Param & 0xFFFF;
+            CurVertex[2] = entry.Param >> 16;
             SubmitVertex();
             break;
 
         case 0x27: // vertex YZ
-            CurVertex[1] = ExecParams[0] & 0xFFFF;
-            CurVertex[2] = ExecParams[0] >> 16;
+            VertexPipelineSubmitCmd();
+            CurVertex[1] = entry.Param & 0xFFFF;
+            CurVertex[2] = entry.Param >> 16;
             SubmitVertex();
             break;
 
         case 0x28: // 10-bit delta vertex
-            CurVertex[0] += (s16)((ExecParams[0] & 0x000003FF) << 6) >> 6;
-            CurVertex[1] += (s16)((ExecParams[0] & 0x000FFC00) >> 4) >> 6;
-            CurVertex[2] += (s16)((ExecParams[0] & 0x3FF00000) >> 14) >> 6;
+            VertexPipelineSubmitCmd();
+            CurVertex[0] += (s16)((entry.Param & 0x000003FF) << 6) >> 6;
+            CurVertex[1] += (s16)((entry.Param & 0x000FFC00) >> 4) >> 6;
+            CurVertex[2] += (s16)((entry.Param & 0x3FF00000) >> 14) >> 6;
             SubmitVertex();
             break;
 
         case 0x29: // polygon attributes
-            PolygonAttr = ExecParams[0];
+            VertexPipelineCmdDelayed8();
+            PolygonAttr = entry.Param;
             break;
 
         case 0x2A: // texture param
-            TexParam = ExecParams[0];
+            VertexPipelineCmdDelayed8();
+            TexParam = entry.Param;
             break;
 
         case 0x2B: // texture palette
-            TexPalette = ExecParams[0] & 0x1FFF;
+            VertexPipelineCmdDelayed8();
+            TexPalette = entry.Param & 0x1FFF;
             break;
 
         case 0x30: // diffuse/ambient material
-            MatDiffuse[0] = ExecParams[0] & 0x1F;
-            MatDiffuse[1] = (ExecParams[0] >> 5) & 0x1F;
-            MatDiffuse[2] = (ExecParams[0] >> 10) & 0x1F;
-            MatAmbient[0] = (ExecParams[0] >> 16) & 0x1F;
-            MatAmbient[1] = (ExecParams[0] >> 21) & 0x1F;
-            MatAmbient[2] = (ExecParams[0] >> 26) & 0x1F;
-            if (ExecParams[0] & 0x8000)
+            VertexPipelineCmdDelayed6();
+            MatDiffuse[0] = entry.Param & 0x1F;
+            MatDiffuse[1] = (entry.Param >> 5) & 0x1F;
+            MatDiffuse[2] = (entry.Param >> 10) & 0x1F;
+            MatAmbient[0] = (entry.Param >> 16) & 0x1F;
+            MatAmbient[1] = (entry.Param >> 21) & 0x1F;
+            MatAmbient[2] = (entry.Param >> 26) & 0x1F;
+            if (entry.Param & 0x8000)
             {
                 VertexColor[0] = MatDiffuse[0];
                 VertexColor[1] = MatDiffuse[1];
@@ -2278,23 +2095,25 @@ void ExecuteCommand()
             break;
 
         case 0x31: // specular/emission material
-            MatSpecular[0] = ExecParams[0] & 0x1F;
-            MatSpecular[1] = (ExecParams[0] >> 5) & 0x1F;
-            MatSpecular[2] = (ExecParams[0] >> 10) & 0x1F;
-            MatEmission[0] = (ExecParams[0] >> 16) & 0x1F;
-            MatEmission[1] = (ExecParams[0] >> 21) & 0x1F;
-            MatEmission[2] = (ExecParams[0] >> 26) & 0x1F;
-            UseShininessTable = (ExecParams[0] & 0x8000) != 0;
+            VertexPipelineCmdDelayed6();
+            MatSpecular[0] = entry.Param & 0x1F;
+            MatSpecular[1] = (entry.Param >> 5) & 0x1F;
+            MatSpecular[2] = (entry.Param >> 10) & 0x1F;
+            MatEmission[0] = (entry.Param >> 16) & 0x1F;
+            MatEmission[1] = (entry.Param >> 21) & 0x1F;
+            MatEmission[2] = (entry.Param >> 26) & 0x1F;
+            UseShininessTable = (entry.Param & 0x8000) != 0;
             AddCycles(3);
             break;
 
         case 0x32: // light direction
+            StallPolygonPipeline(8 + 1,  2); // 0x32 can run 6 cycles after a vertex
             {
-                u32 l = ExecParams[0] >> 30;
+                u32 l = entry.Param >> 30;
                 s16 dir[3];
-                dir[0] = (s16)((ExecParams[0] & 0x000003FF) << 6) >> 6;
-                dir[1] = (s16)((ExecParams[0] & 0x000FFC00) >> 4) >> 6;
-                dir[2] = (s16)((ExecParams[0] & 0x3FF00000) >> 14) >> 6;
+                dir[0] = (s16)((entry.Param & 0x000003FF) << 6) >> 6;
+                dir[1] = (s16)((entry.Param & 0x000FFC00) >> 4) >> 6;
+                dir[2] = (s16)((entry.Param & 0x3FF00000) >> 14) >> 6;
                 LightDirection[l][0] = (dir[0]*VecMatrix[0] + dir[1]*VecMatrix[4] + dir[2]*VecMatrix[8]) >> 12;
                 LightDirection[l][1] = (dir[0]*VecMatrix[1] + dir[1]*VecMatrix[5] + dir[2]*VecMatrix[9]) >> 12;
                 LightDirection[l][2] = (dir[0]*VecMatrix[2] + dir[1]*VecMatrix[6] + dir[2]*VecMatrix[10]) >> 12;
@@ -2303,32 +2122,21 @@ void ExecuteCommand()
             break;
 
         case 0x33: // light color
+            VertexPipelineCmdDelayed8();
             {
-                u32 l = ExecParams[0] >> 30;
-                LightColor[l][0] = ExecParams[0] & 0x1F;
-                LightColor[l][1] = (ExecParams[0] >> 5) & 0x1F;
-                LightColor[l][2] = (ExecParams[0] >> 10) & 0x1F;
+                u32 l = entry.Param >> 30;
+                LightColor[l][0] = entry.Param & 0x1F;
+                LightColor[l][1] = (entry.Param >> 5) & 0x1F;
+                LightColor[l][2] = (entry.Param >> 10) & 0x1F;
             }
             AddCycles(1);
             break;
 
-        case 0x34: // shininess table
-            {
-                for (int i = 0; i < 128; i += 4)
-                {
-                    u32 val = ExecParams[i >> 2];
-                    ShininessTable[i + 0] = val & 0xFF;
-                    ShininessTable[i + 1] = (val >> 8) & 0xFF;
-                    ShininessTable[i + 2] = (val >> 16) & 0xFF;
-                    ShininessTable[i + 3] = val >> 24;
-                }
-            }
-            break;
-
         case 0x40: // begin polygons
+            StallPolygonPipeline(1, 0);
             // TODO: check if there was a polygon being defined but incomplete
             // such cases seem to freeze the GPU
-            PolygonMode = ExecParams[0] & 0x3;
+            PolygonMode = entry.Param & 0x3;
             VertexNum = 0;
             VertexNumInPoly = 0;
             NumConsecutivePolygons = 0;
@@ -2337,6 +2145,7 @@ void ExecuteCommand()
             break;
 
         case 0x41: // end polygons
+            VertexPipelineCmdDelayed8();
             // TODO: research this?
             // it doesn't seem to have any effect whatsoever, but
             // its timing characteristics are different from those of other
@@ -2344,8 +2153,9 @@ void ExecuteCommand()
             break;
 
         case 0x50: // flush
+            VertexPipelineCmdDelayed4();
             FlushRequest = 1;
-            FlushAttributes = ExecParams[0] & 0x3;
+            FlushAttributes = entry.Param & 0x3;
             CycleCount = 325;
             // probably safe to just reset all pipelines
             // but needs checked
@@ -2355,38 +2165,273 @@ void ExecuteCommand()
             VertexSlotCounter = 0;
             VertexSlotsFree = 1;
             break;
-
+        
         case 0x60: // viewport x1,y1,x2,y2
+            VertexPipelineCmdDelayed8();
             // note: viewport Y coordinates are upside-down
-            Viewport[0] = ExecParams[0] & 0xFF;                             // x0
-            Viewport[1] = (191 - ((ExecParams[0] >> 8) & 0xFF)) & 0xFF;     // y0
-            Viewport[2] = (ExecParams[0] >> 16) & 0xFF;                     // x1
-            Viewport[3] = (191 - (ExecParams[0] >> 24)) & 0xFF;             // y1
+            Viewport[0] = entry.Param & 0xFF;                             // x0
+            Viewport[1] = (191 - ((entry.Param >> 8) & 0xFF)) & 0xFF;     // y0
+            Viewport[2] = (entry.Param >> 16) & 0xFF;                     // x1
+            Viewport[3] = (191 - (entry.Param >> 24)) & 0xFF;             // y1
             Viewport[4] = (Viewport[2] - Viewport[0] + 1) & 0x1FF;          // width
             Viewport[5] = (Viewport[1] - Viewport[3] + 1) & 0xFF;           // height
             break;
 
-        case 0x70: // box test
-            NumTestCommands -= 3;
-            BoxTest(ExecParams);
+        case 0x72: // vec test
+            // 0x72 takes a single parameter, so it must be handled on this fast path
+            VertexPipelineCmdDelayed6();
+            NumTestCommands--;
+            ExecParams[0] = entry.Param;
+            VecTest(ExecParams);
+            break;
+
+        default:
+            VertexPipelineCmdDelayed4();
+            //printf("!! UNKNOWN GX COMMAND %02X %08X\n", entry.Command, entry.Param);
             break;
+        }
+    }
+    else
+    {
+        ExecParams[ExecParamCount] = entry.Param;
+        ExecParamCount++;
 
-        case 0x71: // pos test
-            NumTestCommands -= 2;
-            CurVertex[0] = ExecParams[0] & 0xFFFF;
-            CurVertex[1] = ExecParams[0] >> 16;
-            CurVertex[2] = ExecParams[1] & 0xFFFF;
-            PosTest();
-            break;
+        if (ExecParamCount == 1)
+        {
+            // delay the first command entry as needed
+            switch (entry.Command)
+            {
+            // 0x70 stalls the polygon pipeline; the other commands use the VertexPipeline* delay helpers
+            case 0x23: VertexPipelineSubmitCmd(); break;
+            case 0x34:
+            case 0x71:
+                VertexPipelineCmdDelayed8();
+                break;
+            case 0x70: StallPolygonPipeline(10 + 1, 0); break;
+            case 0x72: VertexPipelineCmdDelayed6(); break;
+            default: VertexPipelineCmdDelayed4(); break;
+            }
+        }
+        else
+        {
+            AddCycles(1);
 
-        case 0x72: // vec test
-            NumTestCommands--;
-            VecTest(ExecParams);
-            break;
+            if (ExecParamCount >= paramsRequiredCount)
+            {
+                /*printf("[GXS:%08X] 0x%02X,  ", GXStat, entry.Command);
+                for (int k = 0; k < ExecParamCount; k++) printf("0x%08X, ", ExecParams[k]);
+                printf("\n");*/
 
-        default:
-            //printf("!! UNKNOWN GX COMMAND %02X %08X\n", entry.Command, entry.Param);
-            break;
+                ExecParamCount = 0;
+
+                switch (entry.Command)
+                {
+                case 0x16: // load 4x4
+                    if (MatrixMode == 0)
+                    {
+                        MatrixLoad4x4(ProjMatrix, (s32*)ExecParams);
+                        ClipMatrixDirty = true;
+                        AddCycles(18);
+                    }
+                    else if (MatrixMode == 3)
+                    {
+                        MatrixLoad4x4(TexMatrix, (s32*)ExecParams);
+                        AddCycles(10);
+                    }
+                    else
+                    {
+                        MatrixLoad4x4(PosMatrix, (s32*)ExecParams);
+                        if (MatrixMode == 2)
+                            MatrixLoad4x4(VecMatrix, (s32*)ExecParams);
+                        ClipMatrixDirty = true;
+                        AddCycles(18);
+                    }
+                    break;
+
+                case 0x17: // load 4x3
+                    if (MatrixMode == 0)
+                    {
+                        MatrixLoad4x3(ProjMatrix, (s32*)ExecParams);
+                        ClipMatrixDirty = true;
+                        AddCycles(18);
+                    }
+                    else if (MatrixMode == 3)
+                    {
+                        MatrixLoad4x3(TexMatrix, (s32*)ExecParams);
+                        AddCycles(7);
+                    }
+                    else
+                    {
+                        MatrixLoad4x3(PosMatrix, (s32*)ExecParams);
+                        if (MatrixMode == 2)
+                            MatrixLoad4x3(VecMatrix, (s32*)ExecParams);
+                        ClipMatrixDirty = true;
+                        AddCycles(18);
+                    }
+                    break;
+
+                case 0x18: // mult 4x4
+                    if (MatrixMode == 0)
+                    {
+                        MatrixMult4x4(ProjMatrix, (s32*)ExecParams);
+                        ClipMatrixDirty = true;
+                        AddCycles(35 - 16);
+                    }
+                    else if (MatrixMode == 3)
+                    {
+                        MatrixMult4x4(TexMatrix, (s32*)ExecParams);
+                        AddCycles(33 - 16);
+                    }
+                    else
+                    {
+                        MatrixMult4x4(PosMatrix, (s32*)ExecParams);
+                        if (MatrixMode == 2)
+                        {
+                            MatrixMult4x4(VecMatrix, (s32*)ExecParams);
+                            AddCycles(35 + 30 - 16);
+                        }
+                        else AddCycles(35 - 16);
+                        ClipMatrixDirty = true;
+                    }
+                    break;
+
+                case 0x19: // mult 4x3
+                    if (MatrixMode == 0)
+                    {
+                        MatrixMult4x3(ProjMatrix, (s32*)ExecParams);
+                        ClipMatrixDirty = true;
+                        AddCycles(35 - 12);
+                    }
+                    else if (MatrixMode == 3)
+                    {
+                        MatrixMult4x3(TexMatrix, (s32*)ExecParams);
+                        AddCycles(33 - 12);
+                    }
+                    else
+                    {
+                        MatrixMult4x3(PosMatrix, (s32*)ExecParams);
+                        if (MatrixMode == 2)
+                        {
+                            MatrixMult4x3(VecMatrix, (s32*)ExecParams);
+                            AddCycles(35 + 30 - 12);
+                        }
+                        else AddCycles(35 - 12);
+                        ClipMatrixDirty = true;
+                    }
+                    break;
+
+                case 0x1A: // mult 3x3
+                    if (MatrixMode == 0)
+                    {
+                        MatrixMult3x3(ProjMatrix, (s32*)ExecParams);
+                        ClipMatrixDirty = true;
+                        AddCycles(35 - 9);
+                    }
+                    else if (MatrixMode == 3)
+                    {
+                        MatrixMult3x3(TexMatrix, (s32*)ExecParams);
+                        AddCycles(33 - 9);
+                    }
+                    else
+                    {
+                        MatrixMult3x3(PosMatrix, (s32*)ExecParams);
+                        if (MatrixMode == 2)
+                        {
+                            MatrixMult3x3(VecMatrix, (s32*)ExecParams);
+                            AddCycles(35 + 30 - 9);
+                        }
+                        else AddCycles(35 - 9);
+                        ClipMatrixDirty = true;
+                    }
+                    break;
+
+                case 0x1B: // scale
+                    if (MatrixMode == 0)
+                    {
+                        MatrixScale(ProjMatrix, (s32*)ExecParams);
+                        ClipMatrixDirty = true;
+                        AddCycles(35 - 3);
+                    }
+                    else if (MatrixMode == 3)
+                    {
+                        MatrixScale(TexMatrix, (s32*)ExecParams);
+                        AddCycles(33 - 3);
+                    }
+                    else
+                    {
+                        MatrixScale(PosMatrix, (s32*)ExecParams);
+                        ClipMatrixDirty = true;
+                        AddCycles(35 - 3);
+                    }
+                    break;
+
+                case 0x1C: // translate
+                    if (MatrixMode == 0)
+                    {
+                        MatrixTranslate(ProjMatrix, (s32*)ExecParams);
+                        ClipMatrixDirty = true;
+                        AddCycles(35 - 3);
+                    }
+                    else if (MatrixMode == 3)
+                    {
+                        MatrixTranslate(TexMatrix, (s32*)ExecParams);
+                        AddCycles(33 - 3);
+                    }
+                    else
+                    {
+                        MatrixTranslate(PosMatrix, (s32*)ExecParams);
+                        if (MatrixMode == 2)
+                        {
+                            MatrixTranslate(VecMatrix, (s32*)ExecParams);
+                            AddCycles(35 + 30 - 3);
+                        }
+                        else AddCycles(35 - 3);
+                        ClipMatrixDirty = true;
+                    }
+                    break;
+
+                case 0x23: // full vertex
+                    CurVertex[0] = ExecParams[0] & 0xFFFF;
+                    CurVertex[1] = ExecParams[0] >> 16;
+                    CurVertex[2] = ExecParams[1] & 0xFFFF;
+                    SubmitVertex();
+                    break;
+
+                case 0x34: // shininess table
+                    {
+                        for (int i = 0; i < 128; i += 4)
+                        {
+                            u32 val = ExecParams[i >> 2];
+                            ShininessTable[i + 0] = val & 0xFF;
+                            ShininessTable[i + 1] = (val >> 8) & 0xFF;
+                            ShininessTable[i + 2] = (val >> 16) & 0xFF;
+                            ShininessTable[i + 3] = val >> 24;
+                        }
+                    }
+                    break;
+
+                case 0x71: // pos test
+                    NumTestCommands -= 2;
+                    CurVertex[0] = ExecParams[0] & 0xFFFF;
+                    CurVertex[1] = ExecParams[0] >> 16;
+                    CurVertex[2] = ExecParams[1] & 0xFFFF;
+                    PosTest();
+                    break;
+                
+                case 0x70: // box test
+                    NumTestCommands -= 3;
+                    BoxTest(ExecParams);
+                    break;
+
+                case 0x72: // vec test
+                    NumTestCommands--;
+                    VecTest(ExecParams);
+                    break;
+
+                default:
+                    __builtin_unreachable();
+                }
+            }
         }
     }
 }
-- 
cgit v1.2.3