1 files changed, 516 insertions, 0 deletions
diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h
new file mode 100644
index 0000000..851b7c1
--- /dev/null
+++ b/src/GPU3D_Soft.h
@@ -0,0 +1,516 @@
+/*
+    Copyright 2016-2020 Arisotura
+
+    This file is part of melonDS.
+
+    melonDS is free software: you can redistribute it and/or modify it under
+    the terms of the GNU General Public License as published by the Free
+    Software Foundation, either version 3 of the License, or (at your option)
+    any later version.
+
+    melonDS is distributed in the hope that it will be useful, but WITHOUT ANY
+    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+    FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with melonDS. If not, see http://www.gnu.org/licenses/.
+*/
+
+#pragma once
+
+#include "GPU3D.h"
+#include "Platform.h"
+#include <thread>
+
+namespace GPU3D
+{
+class SoftRenderer : public Renderer3D
+{
+public:
+    SoftRenderer();
+    ~SoftRenderer() override = default; // stays virtual: it overrides the Renderer3D destructor
+    bool Init() override;
+    void DeInit() override;
+    void Reset() override;
+
+    void SetRenderSettings(GPU::RenderSettings& settings) override;
+
+    void VCount144() override;
+    void RenderFrame() override;
+    void RestartFrame() override;
+    u32* GetLine(int line) override;
+
+    void SetupRenderThread(); // not part of the Renderer3D overrides (no 'override')
+    void StopRenderThread();  // not part of the Renderer3D overrides (no 'override')
+private:
+    // Notes on the interpolator:
+    //
+    // This is a theory on how the DS hardware interpolates values. It matches hardware output
+    // in the tests I did, but the hardware may be doing it differently. You never know.
+    //
+    // Assuming you want to perspective-correctly interpolate a variable named A across two points
+    // in a typical rasterizer, you would calculate A/W and 1/W at each point, interpolate linearly,
+    // then divide A/W by 1/W to recover the correct A value.
+    //
+    // The DS GPU approximates interpolation by calculating a perspective-correct interpolation
+    // between 0 and 1, then using the result as a factor to linearly interpolate the actual
+    // vertex attributes. The factor has 9 bits of precision when interpolating along Y and
+    // 8 bits along X.
+    //
+    // There's a special path for when the two W values are equal: it directly does linear
+    // interpolation, avoiding precision loss from the aforementioned approximation.
+    // Which is desirable when using the GPU to draw 2D graphics.
+
+    template<int dir> // dir == 0: interpolation along X; dir != 0: along Y
+    class Interpolator
+    {
+    public:
+        Interpolator() {} // leaves every member uninitialized; call Setup() before use
+        Interpolator(s32 x0, s32 x1, s32 w0, s32 w1)
+        {
+            Setup(x0, x1, w0, w1);
+        }
+        // Prepares interpolation between positions x0 and x1, whose W values are w0 and w1.
+        void Setup(s32 x0, s32 x1, s32 w0, s32 w1)
+        {
+            this->x0 = x0;
+            this->x1 = x1;
+            this->xdiff = x1 - x0;
+
+            // calculate reciprocals for linear mode and Z interpolation
+            // TODO eventually: use a faster reciprocal function?
+            if (this->xdiff != 0)
+                this->xrecip = (1<<30) / this->xdiff;
+            else
+                this->xrecip = 0;
+            this->xrecip_z = this->xrecip >> 8;
+
+            // linear mode is used if both W values are equal and have
+            // low-order bits cleared (0-6 along X, 1-6 along Y)
+            u32 mask = dir ? 0x7E : 0x7F;
+            if ((w0 == w1) && !(w0 & mask) && !(w1 & mask))
+                this->linear = true;
+            else
+                this->linear = false;
+
+            if (dir)
+            {
+                // along Y
+
+                if ((w0 & 0x1) && !(w1 & 0x1))
+                {
+                    this->w0n = w0 - 1;
+                    this->w0d = w0 + 1;
+                    this->w1d = w1;
+                }
+                else
+                {
+                    this->w0n = w0 & 0xFFFE;
+                    this->w0d = w0 & 0xFFFE;
+                    this->w1d = w1 & 0xFFFE;
+                }
+
+                this->shift = 9; // 9-bit interpolation factor along Y
+            }
+            else
+            {
+                // along X
+
+                this->w0n = w0;
+                this->w0d = w0;
+                this->w1d = w1;
+
+                this->shift = 8; // 8-bit interpolation factor along X
+            }
+        }
+        // Sets the current position (stored relative to x0); recomputes yfactor unless linear or xdiff == 0.
+        void SetX(s32 x)
+        {
+            x -= x0;
+            this->x = x;
+            if (xdiff != 0 && !linear)
+            {
+                s64 num = ((s64)x * w0n) << shift;
+                s32 den = (x * w0d) + ((xdiff-x) * w1d);
+
+                // this seems to be a proper division on hardware :/
+                // I haven't been able to find cases that produce imperfect output
+                if (den == 0) yfactor = 0;
+                else          yfactor = (s32)(num / den);
+            }
+        }
+        // Interpolates an attribute worth y0 at x0 and y1 at x1, at the position given to SetX().
+        s32 Interpolate(s32 y0, s32 y1)
+        {
+            if (xdiff == 0 || y0 == y1) return y0;
+
+            if (!linear)
+            {
+                // perspective-correct approx. interpolation
+                if (y0 < y1)
+                    return y0 + (((y1-y0) * yfactor) >> shift);
+                else
+                    return y1 + (((y0-y1) * ((1<<shift)-yfactor)) >> shift);
+            }
+            else
+            {
+                // linear interpolation
+                // checkme: the rounding bias there (3<<24) is a guess
+                if (y0 < y1)
+                    return y0 + ((((s64)(y1-y0) * x * xrecip) + (3<<24)) >> 30);
+                else
+                    return y1 + ((((s64)(y0-y1) * (xdiff-x) * xrecip) + (3<<24)) >> 30);
+            }
+        }
+        // Interpolates depth: uses yfactor when wbuffer is set, otherwise a linear approximation.
+        s32 InterpolateZ(s32 z0, s32 z1, bool wbuffer)
+        {
+            if (xdiff == 0 || z0 == z1) return z0;
+
+            if (wbuffer) // NOTE(review): SetX() does not refresh yfactor in linear mode - confirm this path is not reached then
+            {
+                // W-buffering: perspective-correct approx. interpolation
+                if (z0 < z1)
+                    return z0 + (((s64)(z1-z0) * yfactor) >> shift);
+                else
+                    return z1 + (((s64)(z0-z1) * ((1<<shift)-yfactor)) >> shift);
+            }
+            else
+            {
+                // Z-buffering: linear interpolation
+                // still doesn't quite match hardware...
+                s32 base, disp, factor;
+
+                if (z0 < z1)
+                {
+                    base = z0;
+                    disp = z1 - z0;
+                    factor = x;
+                }
+                else
+                {
+                    base = z1;
+                    disp = z0 - z1;
+                    factor = xdiff - x;
+                }
+
+                if (dir)
+                {
+                    int zshift = 0; // local pre-shift count; named so it no longer shadows the member 'shift'
+                    while (disp > 0x3FF)
+                    {
+                        disp >>= 1;
+                        zshift++;
+                    }
+
+                    return base + ((((s64)disp * factor * xrecip_z) >> 22) << zshift);
+                }
+                else
+                {
+                    disp >>= 9;
+                    return base + (((s64)disp * factor * xrecip_z) >> 13);
+                }
+            }
+        }
+
+    private:
+        s32 x0, x1, xdiff, x; // x is the current position relative to x0 (see SetX)
+
+        int shift;   // factor precision in bits: 9 along Y, 8 along X
+        bool linear; // true when Setup() selected the plain linear path
+
+        s32 xrecip, xrecip_z; // (1<<30)/xdiff and that value >> 8; both 0 when xdiff == 0
+        s32 w0n, w0d, w1d;    // adjusted W values used as numerator/denominator terms in SetX()
+
+        u32 yfactor; // perspective-correct factor with 'shift' fractional bits; only written by SetX()
+    };
+
+
+    template<int side> // side == 0: left edge; side != 0: right edge
+    class Slope
+    {
+    public:
+        Slope() {} // leaves every member uninitialized; call Setup() or SetupDummy() before use
+        // Degenerate edge pinned at x0 (x0 - 1 for the right side); returns that pinned X.
+        s32 SetupDummy(s32 x0)
+        {
+            if (side)
+            {
+                dx = -0x40000;
+                x0--;
+            }
+            else
+            {
+                dx = 0;
+            }
+
+            this->x0 = x0;
+            this->xmin = x0;
+            this->xmax = x0;
+            this->y = 0; // fix: Step() increments y, so it needs a defined value on this path too
+
+            Increment = 0;
+            XMajor = false;
+            Negative = false; // fix: XVal() reads Negative, which was never assigned on this path
+
+            Interp.Setup(0, 0, 0, 0);
+            Interp.SetX(0);
+            xcov_incr = 0;
+            return x0;
+        }
+        // Sets up the edge from (x0,y0) to (x1,y1) with W values w0/w1 at scanline y; returns the X there.
+        s32 Setup(s32 x0, s32 x1, s32 y0, s32 y1, s32 w0, s32 w1, s32 y)
+        {
+            this->x0 = x0;
+            this->y = y;
+
+            if (x1 > x0)
+            {
+                this->xmin = x0;
+                this->xmax = x1-1;
+                this->Negative = false;
+            }
+            else if (x1 < x0)
+            {
+                this->xmin = x1;
+                this->xmax = x0-1;
+                this->Negative = true;
+            }
+            else
+            {
+                this->xmin = x0;
+                if (side) this->xmin--;
+                this->xmax = this->xmin;
+                this->Negative = false;
+            }
+
+            xlen = xmax+1 - xmin; // always >= 1, so the divisions by xlen below are safe
+            ylen = y1 - y0;
+
+            // slope increment has a 18-bit fractional part
+            // note: for some reason, x/y isn't calculated directly,
+            // instead, 1/y is calculated and then multiplied by x
+            // TODO: this is still not perfect (see for example x=169 y=33)
+            if (ylen == 0)
+                Increment = 0;
+            else if (ylen == xlen)
+                Increment = 0x40000;
+            else
+            {
+                s32 yrecip = (1<<18) / ylen;
+                Increment = (x1-x0) * yrecip;
+                if (Increment < 0) Increment = -Increment;
+            }
+
+            XMajor = (Increment > 0x40000);
+
+            if (side)
+            {
+                // right
+
+                if (XMajor)              dx = Negative ? (0x20000 + 0x40000) : (Increment - 0x20000);
+                else if (Increment != 0) dx = Negative ? 0x40000 : 0;
+                else                     dx = -0x40000;
+            }
+            else
+            {
+                // left
+
+                if (XMajor)              dx = Negative ? ((Increment - 0x20000) + 0x40000) : 0x20000;
+                else if (Increment != 0) dx = Negative ? 0x40000 : 0;
+                else                     dx = 0;
+            }
+
+            dx += (y - y0) * Increment; // advance from the edge's first scanline to the requested one
+
+            s32 x = XVal();
+
+            if (XMajor)
+            {
+                if (side) Interp.Setup(x0-1, x1-1, w0, w1); // checkme
+                else      Interp.Setup(x0, x1, w0, w1);
+                Interp.SetX(x);
+
+                // used for calculating AA coverage
+                xcov_incr = (ylen << 10) / xlen;
+            }
+            else
+            {
+                Interp.Setup(y0, y1, w0, w1);
+                Interp.SetX(y);
+            }
+
+            return x;
+        }
+        // Advances the edge by one scanline and returns the new clamped X.
+        s32 Step()
+        {
+            dx += Increment;
+            y++;
+
+            s32 x = XVal();
+            if (XMajor)
+            {
+                Interp.SetX(x);
+            }
+            else
+            {
+                Interp.SetX(y);
+            }
+            return x;
+        }
+        // Current X: x0 minus/plus the integer part of dx (18 fractional bits), clamped to [xmin, xmax].
+        s32 XVal()
+        {
+            s32 ret;
+            if (Negative) ret = x0 - (dx >> 18);
+            else          ret = x0 + (dx >> 18);
+
+            if (ret < xmin) ret = xmin;
+            else if (ret > xmax) ret = xmax;
+            return ret;
+        }
+        // X-major edge: writes the span length for this scanline and a packed coverage word.
+        void EdgeParams_XMajor(s32* length, s32* coverage)
+        {
+            if (side ^ Negative)
+                *length = (dx >> 18) - ((dx-Increment) >> 18);
+            else
+                *length = ((dx+Increment) >> 18) - (dx >> 18);
+
+            // for X-major edges, we return the coverage
+            // for the first pixel, and the increment for
+            // further pixels on the same scanline
+            s32 startx = dx >> 18;
+            if (Negative) startx = xlen - startx;
+            if (side)     startx = startx - *length + 1;
+
+            s32 startcov = (((startx << 10) + 0x1FF) * ylen) / xlen;
+            *coverage = (1<<31) | ((startcov & 0x3FF) << 12) | (xcov_incr & 0x3FF); // bit31 set, bits 12-21 start coverage, bits 0-9 increment
+        }
+        // Y-major edge: the length is always 1 and the coverage is a 5-bit value (0..31).
+        void EdgeParams_YMajor(s32* length, s32* coverage)
+        {
+            *length = 1;
+
+            if (Increment == 0)
+            {
+                *coverage = 31;
+            }
+            else
+            {
+                s32 cov = ((dx >> 9) + (Increment >> 10)) >> 4;
+                if ((cov >> 5) != (dx >> 18)) cov = 31;
+                cov &= 0x1F;
+                if (!(side ^ Negative)) cov = 0x1F - cov;
+
+                *coverage = cov;
+            }
+        }
+        // Fills length/coverage using the X-major or Y-major variant, depending on XMajor.
+        void EdgeParams(s32* length, s32* coverage)
+        {
+            if (XMajor)
+                return EdgeParams_XMajor(length, coverage);
+            else
+                return EdgeParams_YMajor(length, coverage);
+        }
+
+        s32 Increment; // absolute X step per scanline, 18 fractional bits (0x40000 == 1 pixel)
+        bool Negative; // true when the edge runs towards smaller X (x1 < x0)
+        bool XMajor;   // true when Increment > 0x40000
+        Interpolator<1> Interp; // set up over X for X-major edges, over Y otherwise (see Setup)
+
+    private:
+        s32 x0, xmin, xmax; // start X and the inclusive clamp range used by XVal()
+        s32 xlen, ylen;
+        s32 dx; // accumulated X displacement, 18 fractional bits
+        s32 y;  // current scanline, advanced by Step()
+
+        s32 xcov_incr; // AA coverage increment; only meaningful for X-major edges
+        s32 ycoverage, ycov_incr; // not referenced by any Slope method in this header
+    };
+
+    template <typename T> inline T ReadVRAM_Texture(u32 addr)
+    {
+        const u32 offset = addr & 0x7FFFF; // keep only the low 19 bits of the address
+        return *(T*)&GPU::VRAMFlat_Texture[offset]; // reads the storage at that index as a T
+    }
+    template <typename T> inline T ReadVRAM_TexPal(u32 addr)
+    {
+        const u32 offset = addr & 0x1FFFF; // keep only the low 17 bits of the address
+        return *(T*)&GPU::VRAMFlat_TexPal[offset]; // reads the storage at that index as a T
+    }
+
+    struct RendererPolygon // bundles a Polygon pointer with the edge state declared below
+    {
+        Polygon* PolyData; // the polygon this entry refers to
+
+        Slope<0> SlopeL; // left edge (side = 0)
+        Slope<1> SlopeR; // right edge (side = 1)
+        s32 XL, XR; // presumably the current left/right X produced by the slopes - confirm in the .cpp
+        u32 CurVL, CurVR; // presumably the current left/right vertex indices - confirm in the .cpp
+        u32 NextVL, NextVR; // presumably the next left/right vertex indices - confirm in the .cpp
+
+    };
+
+    RendererPolygon PolygonList[2048]; // fixed-size array of per-polygon state; 2048 is presumably the per-frame polygon cap - TODO confirm
+    void TextureLookup(u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha); // color/alpha look like out-parameters - confirm in the .cpp
+    u32 RenderPixel(Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t);
+    void PlotTranslucentPixel(u32 pixeladdr, u32 color, u32 z, u32 polyattr, u32 shadow);
+    void SetupPolygonLeftEdge(RendererPolygon* rp, s32 y);
+    void SetupPolygonRightEdge(RendererPolygon* rp, s32 y);
+    void SetupPolygon(RendererPolygon* rp, Polygon* polygon);
+    void RenderShadowMaskScanline(RendererPolygon* rp, s32 y);
+    void RenderPolygonScanline(RendererPolygon* rp, s32 y);
+    void RenderScanline(s32 y, int npolys);
+    u32 CalculateFogDensity(u32 pixeladdr);
+    void ScanlineFinalPass(s32 y);
+    void ClearBuffers();
+    void RenderPolygons(bool threaded, Polygon** polygons, int npolys);
+
+    void RenderThreadFunc(); // presumably the function executed by RenderThread - confirm in the .cpp
+
+    // buffer dimensions are 258x194 to add an offscreen 1px border
+    // which simplifies edge marking tests
+    // buffer is duplicated to keep track of the two topmost pixels
+    // TODO: check if the hardware can accidentally plot pixels
+    // offscreen in that border
+
+    static constexpr int ScanlineWidth = 258; // 256 pixels plus the 1px border on both sides
+    static constexpr int NumScanlines = 194; // 192 lines plus the 1px border above and below
+    static constexpr int BufferSize = ScanlineWidth * NumScanlines; // entries in one copy of a buffer
+    static constexpr int FirstPixelOffset = ScanlineWidth + 1; // skips the top border row and the left border pixel
+
+    u32 ColorBuffer[BufferSize * 2]; // x2: the duplicated second copy mentioned above
+    u32 DepthBuffer[BufferSize * 2]; // same layout as ColorBuffer
+    u32 AttrBuffer[BufferSize * 2]; // same layout as ColorBuffer; bit layout documented below
+
+    // attribute buffer:
+    // bit0-3: edge flags (left/right/top/bottom)
+    // bit4: backfacing flag
+    // bit8-12: antialiasing alpha
+    // bit15: fog enable
+    // bit16-21: polygon ID for translucent pixels
+    // bit22: translucent flag
+    // bit24-29: polygon ID for opaque pixels
+
+    u8 StencilBuffer[256*2]; // 2 x 256 entries; presumably two scanlines' worth - confirm in the .cpp
+    bool PrevIsShadowMask;
+
+    bool Enabled;
+
+    bool FrameIdentical;
+
+    // threading
+    // NOTE(review): the flags below are plain bools; if both threads touch them, confirm the semaphores order every access
+    bool Threaded;
+    // Platform::Thread* RenderThread; (disabled alternative to the std::thread member below)
+    std::thread RenderThread; // NOTE(review): must be joined or detached before destruction, otherwise std::terminate is called
+    bool RenderThreadRunning;
+    bool RenderThreadRendering;
+    Platform::Semaphore* Sema_RenderStart; // raw pointer; creation and release are not visible in this header
+    Platform::Semaphore* Sema_RenderDone; // raw pointer; creation and release are not visible in this header
+    Platform::Semaphore* Sema_ScanlineCount; // raw pointer; creation and release are not visible in this header
+};
+}
+\ No newline at end of file