aboutsummaryrefslogtreecommitdiff
path: root/src/DMA.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/DMA.cpp')
-rw-r--r--src/DMA.cpp241
1 files changed, 159 insertions, 82 deletions
diff --git a/src/DMA.cpp b/src/DMA.cpp
index f0c22b5..7bbf980 100644
--- a/src/DMA.cpp
+++ b/src/DMA.cpp
@@ -26,6 +26,25 @@
// NOTES ON DMA SHIT
//
// * could use optimized code paths for common types of DMA transfers. for example, VRAM
+// have to profile it to see if it's actually worth doing
+
+
+// DMA TIMINGS
+//
+// sequential timing:
+// * 1 cycle per read or write
+// * in 32bit mode, accessing a 16bit bus (mainRAM, palette, VRAM) incurs 1 cycle of penalty
+// * in 32bit mode, transferring from mainRAM to another bank is 1 cycle faster
+// * if source and destination are the same memory bank, there is a 1 cycle penalty
+// * transferring from mainRAM to mainRAM is a trainwreck (all accesses are made nonsequential)
+//
+// nonsequential timing:
+// * nonseq penalty is applied to the first read and write
+// * I also figure it gets nonseq penalty again when resuming, after having been interrupted by
+// another DMA (TODO: check)
+// * applied to all accesses for mainRAM->mainRAM, resulting in timings of 16-18 cycles per unit
+//
+// TODO: GBA slot
DMA::DMA(u32 cpu, u32 num)
@@ -38,61 +57,6 @@ DMA::DMA(u32 cpu, u32 num)
else
CountMask = (num==3 ? 0x0000FFFF : 0x00003FFF);
- // TODO: merge with the one in ARM.cpp, somewhere
- for (int i = 0; i < 16; i++)
- {
- Waitstates[0][i] = 1;
- Waitstates[1][i] = 1;
- }
-
- if (!cpu)
- {
- // ARM9
- // note: 33MHz cycles
- Waitstates[0][0x2] = 1;
- Waitstates[0][0x3] = 1;
- Waitstates[0][0x4] = 1;
- Waitstates[0][0x5] = 1;
- Waitstates[0][0x6] = 1;
- Waitstates[0][0x7] = 1;
- Waitstates[0][0x8] = 6;
- Waitstates[0][0x9] = 6;
- Waitstates[0][0xA] = 10;
- Waitstates[0][0xF] = 1;
-
- Waitstates[1][0x2] = 2;
- Waitstates[1][0x3] = 1;
- Waitstates[1][0x4] = 1;
- Waitstates[1][0x5] = 2;
- Waitstates[1][0x6] = 2;
- Waitstates[1][0x7] = 1;
- Waitstates[1][0x8] = 12;
- Waitstates[1][0x9] = 12;
- Waitstates[1][0xA] = 10;
- Waitstates[1][0xF] = 1;
- }
- else
- {
- // ARM7
- Waitstates[0][0x0] = 1;
- Waitstates[0][0x2] = 1;
- Waitstates[0][0x3] = 1;
- Waitstates[0][0x4] = 1;
- Waitstates[0][0x6] = 1;
- Waitstates[0][0x8] = 6;
- Waitstates[0][0x9] = 6;
- Waitstates[0][0xA] = 10;
-
- Waitstates[1][0x0] = 1;
- Waitstates[1][0x2] = 2;
- Waitstates[1][0x3] = 1;
- Waitstates[1][0x4] = 1;
- Waitstates[1][0x6] = 2;
- Waitstates[1][0x8] = 12;
- Waitstates[1][0x9] = 12;
- Waitstates[1][0xA] = 10;
- }
-
Reset();
}
@@ -136,7 +100,7 @@ void DMA::DoSavestate(Savestate* file)
file->Var32(&SrcAddrInc);
file->Var32(&DstAddrInc);
- file->Var32((u32*)&Running);
+ file->Var32(&Running);
file->Var32((u32*)&InProgress);
file->Var32((u32*)&IsGXFIFODMA);
}
@@ -213,7 +177,11 @@ void DMA::Start()
// TODO eventually: not stop if we're running code in ITCM
- Running = true;
+ if (NDS::DMAsRunning(CPU))
+ Running = 1;
+ else
+ Running = 2;
+
InProgress = true;
NDS::StopCPU(CPU, 1<<Num);
}
@@ -223,73 +191,177 @@ s32 DMA::Run(s32 cycles)
if (!Running)
return cycles;
+#ifdef DEBUG_CHECK_DESYNC
+ s32 startc = cycles;
+#endif // DEBUG_CHECK_DESYNC
+
+ Executing = true;
+
+ // add NS penalty for first accesses in burst
+ // note: this seems to only apply when starting DMA 'in the void'
+ // for example, the aging cart DMA PRIORITY test:
+ // starts a big DMA immediately, and a small DMA upon HBlank
+ // each pulling from a timer incrementing once per cycle
+ // it expects that the values be increasing linearly (2c/unit)
+ // even as the small DMA starts and ends
+ bool burststart = (Running == 2);
+ Running = 1;
+
+ s32 unitcycles;
+ s32 lastcycles = cycles;
+
if (!(Cnt & 0x04000000))
{
+ if (CPU == 0)
+ {
+ if ((CurSrcAddr >> 24) == 0x02 && (CurDstAddr >> 24) == 0x02)
+ {
+ unitcycles = NDS::ARM9MemTimings[CurSrcAddr >> 14][0] + NDS::ARM9MemTimings[CurDstAddr >> 14][0];
+ }
+ else
+ {
+ unitcycles = NDS::ARM9MemTimings[CurSrcAddr >> 14][1] + NDS::ARM9MemTimings[CurDstAddr >> 14][1];
+ if ((CurSrcAddr >> 24) == (CurDstAddr >> 24))
+ unitcycles++;
+
+ if (burststart)
+ {
+ cycles -= 2;
+ cycles -= (NDS::ARM9MemTimings[CurSrcAddr >> 14][0] + NDS::ARM9MemTimings[CurDstAddr >> 14][0]);
+ cycles += unitcycles;
+ }
+ }
+ }
+ else
+ {
+ if ((CurSrcAddr >> 24) == 0x02 && (CurDstAddr >> 24) == 0x02)
+ {
+ unitcycles = NDS::ARM7MemTimings[CurSrcAddr >> 15][0] + NDS::ARM7MemTimings[CurDstAddr >> 15][0];
+ }
+ else
+ {
+ unitcycles = NDS::ARM7MemTimings[CurSrcAddr >> 15][1] + NDS::ARM7MemTimings[CurDstAddr >> 15][1];
+ if ((CurSrcAddr >> 23) == (CurDstAddr >> 23))
+ unitcycles++;
+
+ if (burststart)
+ {
+ cycles -= 2;
+ cycles -= (NDS::ARM7MemTimings[CurSrcAddr >> 15][0] + NDS::ARM7MemTimings[CurDstAddr >> 15][0]);
+ cycles += unitcycles;
+ }
+ }
+ }
+
u16 (*readfn)(u32) = CPU ? NDS::ARM7Read16 : NDS::ARM9Read16;
void (*writefn)(u32,u16) = CPU ? NDS::ARM7Write16 : NDS::ARM9Write16;
- while (IterCount > 0 && cycles > 0)
+ while (IterCount > 0 && !Stall)
{
- writefn(CurDstAddr, readfn(CurSrcAddr));
+ cycles -= unitcycles;
+
+ NDS::RunTightTimers(CPU, lastcycles-cycles);
- s32 c = (Waitstates[0][(CurSrcAddr >> 24) & 0xF] + Waitstates[0][(CurDstAddr >> 24) & 0xF]);
- cycles -= c;
- NDS::RunTimingCriticalDevices(CPU, c);
+ lastcycles = cycles;
+
+ writefn(CurDstAddr, readfn(CurSrcAddr));
CurSrcAddr += SrcAddrInc<<1;
CurDstAddr += DstAddrInc<<1;
IterCount--;
RemCount--;
+
+ if (cycles <= 0) break;
}
}
else
{
- // optimized path for typical GXFIFO DMA
- if (IsGXFIFODMA)
+ if (CPU == 0)
{
- while (IterCount > 0 && cycles > 0)
+ if ((CurSrcAddr >> 24) == 0x02 && (CurDstAddr >> 24) == 0x02)
{
- GPU3D::WriteToGXFIFO(*(u32*)&NDS::MainRAM[CurSrcAddr&0x3FFFFF]);
-
- s32 c = (Waitstates[1][0x2] + Waitstates[1][0x4]);
- cycles -= c;
- NDS::RunTimingCriticalDevices(0, c);
-
- CurSrcAddr += SrcAddrInc<<2;
- IterCount--;
- RemCount--;
+ unitcycles = NDS::ARM9MemTimings[CurSrcAddr >> 14][2] + NDS::ARM9MemTimings[CurDstAddr >> 14][2];
+ }
+ else
+ {
+ unitcycles = NDS::ARM9MemTimings[CurSrcAddr >> 14][3] + NDS::ARM9MemTimings[CurDstAddr >> 14][3];
+ if ((CurSrcAddr >> 24) == (CurDstAddr >> 24))
+ unitcycles++;
+ else if ((CurSrcAddr >> 24) == 0x02)
+ unitcycles--;
+
+ if (burststart)
+ {
+ cycles -= 2;
+ cycles -= (NDS::ARM9MemTimings[CurSrcAddr >> 14][2] + NDS::ARM9MemTimings[CurDstAddr >> 14][2]);
+ cycles += unitcycles;
+ }
+ }
+ }
+ else
+ {
+ if ((CurSrcAddr >> 24) == 0x02 && (CurDstAddr >> 24) == 0x02)
+ {
+ unitcycles = NDS::ARM7MemTimings[CurSrcAddr >> 15][2] + NDS::ARM7MemTimings[CurDstAddr >> 15][2];
+ }
+ else
+ {
+ unitcycles = NDS::ARM7MemTimings[CurSrcAddr >> 15][3] + NDS::ARM7MemTimings[CurDstAddr >> 15][3];
+ if ((CurSrcAddr >> 23) == (CurDstAddr >> 23))
+ unitcycles++;
+ else if ((CurSrcAddr >> 24) == 0x02)
+ unitcycles--;
+
+ if (burststart)
+ {
+ cycles -= 2;
+ cycles -= (NDS::ARM7MemTimings[CurSrcAddr >> 15][2] + NDS::ARM7MemTimings[CurDstAddr >> 15][2]);
+ cycles += unitcycles;
+ }
}
}
u32 (*readfn)(u32) = CPU ? NDS::ARM7Read32 : NDS::ARM9Read32;
void (*writefn)(u32,u32) = CPU ? NDS::ARM7Write32 : NDS::ARM9Write32;
- while (IterCount > 0 && cycles > 0)
+ while (IterCount > 0 && !Stall)
{
- writefn(CurDstAddr, readfn(CurSrcAddr));
+ cycles -= unitcycles;
+
+ NDS::RunTightTimers(CPU, lastcycles-cycles);
- s32 c = (Waitstates[1][(CurSrcAddr >> 24) & 0xF] + Waitstates[1][(CurDstAddr >> 24) & 0xF]);
- cycles -= c;
- NDS::RunTimingCriticalDevices(CPU, c);
+ lastcycles = cycles;
+
+ writefn(CurDstAddr, readfn(CurSrcAddr));
CurSrcAddr += SrcAddrInc<<2;
CurDstAddr += DstAddrInc<<2;
IterCount--;
RemCount--;
+
+ if (cycles <= 0) break;
}
}
+ Executing = false;
+ Stall = false;
+
if (RemCount)
{
if (IterCount == 0)
{
- Running = false;
+ Running = 0;
NDS::ResumeCPU(CPU, 1<<Num);
if (StartMode == 0x07)
GPU3D::CheckFIFODMA();
}
+#ifdef DEBUG_CHECK_DESYNC
+ if (CPU) NDS::dbg_CyclesARM7 += (startc-cycles);
+ else NDS::dbg_CyclesARM9 += (startc-cycles);
+#endif // DEBUG_CHECK_DESYNC
+
return cycles;
}
@@ -299,9 +371,14 @@ s32 DMA::Run(s32 cycles)
if (Cnt & 0x40000000)
NDS::SetIRQ(CPU, NDS::IRQ_DMA0 + Num);
- Running = false;
+ Running = 0;
InProgress = false;
NDS::ResumeCPU(CPU, 1<<Num);
- return cycles - 2;
+#ifdef DEBUG_CHECK_DESYNC
+ if (CPU) NDS::dbg_CyclesARM7 += (startc-cycles);
+ else NDS::dbg_CyclesARM9 += (startc-cycles);
+#endif // DEBUG_CHECK_DESYNC
+
+ return cycles;
}