aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorlonkaars <loek@pipeframe.xyz>2023-03-03 17:30:09 +0100
committerlonkaars <loek@pipeframe.xyz>2023-03-03 17:30:09 +0100
commitdf8902fba3a6e97ca3c5fdedb70999faac713815 (patch)
tree68cc7cd37724c971d78624be7317181afc0f8df5
parentd832b7f7e4747f443b550d78b78394dbf981c6cc (diff)
WIP fg sprite optimization
-rw-r--r--basys3/basys3.srcs/ppu.vhd23
-rw-r--r--basys3/basys3.srcs/ppu_consts.vhd2
-rw-r--r--basys3/basys3.srcs/ppu_pceg.vhd18
-rw-r--r--basys3/basys3.srcs/ppu_pceg_tb.vhd10
-rw-r--r--basys3/basys3.srcs/ppu_sprite_bg.vhd17
-rw-r--r--basys3/basys3.srcs/ppu_sprite_fg.vhd148
-rw-r--r--basys3/basys3.xpr23
-rw-r--r--docs/architecture.md42
8 files changed, 163 insertions, 120 deletions
diff --git a/basys3/basys3.srcs/ppu.vhd b/basys3/basys3.srcs/ppu.vhd
index 2425edc..c6dfe60 100644
--- a/basys3/basys3.srcs/ppu.vhd
+++ b/basys3/basys3.srcs/ppu.vhd
@@ -2,7 +2,6 @@ library ieee;
library work;
use ieee.std_logic_1164.all;
---use ieee.numeric_std.all;
use work.ppu_consts.all;
entity ppu is port(
@@ -22,8 +21,8 @@ architecture Behavioral of ppu is
CLK : in std_logic; -- system clock
RESET : in std_logic; -- async reset
SPRITE : out std_logic; -- sprite info fetch + sprite pixel fetch
- COMP_PAL : out std_logic; -- compositor + palette lookup
- DONE : out std_logic); -- last pipeline stage done
+ DONE : out std_logic; -- last pipeline stage done
+ READY : out std_logic); -- rgb buffer propagation ready
end component;
component ppu_addr_dec port( -- address decoder
WEN : in std_logic; -- EXT write enable
@@ -81,6 +80,7 @@ architecture Behavioral of ppu is
-- inputs
CLK : in std_logic; -- pipeline clock
RESET : in std_logic; -- reset clock counter
+ PL_RESET : in std_logic; -- reset pipeline clock counters
OE : in std_logic; -- output enable (of CIDX)
X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x
Y : in std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y
@@ -105,6 +105,7 @@ architecture Behavioral of ppu is
-- inputs
CLK : in std_logic; -- system clock
RESET : in std_logic; -- reset internal memory and clock counters
+ PL_RESET : in std_logic; -- reset pipeline clock counters
OE : in std_logic; -- output enable (of CIDX)
X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x
Y : in std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y
@@ -166,7 +167,7 @@ architecture Behavioral of ppu is
-- signals
signal SYSCLK, SYSRST : std_logic; -- system clock and reset
- signal PL_SPRITE, PL_COMP_PAL, PL_DONE : std_logic; -- pipeline stages
+ signal PL_SPRITE, PL_DONE, PL_READY : std_logic; -- pipeline stages
signal TMM_WEN, BAM_WEN, FAM_WEN, PAL_WEN, AUX_WEN : std_logic;
signal TMM_W_ADDR, TMM_R_ADDR : std_logic_vector(PPU_TMM_ADDR_WIDTH-1 downto 0); -- read/write TMM addr (dual port)
signal BAM_W_ADDR, BAM_R_ADDR : std_logic_vector(PPU_BAM_ADDR_WIDTH-1 downto 0); -- read/write BAM addr (dual port)
@@ -181,7 +182,7 @@ architecture Behavioral of ppu is
signal X : std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x
signal Y : std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y
signal UR,UG,UB : std_logic_vector(PPU_COLOR_OUTPUT_DEPTH-1 downto 0); -- unstable RGB (to be buffered)
- signal SR,SG,SB : std_logic_vector(PPU_COLOR_OUTPUT_DEPTH-1 downto 0); -- stable RGB (buffered until PL_COMP_PAL)
+ signal SR,SG,SB : std_logic_vector(PPU_COLOR_OUTPUT_DEPTH-1 downto 0); -- stable RGB (buffered until PL_DONE)
signal BG_SHIFT_X : std_logic_vector(PPU_POS_H_WIDTH-1 downto 0);
signal BG_SHIFT_Y : std_logic_vector(PPU_POS_V_WIDTH-1 downto 0);
signal FG_FETCH : std_logic;
@@ -202,8 +203,8 @@ begin
CLK => SYSCLK,
RESET => SYSRST,
SPRITE => PL_SPRITE,
- COMP_PAL => PL_COMP_PAL,
- DONE => PL_DONE);
+ DONE => PL_DONE,
+ READY => PL_READY);
address_decoder : component ppu_addr_dec port map(
WEN => WEN,
@@ -255,6 +256,7 @@ begin
background_sprite : component ppu_sprite_bg port map(
CLK => PL_SPRITE,
RESET => SYSRST,
+ PL_RESET => PL_READY,
OE => BG_EN,
X => X,
Y => Y,
@@ -272,6 +274,7 @@ begin
port map(
CLK => SYSCLK,
RESET => SYSRST,
+ PL_RESET => PL_READY,
OE => FG_EN(FG_IDX),
X => X,
Y => Y,
@@ -303,13 +306,13 @@ begin
B => UB);
-- palette lookup output buffer (pipeline stage 5)
- process(PL_COMP_PAL, SYSRST)
+ process(PL_DONE, SYSRST)
begin
if SYSRST = '1' then
SR <= x"0";
SG <= x"0";
SB <= x"0";
- elsif rising_edge(PL_COMP_PAL) then
+ elsif rising_edge(PL_DONE) then
SR <= UR;
SG <= UG;
SB <= UB;
@@ -331,7 +334,7 @@ begin
RESET => SYSRST,
X => X,
Y => Y,
- PREADY => PL_DONE,
+ PREADY => PL_READY,
RI => SR,
GI => SG,
BI => SB,
diff --git a/basys3/basys3.srcs/ppu_consts.vhd b/basys3/basys3.srcs/ppu_consts.vhd
index 75b6168..c7786c4 100644
--- a/basys3/basys3.srcs/ppu_consts.vhd
+++ b/basys3/basys3.srcs/ppu_consts.vhd
@@ -44,6 +44,8 @@ package ppu_consts is
constant PPU_TMM_CACHE_FETCH_C_COUNT : natural := PPU_SPRITE_WORD_COUNT + 1;
constant PPU_TMM_CACHE_FETCH_A_COUNT : natural := PPU_TMM_CACHE_FETCH_C_COUNT * PPU_FG_SPRITE_COUNT; -- amount of clocks to fetch new TMM cache
constant PPU_TMM_CACHE_FETCH_A_WIDTH : natural := ceil_log2(PPU_TMM_CACHE_FETCH_A_COUNT);
+ constant PPU_ACCURATE_FG_SPRITE_COUNT : natural := 16;
+ constant PPU_PL_TOTAL_STAGES : natural := 14;
end package ppu_consts;
package body ppu_consts is
-- https://stackoverflow.com/questions/21783280/number-of-bits-to-represent-an-integer-in-vhdl
diff --git a/basys3/basys3.srcs/ppu_pceg.vhd b/basys3/basys3.srcs/ppu_pceg.vhd
index 1aaeee4..5d9f4d6 100644
--- a/basys3/basys3.srcs/ppu_pceg.vhd
+++ b/basys3/basys3.srcs/ppu_pceg.vhd
@@ -1,25 +1,23 @@
library ieee;
use ieee.std_logic_1164.all;
---use ieee.numeric_std.all;
+use work.ppu_consts.all;
entity ppu_pceg is port(
CLK : in std_logic; -- system clock
RESET : in std_logic; -- async reset
SPRITE : out std_logic; -- sprite info fetch + sprite pixel fetch
- COMP_PAL : out std_logic; -- compositor + palette lookup
- DONE : out std_logic); -- last pipeline stage done
+ DONE : out std_logic; -- last pipeline stage done
+ READY : out std_logic); -- rgb buffer propagation ready
end ppu_pceg;
architecture Behavioral of ppu_pceg is
- constant PPU_PL_TOTAL_STAGES : natural := 14;
-
- type states is (PL_SPRITE, PL_COMP_PAL, PL_DONE);
+ type states is (PL_SPRITE, PL_DONE, PL_READY);
signal state : states := PL_SPRITE;
begin
-- output drivers
SPRITE <= CLK when RESET = '0' and state = PL_SPRITE else '0';
- COMP_PAL <= CLK when RESET = '0' and state = PL_COMP_PAL else '0';
- DONE <= '1' when RESET = '0' and state = PL_DONE else '0';
+ DONE <= CLK when RESET = '0' and state = PL_DONE else '0';
+ READY <= '1' when RESET = '0' and state = PL_READY else '0';
process(CLK, RESET)
variable CLK_IDX : natural range 0 to PPU_PL_TOTAL_STAGES+1 := 0;
@@ -31,9 +29,9 @@ begin
if CLK_IDX < 4 then
state <= PL_SPRITE;
elsif CLK_IDX < 5 then
- state <= PL_COMP_PAL;
- else
state <= PL_DONE;
+ else
+ state <= PL_READY;
end if;
-- increment clock counter
diff --git a/basys3/basys3.srcs/ppu_pceg_tb.vhd b/basys3/basys3.srcs/ppu_pceg_tb.vhd
index 719ec06..86061a0 100644
--- a/basys3/basys3.srcs/ppu_pceg_tb.vhd
+++ b/basys3/basys3.srcs/ppu_pceg_tb.vhd
@@ -13,22 +13,22 @@ architecture behavioral of ppu_pceg_tb is
CLK : in std_logic; -- system clock
RESET : in std_logic; -- async reset
SPRITE : out std_logic; -- sprite info fetch + sprite pixel fetch
- COMP_PAL : out std_logic; -- compositor + palette lookup
- DONE : out std_logic); -- last pipeline stage done
+ DONE : out std_logic; -- last pipeline stage done
+ READY : out std_logic); -- rgb buffer propagation ready
end component;
signal CLK : std_logic := '0';
signal RESET : std_logic := '0';
signal SPRITE : std_logic;
- signal COMP_PAL : std_logic;
signal DONE : std_logic;
+ signal READY : std_logic;
begin
uut : ppu_pceg port map(
CLK => CLK,
RESET => RESET,
SPRITE => SPRITE,
- COMP_PAL => COMP_PAL,
- DONE => DONE);
+ DONE => DONE,
+ READY => READY);
tb : process
begin
diff --git a/basys3/basys3.srcs/ppu_sprite_bg.vhd b/basys3/basys3.srcs/ppu_sprite_bg.vhd
index dba5b8e..1892694 100644
--- a/basys3/basys3.srcs/ppu_sprite_bg.vhd
+++ b/basys3/basys3.srcs/ppu_sprite_bg.vhd
@@ -11,6 +11,7 @@ entity ppu_sprite_bg is port(
-- inputs
CLK : in std_logic; -- pipeline clock
RESET : in std_logic; -- reset clock counter
+ PL_RESET : in std_logic; -- reset pipeline clock counters
OE : in std_logic; -- output enable (of CIDX)
X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x
Y : in std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y
@@ -108,16 +109,18 @@ begin
(others => '0') when others;
-- state machine (pipeline stage counter) + sync r/w
- process(CLK, RESET)
+ process(CLK, RESET, PL_RESET)
begin
- if RESET = '1' then
+ if RESET = '1' or PL_RESET = '1' then
-- reset state
state <= PL_BAM_ADDR;
- -- reset internal pipeline registers
- R_BAM_ADDR <= (others => '0');
- R_BAM_DATA <= (others => '0');
- R_TMM_ADDR <= (others => '0');
- R_TMM_DATA <= (others => '0');
+ if RESET = '1' then
+ -- reset internal pipeline registers
+ R_BAM_ADDR <= (others => '0');
+ R_BAM_DATA <= (others => '0');
+ R_TMM_ADDR <= (others => '0');
+ R_TMM_DATA <= (others => '0');
+ end if;
elsif rising_edge(CLK) then
case state is
when PL_BAM_ADDR =>
diff --git a/basys3/basys3.srcs/ppu_sprite_fg.vhd b/basys3/basys3.srcs/ppu_sprite_fg.vhd
index af7cfa3..dd315d8 100644
--- a/basys3/basys3.srcs/ppu_sprite_fg.vhd
+++ b/basys3/basys3.srcs/ppu_sprite_fg.vhd
@@ -14,6 +14,7 @@ entity ppu_sprite_fg is -- foreground sprite
-- inputs
CLK : in std_logic; -- system clock
RESET : in std_logic; -- reset internal memory and clock counters
+ PL_RESET : in std_logic; -- reset pipeline clock counters
OE : in std_logic; -- output enable (of CIDX)
X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x
Y : in std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y
@@ -80,18 +81,7 @@ architecture Behavioral of ppu_sprite_fg is
signal TRANS_TILE_PIXEL_IDX : integer := 0; -- index of pixel within tile (reading order)
signal TILEMAP_WORD_OFFSET : integer := 0; -- word offset from tile start address in TMM
signal TMM_DATA_PAL_IDX : std_logic_vector(PPU_PALETTE_COLOR_WIDTH-1 downto 0); -- color of palette
-
- -- TMM cache lines
- signal TMM_CACHE_WEN, TMM_CACHE_UPDATE_TURN : std_logic := '0';
- signal TMM_CACHE_DATA : std_logic_vector(PPU_TMM_DATA_WIDTH-1 downto 0) := (others => '0');
- signal TMM_CACHE_ADDR : std_logic_vector(PPU_TMM_ADDR_WIDTH-1 downto 0) := (others => '0');
- signal TMM_CACHE : std_logic_vector((PPU_SPRITE_WORD_COUNT * PPU_TMM_DATA_WIDTH)-1 downto 0);
begin
- -- output drivers
- CIDX <= T_CIDX when OE = '1' else (others => 'Z');
- -- CIDX combination
- T_CIDX <= FAM_REG_COL_IDX & TMM_DATA_PAL_IDX;
-
-- FAM memory
FAM : component er_ram
generic map(
@@ -107,11 +97,18 @@ begin
DATA => FAM_DATA,
REG => INT_FAM);
+ -- output drivers
+ CIDX <= T_CIDX when OE = '1' else (others => 'Z');
+ -- CIDX combination
+ T_CIDX <= FAM_REG_COL_IDX & TMM_DATA_PAL_IDX;
+
+ T_TMM_DATA <= TMM_DATA;
+
-- pixel position within bounding box of sprite
SPRITE_ACTIVE <= '1' when ((unsigned(X) + 16) >= unsigned(FAM_REG_POS_H)) and
- ((unsigned(X) + 16) < (unsigned(FAM_REG_POS_H) + to_unsigned(PPU_SPRITE_WIDTH, PPU_POS_H_WIDTH))) and
- ((unsigned(Y) + 16) >= unsigned(FAM_REG_POS_V)) and
- ((unsigned(Y) + 16) < (unsigned(FAM_REG_POS_V) + to_unsigned(PPU_SPRITE_HEIGHT, PPU_POS_V_WIDTH))) else '0';
+ ((unsigned(X) + 16) < (unsigned(FAM_REG_POS_H) + to_unsigned(PPU_SPRITE_WIDTH, PPU_POS_H_WIDTH))) and
+ ((unsigned(Y) + 16) >= unsigned(FAM_REG_POS_V)) and
+ ((unsigned(Y) + 16) < (unsigned(FAM_REG_POS_V) + to_unsigned(PPU_SPRITE_HEIGHT, PPU_POS_V_WIDTH))) else '0';
-- (sprite local) pixel coords
TILE_PIDX_X <= resize(unsigned(X) + 16 - resize(unsigned(FAM_REG_POS_H), TILE_PIDX_X'length), TILE_PIDX_X'length);
@@ -128,65 +125,80 @@ begin
-- pixel index
TRANS_TILE_PIXEL_IDX <= integer(PPU_SPRITE_WIDTH) * to_integer(TRANS_TILE_PIDX_Y) + to_integer(TRANS_TILE_PIDX_X);
- -- palette color at pixel
- TMM_DATA_PAL_IDX <= TMM_CACHE(TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH) + integer(PPU_PALETTE_COLOR_WIDTH)-1 downto TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH));
-- if pixel in sprite hitbox and TMM_DATA_PAL_IDX > 0
HIT <= SPRITE_ACTIVE and (nor TMM_DATA_PAL_IDX);
- -- FETCH LOGIC BELOW
- TMM_ADDR <= T_TMM_ADDR when TMM_CACHE_UPDATE_TURN else (others => 'Z');
- T_TMM_DATA <= TMM_DATA;
-
- -- TTM cache
- ttm_cache : component er_ram
- generic map(
- ADDR_W => PPU_TMM_ADDR_WIDTH,
- DATA_W => PPU_TMM_DATA_WIDTH,
- ADDR_LOW => 0,
- ADDR_RANGE => PPU_SPRITE_WORD_COUNT)
- port map(
- CLK => CLK,
- RST => RESET,
- WEN => TMM_CACHE_WEN,
- ADDR => TMM_CACHE_ADDR,
- DATA => TMM_CACHE_DATA,
- REG => TMM_CACHE);
-
- -- fetch machine, should do the following (offset data read by one clock -> propagation/lookup delay):
- -- CLK[53 * IDX + 0] (addr = 0)
- -- CLK[53 * IDX + 1] (addr = 1, read data[0])
- -- CLK[53 * IDX + 2] (addr = 2, read data[1]), etc
- -- a full tile is 52 words, but since the offset is 1 clock, a total copy takes 53 clock cycles
- process(CLK, RESET, FETCH)
- constant TMM_FETCH_CLK_RANGE_BEGIN : natural := PPU_TMM_CACHE_FETCH_C_COUNT * IDX; -- fetch CLK count for copying this module's sprite from TMM
- variable TMM_FETCH_CTR : unsigned(PPU_TMM_CACHE_FETCH_A_WIDTH-1 downto 0) := (others => '0'); -- CLK counter while FETCH=1
- variable TMM_FETCH_CTR_REL : unsigned(PPU_TMM_CACHE_FETCH_A_WIDTH-1 downto 0) := (others => '0'); -- CLK counter relative for sprite[IDX]
+ inaccurate_occlusion_shims: if IDX >= PPU_ACCURATE_FG_SPRITE_COUNT generate
begin
- if RESET = '1' or FETCH = '0' then
- TMM_FETCH_CTR := (others => '0');
- TMM_FETCH_CTR_REL := (others => '0');
- TMM_CACHE_WEN <= '0';
- TMM_CACHE_UPDATE_TURN <= '0';
- elsif rising_edge(CLK) then
- TMM_FETCH_CTR := TMM_FETCH_CTR + 1;
- TMM_FETCH_CTR_REL := TMM_FETCH_CTR - TMM_FETCH_CLK_RANGE_BEGIN;
-
- if TMM_FETCH_CTR >= TMM_FETCH_CLK_RANGE_BEGIN and
- TMM_FETCH_CTR < (TMM_FETCH_CLK_RANGE_BEGIN + PPU_TMM_CACHE_FETCH_C_COUNT) then
- TMM_CACHE_UPDATE_TURN <= '1';
- if TMM_FETCH_CTR_REL < PPU_TMM_CACHE_FETCH_C_COUNT - 1 then -- calculate address until second to last clock
- T_TMM_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR - IDX, T_TMM_ADDR'length));
- TMM_CACHE_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR_REL - 1, TMM_CACHE_ADDR'length));
- end if;
-
- if TMM_FETCH_CTR_REL > 0 then -- read offset
- TMM_CACHE_DATA <= T_TMM_DATA;
- TMM_CACHE_WEN <= '1';
- end if;
- else
+ -- palette color at pixel
+ TMM_DATA_PAL_IDX <= (others => '0');
+
+ TMM_ADDR <= (others => 'Z');
+ end generate;
+
+ accurate_occlusion_logic: if IDX < PPU_ACCURATE_FG_SPRITE_COUNT generate
+ -- TMM cache lines
+ signal TMM_CACHE_WEN, TMM_CACHE_UPDATE_TURN : std_logic := '0';
+ signal TMM_CACHE_DATA : std_logic_vector(PPU_TMM_DATA_WIDTH-1 downto 0) := (others => '0');
+ signal TMM_CACHE_ADDR : std_logic_vector(PPU_TMM_ADDR_WIDTH-1 downto 0) := (others => '0');
+ signal TMM_CACHE : std_logic_vector((PPU_SPRITE_WORD_COUNT * PPU_TMM_DATA_WIDTH)-1 downto 0);
+ begin
+ -- palette color at pixel
+ TMM_DATA_PAL_IDX <= TMM_CACHE(TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH) + integer(PPU_PALETTE_COLOR_WIDTH)-1 downto TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH));
+
+ TMM_ADDR <= T_TMM_ADDR when TMM_CACHE_UPDATE_TURN else (others => 'Z');
+
+ -- TTM cache
+ ttm_cache : component er_ram
+ generic map(
+ ADDR_W => PPU_TMM_ADDR_WIDTH,
+ DATA_W => PPU_TMM_DATA_WIDTH,
+ ADDR_LOW => 0,
+ ADDR_RANGE => PPU_SPRITE_WORD_COUNT)
+ port map(
+ CLK => CLK,
+ RST => RESET,
+ WEN => TMM_CACHE_WEN,
+ ADDR => TMM_CACHE_ADDR,
+ DATA => TMM_CACHE_DATA,
+ REG => TMM_CACHE);
+
+ -- fetch machine, should do the following (offset data read by one clock -> propagation/lookup delay):
+ -- CLK[53 * IDX + 0] (addr = 0)
+ -- CLK[53 * IDX + 1] (addr = 1, read data[0])
+ -- CLK[53 * IDX + 2] (addr = 2, read data[1]), etc
+ -- a full tile is 52 words, but since the offset is 1 clock, a total copy takes 53 clock cycles
+ process(CLK, RESET, FETCH)
+ constant TMM_FETCH_CLK_RANGE_BEGIN : natural := PPU_TMM_CACHE_FETCH_C_COUNT * IDX; -- fetch CLK count for copying this module's sprite from TMM
+ variable TMM_FETCH_CTR : unsigned(PPU_TMM_CACHE_FETCH_A_WIDTH-1 downto 0) := (others => '0'); -- CLK counter while FETCH=1
+ variable TMM_FETCH_CTR_REL : unsigned(PPU_TMM_CACHE_FETCH_A_WIDTH-1 downto 0) := (others => '0'); -- CLK counter relative for sprite[IDX]
+ begin
+ if RESET = '1' or FETCH = '0' then
+ TMM_FETCH_CTR := (others => '0');
+ TMM_FETCH_CTR_REL := (others => '0');
TMM_CACHE_WEN <= '0';
TMM_CACHE_UPDATE_TURN <= '0';
+ elsif rising_edge(CLK) then
+ TMM_FETCH_CTR := TMM_FETCH_CTR + 1;
+ TMM_FETCH_CTR_REL := TMM_FETCH_CTR - TMM_FETCH_CLK_RANGE_BEGIN;
+
+ if TMM_FETCH_CTR >= TMM_FETCH_CLK_RANGE_BEGIN and
+ TMM_FETCH_CTR < (TMM_FETCH_CLK_RANGE_BEGIN + PPU_TMM_CACHE_FETCH_C_COUNT) then
+ TMM_CACHE_UPDATE_TURN <= '1';
+ if TMM_FETCH_CTR_REL < PPU_TMM_CACHE_FETCH_C_COUNT - 1 then -- calculate address until second to last clock
+ T_TMM_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR - IDX, T_TMM_ADDR'length));
+ TMM_CACHE_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR_REL - 1, TMM_CACHE_ADDR'length));
+ end if;
+
+ if TMM_FETCH_CTR_REL > 0 then -- read offset
+ TMM_CACHE_DATA <= T_TMM_DATA;
+ TMM_CACHE_WEN <= '1';
+ end if;
+ else
+ TMM_CACHE_WEN <= '0';
+ TMM_CACHE_UPDATE_TURN <= '0';
+ end if;
end if;
- end if;
- end process;
+ end process;
+ end generate;
end Behavioral;
diff --git a/basys3/basys3.xpr b/basys3/basys3.xpr
index 87ec9fe..22b1d66 100644
--- a/basys3/basys3.xpr
+++ b/basys3/basys3.xpr
@@ -61,7 +61,7 @@
<Option Name="IPStaticSourceDir" Val="$PIPUSERFILESDIR/ipstatic"/>
<Option Name="EnableBDX" Val="FALSE"/>
<Option Name="DSABoardId" Val="basys3"/>
- <Option Name="WTXSimLaunchSim" Val="118"/>
+ <Option Name="WTXSimLaunchSim" Val="121"/>
<Option Name="WTModelSimLaunchSim" Val="0"/>
<Option Name="WTQuestaLaunchSim" Val="0"/>
<Option Name="WTIesLaunchSim" Val="0"/>
@@ -268,12 +268,11 @@
</File>
<Config>
<Option Name="DesignMode" Val="RTL"/>
- <Option Name="TopModule" Val="ppu_addr_dec_tb"/>
+ <Option Name="TopModule" Val="ppu_pceg_tb"/>
<Option Name="TopLib" Val="xil_defaultlib"/>
<Option Name="TransportPathDelay" Val="0"/>
<Option Name="TransportIntDelay" Val="0"/>
<Option Name="SelectedSimModel" Val="rtl"/>
- <Option Name="SimMode" Val="post-synthesis"/>
<Option Name="PamDesignTestbench" Val=""/>
<Option Name="PamDutBypassFile" Val="xil_dut_bypass"/>
<Option Name="PamSignalDriverFile" Val="xil_bypass_driver"/>
@@ -283,6 +282,14 @@
</FileSet>
<FileSet Name="utils_1" Type="Utils" RelSrcDir="$PSRCDIR/utils_1" RelGenDir="$PGENDIR/utils_1">
<Filter Type="Utils"/>
+ <File Path="$PSRCDIR/utils_1/imports/synth_1/ppu.dcp">
+ <FileInfo>
+ <Attr Name="UsedIn" Val="synthesis"/>
+ <Attr Name="UsedIn" Val="implementation"/>
+ <Attr Name="UsedInSteps" Val="synth_1"/>
+ <Attr Name="AutoDcp" Val="1"/>
+ </FileInfo>
+ </File>
<Config>
<Option Name="TopAutoSet" Val="TRUE"/>
</Config>
@@ -337,7 +344,7 @@
</Simulator>
</Simulators>
<Runs Version="1" Minor="19">
- <Run Id="synth_1" Type="Ft3:Synth" SrcSet="sources_1" Part="xc7a35tcpg236-1" ConstrsSet="constrs_1" Description="Vivado Synthesis Defaults" AutoIncrementalCheckpoint="true" WriteIncrSynthDcp="false" State="current" Dir="$PRUNDIR/synth_1" IncludeInArchive="true" IsChild="false" AutoIncrementalDir="$PSRCDIR/utils_1/imports/synth_1" AutoRQSDir="$PSRCDIR/utils_1/imports/synth_1">
+ <Run Id="synth_1" Type="Ft3:Synth" SrcSet="sources_1" Part="xc7a35tcpg236-1" ConstrsSet="constrs_1" Description="Vivado Synthesis Defaults" AutoIncrementalCheckpoint="true" IncrementalCheckpoint="$PSRCDIR/utils_1/imports/synth_1/ppu.dcp" WriteIncrSynthDcp="false" State="current" Dir="$PRUNDIR/synth_1" IncludeInArchive="true" IsChild="false" AutoIncrementalDir="$PSRCDIR/utils_1/imports/synth_1" AutoRQSDir="$PSRCDIR/utils_1/imports/synth_1">
<Strategy Version="1" Minor="2">
<StratHandle Name="Vivado Synthesis Defaults" Flow="Vivado Synthesis 2022"/>
<Step Id="synth_design"/>
@@ -359,9 +366,7 @@
</Run>
<Run Id="ppu_bam_synth_1" Type="Ft3:Synth" SrcSet="ppu_bam" Part="xc7a35tcpg236-1" ConstrsSet="ppu_bam" Description="Vivado Synthesis Defaults" AutoIncrementalCheckpoint="false" WriteIncrSynthDcp="false" Dir="$PRUNDIR/ppu_bam_synth_1" IncludeInArchive="true" IsChild="false" AutoIncrementalDir="$PSRCDIR/utils_1/imports/ppu_bam_synth_1" AutoRQSDir="$PSRCDIR/utils_1/imports/ppu_bam_synth_1">
<Strategy Version="1" Minor="2">
- <StratHandle Name="Vivado Synthesis Defaults" Flow="Vivado Synthesis 2022">
- <Desc>Vivado Synthesis Defaults</Desc>
- </StratHandle>
+ <StratHandle Name="Vivado Synthesis Defaults" Flow="Vivado Synthesis 2022"/>
<Step Id="synth_design"/>
</Strategy>
<GeneratedRun Dir="$PRUNDIR" File="gen_run.xml"/>
@@ -405,9 +410,7 @@
</Run>
<Run Id="ppu_bam_impl_1" Type="Ft2:EntireDesign" Part="xc7a35tcpg236-1" ConstrsSet="ppu_bam" Description="Default settings for Implementation." AutoIncrementalCheckpoint="false" WriteIncrSynthDcp="false" SynthRun="ppu_bam_synth_1" IncludeInArchive="false" IsChild="false" GenFullBitstream="true" AutoIncrementalDir="$PSRCDIR/utils_1/imports/ppu_bam_impl_1" AutoRQSDir="$PSRCDIR/utils_1/imports/ppu_bam_impl_1">
<Strategy Version="1" Minor="2">
- <StratHandle Name="Vivado Implementation Defaults" Flow="Vivado Implementation 2022">
- <Desc>Default settings for Implementation.</Desc>
- </StratHandle>
+ <StratHandle Name="Vivado Implementation Defaults" Flow="Vivado Implementation 2022"/>
<Step Id="init_design"/>
<Step Id="opt_design"/>
<Step Id="power_opt_design"/>
diff --git a/docs/architecture.md b/docs/architecture.md
index 5001eed..9a77e57 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -68,6 +68,7 @@ Here's a list of features our PPU has:
- 640x480 background canvas with scrolling
- NO background scrolling splits
- 128 total sprites on screen (NO scanline sprite limit)
+ - the first 16 foreground sprites have accurate background occlusion
- sprites are always drawn on top of the background layer
- PPU control using DMA (dual-port asynchronous RAM)
- tiles can be flipped using FAM or BAM
@@ -82,7 +83,9 @@ Notable differences:
Since we're using VGA, we can't use custom resolutions without an
upscaler/downscaler. This resolution was chosen because it's exactly half of
- the lowest standard VGA resolution 640x480.
+ the lowest standard VGA resolution 640x480. The native resolution can't be
+ used due to the pipelined pixel fetch logic, which needs at least 5 clock
+ cycles to produce a stable color output.
- No scanline sprite limit
Unless not imposing any sprite limit makes the hardware implementation
@@ -98,24 +101,24 @@ Notable differences:
- Single 1024 sprite tilemap shared between foreground and background sprites
The NES OAM registers contain a bit to select which tilemap to use (of two),
- which effectively expands each tile's index address by one byte. Instead of
+ which effectively expands each tile's index address by one bit. Instead of
creating the illusion of two separate memory areas for tiles, having one
- large tilemap seems like a more sensible solution to indexed tiles.
+ large tilemap seems like a more sensible solution.
- 8 total palettes, with 8 colors each
- More colors is better. Increasing the total palette count is a very memory
- intensive operation, while increasing the palette color count is likely slower
+ More colors is better. Increasing the palette color count is a very memory
+ intensive operation, while increasing the total amount of palettes is slower
when looking up color values for each pixel on real hardware.
- Sprites can be positioned partially off-screen on all screen edges using only
the offset bits in the FAM register
The NES has a separate PPUMASK register to control special color effects, and
to shift sprites off the left and top screen edges, as the sprite offsets
- count from 0. Our PPU's FAM sprite offset bits count from -15, so the sprite
+ count from 0. Our PPU's FAM sprite offset bits count from -16, so the sprite
can shift past the top and left screen edges, as well as the standard bottom
and right edges.
-- No status line register, only V-sync and H-sync outputs are supplied back to
- CPU
+- No status line register, only vertical and horizontal blanking/sync outputs
+ are supplied back to CPU
The NES status line register contains some handy lines, such as a buggy
status line for reaching the max sprite count per scanline, and a status line
@@ -126,7 +129,7 @@ Notable differences:
- No background scrolling splits
This feature allows only part of the background canvas to be scrolled, while
- another portion stays still. This was used to draw HUD elements on the
+ another portion remains still. This was used to draw HUD elements on the
background layer for displaying things like health bars or score counters.
Since we are working with a higher foreground sprite limit, we'll use regular
foreground sprites to display HUD elements.
@@ -197,6 +200,7 @@ Important notes:
the RAM in its own cache memory. The cache updates are fetched during the
VBLANK time between each frame.
+<!-- inaccurate and no longer needed
### Level 3
This diagram has several flaws, but a significant amount of time has already
@@ -227,6 +231,24 @@ Important notes:
CIDX signal based on the EN signal from the compositor.
- All DATA and ADDR lines are shared between all RAM ports. WEN inputs are
controlled by the address decoder.
+-->
+
+## Pipeline stage reference
+
+This table describes which components use which lines during pipeline stages
+1-6. The pipeline stages happen for every pixel, and are run on the system clock
+(100 MHz).
+
+|Stage|Component|Action|To|Type|
+|-|-|-|-|-|
+|1|`ppu_sprite_bg`|write|BAM address|bus|
+|2|`ppu_sprite_bg`|read|BAM data|bus|
+|2|`ppu_sprite_fg`|write|TMM address|bus|
+|3|`ppu_sprite_bg`|write|TMM address|bus|
+|3|`ppu_sprite_fg`|read|TMM data|bus|
+|4|`ppu_sprite_bg`|read|TMM data|bus|
+|5|`ppu_pceg`|write|pixel done|flag|
+|6|`ppu_pceg`|write|pixel ready|flag|
## Registers
@@ -258,7 +280,7 @@ there is no address validity checking.
discarded padding bit per word)
- Pixel index order is from top-left to bottom-right in (English) reading
order.
-- Bits `14 downto 3` of the byte with the highest address for a given tile are
+- Bits `14 downto 3` of the word with the highest address for a given tile are
not used
- To calculate TMM address $a$ for any given pixel $p$ of tile with index $t$,
compute $a=52*t+\left\lfloor\frac{p}{5}\right\rfloor$