diff options
-rw-r--r-- | basys3/basys3.srcs/ppu.vhd | 23 | ||||
-rw-r--r-- | basys3/basys3.srcs/ppu_consts.vhd | 2 | ||||
-rw-r--r-- | basys3/basys3.srcs/ppu_pceg.vhd | 18 | ||||
-rw-r--r-- | basys3/basys3.srcs/ppu_pceg_tb.vhd | 10 | ||||
-rw-r--r-- | basys3/basys3.srcs/ppu_sprite_bg.vhd | 17 | ||||
-rw-r--r-- | basys3/basys3.srcs/ppu_sprite_fg.vhd | 148 | ||||
-rw-r--r-- | basys3/basys3.xpr | 23 | ||||
-rw-r--r-- | docs/architecture.md | 42 |
8 files changed, 163 insertions, 120 deletions
diff --git a/basys3/basys3.srcs/ppu.vhd b/basys3/basys3.srcs/ppu.vhd index 2425edc..c6dfe60 100644 --- a/basys3/basys3.srcs/ppu.vhd +++ b/basys3/basys3.srcs/ppu.vhd @@ -2,7 +2,6 @@ library ieee; library work; use ieee.std_logic_1164.all; ---use ieee.numeric_std.all; use work.ppu_consts.all; entity ppu is port( @@ -22,8 +21,8 @@ architecture Behavioral of ppu is CLK : in std_logic; -- system clock RESET : in std_logic; -- async reset SPRITE : out std_logic; -- sprite info fetch + sprite pixel fetch - COMP_PAL : out std_logic; -- compositor + palette lookup - DONE : out std_logic); -- last pipeline stage done + DONE : out std_logic; -- last pipeline stage done + READY : out std_logic); -- rgb buffer propagation ready end component; component ppu_addr_dec port( -- address decoder WEN : in std_logic; -- EXT write enable @@ -81,6 +80,7 @@ architecture Behavioral of ppu is -- inputs CLK : in std_logic; -- pipeline clock RESET : in std_logic; -- reset clock counter + PL_RESET : in std_logic; -- reset pipeline clock counters OE : in std_logic; -- output enable (of CIDX) X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x Y : in std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y @@ -105,6 +105,7 @@ architecture Behavioral of ppu is -- inputs CLK : in std_logic; -- system clock RESET : in std_logic; -- reset internal memory and clock counters + PL_RESET : in std_logic; -- reset pipeline clock counters OE : in std_logic; -- output enable (of CIDX) X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x Y : in std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y @@ -166,7 +167,7 @@ architecture Behavioral of ppu is -- signals signal SYSCLK, SYSRST : std_logic; -- system clock and reset - signal PL_SPRITE, PL_COMP_PAL, PL_DONE : std_logic; -- pipeline stages + signal PL_SPRITE, PL_DONE, PL_READY : std_logic; -- pipeline stages signal TMM_WEN, BAM_WEN, FAM_WEN, PAL_WEN, AUX_WEN : std_logic; 
signal TMM_W_ADDR, TMM_R_ADDR : std_logic_vector(PPU_TMM_ADDR_WIDTH-1 downto 0); -- read/write TMM addr (dual port) signal BAM_W_ADDR, BAM_R_ADDR : std_logic_vector(PPU_BAM_ADDR_WIDTH-1 downto 0); -- read/write BAM addr (dual port) @@ -181,7 +182,7 @@ architecture Behavioral of ppu is signal X : std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x signal Y : std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y signal UR,UG,UB : std_logic_vector(PPU_COLOR_OUTPUT_DEPTH-1 downto 0); -- unstable RGB (to be buffered) - signal SR,SG,SB : std_logic_vector(PPU_COLOR_OUTPUT_DEPTH-1 downto 0); -- stable RGB (buffered until PL_COMP_PAL) + signal SR,SG,SB : std_logic_vector(PPU_COLOR_OUTPUT_DEPTH-1 downto 0); -- stable RGB (buffered until PL_DONE) signal BG_SHIFT_X : std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); signal BG_SHIFT_Y : std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); signal FG_FETCH : std_logic; @@ -202,8 +203,8 @@ begin CLK => SYSCLK, RESET => SYSRST, SPRITE => PL_SPRITE, - COMP_PAL => PL_COMP_PAL, - DONE => PL_DONE); + DONE => PL_DONE, + READY => PL_READY); address_decoder : component ppu_addr_dec port map( WEN => WEN, @@ -255,6 +256,7 @@ begin background_sprite : component ppu_sprite_bg port map( CLK => PL_SPRITE, RESET => SYSRST, + PL_RESET => PL_READY, OE => BG_EN, X => X, Y => Y, @@ -272,6 +274,7 @@ begin port map( CLK => SYSCLK, RESET => SYSRST, + PL_RESET => PL_READY, OE => FG_EN(FG_IDX), X => X, Y => Y, @@ -303,13 +306,13 @@ begin B => UB); -- palette lookup output buffer (pipeline stage 5) - process(PL_COMP_PAL, SYSRST) + process(PL_DONE, SYSRST) begin if SYSRST = '1' then SR <= x"0"; SG <= x"0"; SB <= x"0"; - elsif rising_edge(PL_COMP_PAL) then + elsif rising_edge(PL_DONE) then SR <= UR; SG <= UG; SB <= UB; @@ -331,7 +334,7 @@ begin RESET => SYSRST, X => X, Y => Y, - PREADY => PL_DONE, + PREADY => PL_READY, RI => SR, GI => SG, BI => SB, diff --git a/basys3/basys3.srcs/ppu_consts.vhd b/basys3/basys3.srcs/ppu_consts.vhd 
index 75b6168..c7786c4 100644 --- a/basys3/basys3.srcs/ppu_consts.vhd +++ b/basys3/basys3.srcs/ppu_consts.vhd @@ -44,6 +44,8 @@ package ppu_consts is constant PPU_TMM_CACHE_FETCH_C_COUNT : natural := PPU_SPRITE_WORD_COUNT + 1; constant PPU_TMM_CACHE_FETCH_A_COUNT : natural := PPU_TMM_CACHE_FETCH_C_COUNT * PPU_FG_SPRITE_COUNT; -- amount of clocks to fetch new TMM cache constant PPU_TMM_CACHE_FETCH_A_WIDTH : natural := ceil_log2(PPU_TMM_CACHE_FETCH_A_COUNT); + constant PPU_ACCURATE_FG_SPRITE_COUNT : natural := 16; + constant PPU_PL_TOTAL_STAGES : natural := 14; end package ppu_consts; package body ppu_consts is -- https://stackoverflow.com/questions/21783280/number-of-bits-to-represent-an-integer-in-vhdl diff --git a/basys3/basys3.srcs/ppu_pceg.vhd b/basys3/basys3.srcs/ppu_pceg.vhd index 1aaeee4..5d9f4d6 100644 --- a/basys3/basys3.srcs/ppu_pceg.vhd +++ b/basys3/basys3.srcs/ppu_pceg.vhd @@ -1,25 +1,23 @@ library ieee; use ieee.std_logic_1164.all; ---use ieee.numeric_std.all; +use work.ppu_consts.all; entity ppu_pceg is port( CLK : in std_logic; -- system clock RESET : in std_logic; -- async reset SPRITE : out std_logic; -- sprite info fetch + sprite pixel fetch - COMP_PAL : out std_logic; -- compositor + palette lookup - DONE : out std_logic); -- last pipeline stage done + DONE : out std_logic; -- last pipeline stage done + READY : out std_logic); -- rgb buffer propagation ready end ppu_pceg; architecture Behavioral of ppu_pceg is - constant PPU_PL_TOTAL_STAGES : natural := 14; - - type states is (PL_SPRITE, PL_COMP_PAL, PL_DONE); + type states is (PL_SPRITE, PL_DONE, PL_READY); signal state : states := PL_SPRITE; begin -- output drivers SPRITE <= CLK when RESET = '0' and state = PL_SPRITE else '0'; - COMP_PAL <= CLK when RESET = '0' and state = PL_COMP_PAL else '0'; - DONE <= '1' when RESET = '0' and state = PL_DONE else '0'; + DONE <= CLK when RESET = '0' and state = PL_DONE else '0'; + READY <= '1' when RESET = '0' and state = PL_READY else '0'; process(CLK, RESET) 
variable CLK_IDX : natural range 0 to PPU_PL_TOTAL_STAGES+1 := 0; @@ -31,9 +29,9 @@ begin if CLK_IDX < 4 then state <= PL_SPRITE; elsif CLK_IDX < 5 then - state <= PL_COMP_PAL; - else state <= PL_DONE; + else + state <= PL_READY; end if; -- increment clock counter diff --git a/basys3/basys3.srcs/ppu_pceg_tb.vhd b/basys3/basys3.srcs/ppu_pceg_tb.vhd index 719ec06..86061a0 100644 --- a/basys3/basys3.srcs/ppu_pceg_tb.vhd +++ b/basys3/basys3.srcs/ppu_pceg_tb.vhd @@ -13,22 +13,22 @@ architecture behavioral of ppu_pceg_tb is CLK : in std_logic; -- system clock RESET : in std_logic; -- async reset SPRITE : out std_logic; -- sprite info fetch + sprite pixel fetch - COMP_PAL : out std_logic; -- compositor + palette lookup - DONE : out std_logic); -- last pipeline stage done + DONE : out std_logic; -- last pipeline stage done + READY : out std_logic); -- rgb buffer propagation ready end component; signal CLK : std_logic := '0'; signal RESET : std_logic := '0'; signal SPRITE : std_logic; - signal COMP_PAL : std_logic; signal DONE : std_logic; + signal READY : std_logic; begin uut : ppu_pceg port map( CLK => CLK, RESET => RESET, SPRITE => SPRITE, - COMP_PAL => COMP_PAL, - DONE => DONE); + DONE => DONE, + READY => READY); tb : process begin diff --git a/basys3/basys3.srcs/ppu_sprite_bg.vhd b/basys3/basys3.srcs/ppu_sprite_bg.vhd index dba5b8e..1892694 100644 --- a/basys3/basys3.srcs/ppu_sprite_bg.vhd +++ b/basys3/basys3.srcs/ppu_sprite_bg.vhd @@ -11,6 +11,7 @@ entity ppu_sprite_bg is port( -- inputs CLK : in std_logic; -- pipeline clock RESET : in std_logic; -- reset clock counter + PL_RESET : in std_logic; -- reset pipeline clock counters OE : in std_logic; -- output enable (of CIDX) X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x Y : in std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y @@ -108,16 +109,18 @@ begin (others => '0') when others; -- state machine (pipeline stage counter) + sync r/w - process(CLK, RESET) + 
process(CLK, RESET, PL_RESET) begin - if RESET = '1' then + if RESET = '1' or PL_RESET = '1' then -- reset state state <= PL_BAM_ADDR; - -- reset internal pipeline registers - R_BAM_ADDR <= (others => '0'); - R_BAM_DATA <= (others => '0'); - R_TMM_ADDR <= (others => '0'); - R_TMM_DATA <= (others => '0'); + if RESET = '1' then + -- reset internal pipeline registers + R_BAM_ADDR <= (others => '0'); + R_BAM_DATA <= (others => '0'); + R_TMM_ADDR <= (others => '0'); + R_TMM_DATA <= (others => '0'); + end if; elsif rising_edge(CLK) then case state is when PL_BAM_ADDR => diff --git a/basys3/basys3.srcs/ppu_sprite_fg.vhd b/basys3/basys3.srcs/ppu_sprite_fg.vhd index af7cfa3..dd315d8 100644 --- a/basys3/basys3.srcs/ppu_sprite_fg.vhd +++ b/basys3/basys3.srcs/ppu_sprite_fg.vhd @@ -14,6 +14,7 @@ entity ppu_sprite_fg is -- foreground sprite -- inputs CLK : in std_logic; -- system clock RESET : in std_logic; -- reset internal memory and clock counters + PL_RESET : in std_logic; -- reset pipeline clock counters OE : in std_logic; -- output enable (of CIDX) X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x Y : in std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y @@ -80,18 +81,7 @@ architecture Behavioral of ppu_sprite_fg is signal TRANS_TILE_PIXEL_IDX : integer := 0; -- index of pixel within tile (reading order) signal TILEMAP_WORD_OFFSET : integer := 0; -- word offset from tile start address in TMM signal TMM_DATA_PAL_IDX : std_logic_vector(PPU_PALETTE_COLOR_WIDTH-1 downto 0); -- color of palette - - -- TMM cache lines - signal TMM_CACHE_WEN, TMM_CACHE_UPDATE_TURN : std_logic := '0'; - signal TMM_CACHE_DATA : std_logic_vector(PPU_TMM_DATA_WIDTH-1 downto 0) := (others => '0'); - signal TMM_CACHE_ADDR : std_logic_vector(PPU_TMM_ADDR_WIDTH-1 downto 0) := (others => '0'); - signal TMM_CACHE : std_logic_vector((PPU_SPRITE_WORD_COUNT * PPU_TMM_DATA_WIDTH)-1 downto 0); begin - -- output drivers - CIDX <= T_CIDX when OE = '1' else (others 
=> 'Z'); - -- CIDX combination - T_CIDX <= FAM_REG_COL_IDX & TMM_DATA_PAL_IDX; - -- FAM memory FAM : component er_ram generic map( @@ -107,11 +97,18 @@ begin DATA => FAM_DATA, REG => INT_FAM); + -- output drivers + CIDX <= T_CIDX when OE = '1' else (others => 'Z'); + -- CIDX combination + T_CIDX <= FAM_REG_COL_IDX & TMM_DATA_PAL_IDX; + + T_TMM_DATA <= TMM_DATA; + -- pixel position within bounding box of sprite SPRITE_ACTIVE <= '1' when ((unsigned(X) + 16) >= unsigned(FAM_REG_POS_H)) and - ((unsigned(X) + 16) < (unsigned(FAM_REG_POS_H) + to_unsigned(PPU_SPRITE_WIDTH, PPU_POS_H_WIDTH))) and - ((unsigned(Y) + 16) >= unsigned(FAM_REG_POS_V)) and - ((unsigned(Y) + 16) < (unsigned(FAM_REG_POS_V) + to_unsigned(PPU_SPRITE_HEIGHT, PPU_POS_V_WIDTH))) else '0'; + ((unsigned(X) + 16) < (unsigned(FAM_REG_POS_H) + to_unsigned(PPU_SPRITE_WIDTH, PPU_POS_H_WIDTH))) and + ((unsigned(Y) + 16) >= unsigned(FAM_REG_POS_V)) and + ((unsigned(Y) + 16) < (unsigned(FAM_REG_POS_V) + to_unsigned(PPU_SPRITE_HEIGHT, PPU_POS_V_WIDTH))) else '0'; -- (sprite local) pixel coords TILE_PIDX_X <= resize(unsigned(X) + 16 - resize(unsigned(FAM_REG_POS_H), TILE_PIDX_X'length), TILE_PIDX_X'length); @@ -128,65 +125,80 @@ begin -- pixel index TRANS_TILE_PIXEL_IDX <= integer(PPU_SPRITE_WIDTH) * to_integer(TRANS_TILE_PIDX_Y) + to_integer(TRANS_TILE_PIDX_X); - -- palette color at pixel - TMM_DATA_PAL_IDX <= TMM_CACHE(TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH) + integer(PPU_PALETTE_COLOR_WIDTH)-1 downto TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH)); -- if pixel in sprite hitbox and TMM_DATA_PAL_IDX > 0 HIT <= SPRITE_ACTIVE and (nor TMM_DATA_PAL_IDX); - -- FETCH LOGIC BELOW - TMM_ADDR <= T_TMM_ADDR when TMM_CACHE_UPDATE_TURN else (others => 'Z'); - T_TMM_DATA <= TMM_DATA; - - -- TTM cache - ttm_cache : component er_ram - generic map( - ADDR_W => PPU_TMM_ADDR_WIDTH, - DATA_W => PPU_TMM_DATA_WIDTH, - ADDR_LOW => 0, - ADDR_RANGE => PPU_SPRITE_WORD_COUNT) - port map( - CLK => CLK, - RST => 
RESET, - WEN => TMM_CACHE_WEN, - ADDR => TMM_CACHE_ADDR, - DATA => TMM_CACHE_DATA, - REG => TMM_CACHE); - - -- fetch machine, should do the following (offset data read by one clock -> propagation/lookup delay): - -- CLK[53 * IDX + 0] (addr = 0) - -- CLK[53 * IDX + 1] (addr = 1, read data[0]) - -- CLK[53 * IDX + 2] (addr = 2, read data[1]), etc - -- a full tile is 52 words, but since the offset is 1 clock, a total copy takes 53 clock cycles - process(CLK, RESET, FETCH) - constant TMM_FETCH_CLK_RANGE_BEGIN : natural := PPU_TMM_CACHE_FETCH_C_COUNT * IDX; -- fetch CLK count for copying this module's sprite from TMM - variable TMM_FETCH_CTR : unsigned(PPU_TMM_CACHE_FETCH_A_WIDTH-1 downto 0) := (others => '0'); -- CLK counter while FETCH=1 - variable TMM_FETCH_CTR_REL : unsigned(PPU_TMM_CACHE_FETCH_A_WIDTH-1 downto 0) := (others => '0'); -- CLK counter relative for sprite[IDX] + inaccurate_occlusion_shims: if IDX >= PPU_ACCURATE_FG_SPRITE_COUNT generate begin - if RESET = '1' or FETCH = '0' then - TMM_FETCH_CTR := (others => '0'); - TMM_FETCH_CTR_REL := (others => '0'); - TMM_CACHE_WEN <= '0'; - TMM_CACHE_UPDATE_TURN <= '0'; - elsif rising_edge(CLK) then - TMM_FETCH_CTR := TMM_FETCH_CTR + 1; - TMM_FETCH_CTR_REL := TMM_FETCH_CTR - TMM_FETCH_CLK_RANGE_BEGIN; - - if TMM_FETCH_CTR >= TMM_FETCH_CLK_RANGE_BEGIN and - TMM_FETCH_CTR < (TMM_FETCH_CLK_RANGE_BEGIN + PPU_TMM_CACHE_FETCH_C_COUNT) then - TMM_CACHE_UPDATE_TURN <= '1'; - if TMM_FETCH_CTR_REL < PPU_TMM_CACHE_FETCH_C_COUNT - 1 then -- calculate address until second to last clock - T_TMM_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR - IDX, T_TMM_ADDR'length)); - TMM_CACHE_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR_REL - 1, TMM_CACHE_ADDR'length)); - end if; - - if TMM_FETCH_CTR_REL > 0 then -- read offset - TMM_CACHE_DATA <= T_TMM_DATA; - TMM_CACHE_WEN <= '1'; - end if; - else + -- palette color at pixel + TMM_DATA_PAL_IDX <= (others => '0'); + + TMM_ADDR <= (others => 'Z'); + end generate; + + accurate_occlusion_logic: 
if IDX < PPU_ACCURATE_FG_SPRITE_COUNT generate + -- TMM cache lines + signal TMM_CACHE_WEN, TMM_CACHE_UPDATE_TURN : std_logic := '0'; + signal TMM_CACHE_DATA : std_logic_vector(PPU_TMM_DATA_WIDTH-1 downto 0) := (others => '0'); + signal TMM_CACHE_ADDR : std_logic_vector(PPU_TMM_ADDR_WIDTH-1 downto 0) := (others => '0'); + signal TMM_CACHE : std_logic_vector((PPU_SPRITE_WORD_COUNT * PPU_TMM_DATA_WIDTH)-1 downto 0); + begin + -- palette color at pixel + TMM_DATA_PAL_IDX <= TMM_CACHE(TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH) + integer(PPU_PALETTE_COLOR_WIDTH)-1 downto TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH)); + + TMM_ADDR <= T_TMM_ADDR when TMM_CACHE_UPDATE_TURN else (others => 'Z'); + + -- TTM cache + ttm_cache : component er_ram + generic map( + ADDR_W => PPU_TMM_ADDR_WIDTH, + DATA_W => PPU_TMM_DATA_WIDTH, + ADDR_LOW => 0, + ADDR_RANGE => PPU_SPRITE_WORD_COUNT) + port map( + CLK => CLK, + RST => RESET, + WEN => TMM_CACHE_WEN, + ADDR => TMM_CACHE_ADDR, + DATA => TMM_CACHE_DATA, + REG => TMM_CACHE); + + -- fetch machine, should do the following (offset data read by one clock -> propagation/lookup delay): + -- CLK[53 * IDX + 0] (addr = 0) + -- CLK[53 * IDX + 1] (addr = 1, read data[0]) + -- CLK[53 * IDX + 2] (addr = 2, read data[1]), etc + -- a full tile is 52 words, but since the offset is 1 clock, a total copy takes 53 clock cycles + process(CLK, RESET, FETCH) + constant TMM_FETCH_CLK_RANGE_BEGIN : natural := PPU_TMM_CACHE_FETCH_C_COUNT * IDX; -- fetch CLK count for copying this module's sprite from TMM + variable TMM_FETCH_CTR : unsigned(PPU_TMM_CACHE_FETCH_A_WIDTH-1 downto 0) := (others => '0'); -- CLK counter while FETCH=1 + variable TMM_FETCH_CTR_REL : unsigned(PPU_TMM_CACHE_FETCH_A_WIDTH-1 downto 0) := (others => '0'); -- CLK counter relative for sprite[IDX] + begin + if RESET = '1' or FETCH = '0' then + TMM_FETCH_CTR := (others => '0'); + TMM_FETCH_CTR_REL := (others => '0'); TMM_CACHE_WEN <= '0'; TMM_CACHE_UPDATE_TURN <= '0'; + 
elsif rising_edge(CLK) then + TMM_FETCH_CTR := TMM_FETCH_CTR + 1; + TMM_FETCH_CTR_REL := TMM_FETCH_CTR - TMM_FETCH_CLK_RANGE_BEGIN; + + if TMM_FETCH_CTR >= TMM_FETCH_CLK_RANGE_BEGIN and + TMM_FETCH_CTR < (TMM_FETCH_CLK_RANGE_BEGIN + PPU_TMM_CACHE_FETCH_C_COUNT) then + TMM_CACHE_UPDATE_TURN <= '1'; + if TMM_FETCH_CTR_REL < PPU_TMM_CACHE_FETCH_C_COUNT - 1 then -- calculate address until second to last clock + T_TMM_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR - IDX, T_TMM_ADDR'length)); + TMM_CACHE_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR_REL - 1, TMM_CACHE_ADDR'length)); + end if; + + if TMM_FETCH_CTR_REL > 0 then -- read offset + TMM_CACHE_DATA <= T_TMM_DATA; + TMM_CACHE_WEN <= '1'; + end if; + else + TMM_CACHE_WEN <= '0'; + TMM_CACHE_UPDATE_TURN <= '0'; + end if; end if; - end if; - end process; + end process; + end generate; end Behavioral; diff --git a/basys3/basys3.xpr b/basys3/basys3.xpr index 87ec9fe..22b1d66 100644 --- a/basys3/basys3.xpr +++ b/basys3/basys3.xpr @@ -61,7 +61,7 @@ <Option Name="IPStaticSourceDir" Val="$PIPUSERFILESDIR/ipstatic"/> <Option Name="EnableBDX" Val="FALSE"/> <Option Name="DSABoardId" Val="basys3"/> - <Option Name="WTXSimLaunchSim" Val="118"/> + <Option Name="WTXSimLaunchSim" Val="121"/> <Option Name="WTModelSimLaunchSim" Val="0"/> <Option Name="WTQuestaLaunchSim" Val="0"/> <Option Name="WTIesLaunchSim" Val="0"/> @@ -268,12 +268,11 @@ </File> <Config> <Option Name="DesignMode" Val="RTL"/> - <Option Name="TopModule" Val="ppu_addr_dec_tb"/> + <Option Name="TopModule" Val="ppu_pceg_tb"/> <Option Name="TopLib" Val="xil_defaultlib"/> <Option Name="TransportPathDelay" Val="0"/> <Option Name="TransportIntDelay" Val="0"/> <Option Name="SelectedSimModel" Val="rtl"/> - <Option Name="SimMode" Val="post-synthesis"/> <Option Name="PamDesignTestbench" Val=""/> <Option Name="PamDutBypassFile" Val="xil_dut_bypass"/> <Option Name="PamSignalDriverFile" Val="xil_bypass_driver"/> @@ -283,6 +282,14 @@ </FileSet> <FileSet Name="utils_1" Type="Utils" 
RelSrcDir="$PSRCDIR/utils_1" RelGenDir="$PGENDIR/utils_1"> <Filter Type="Utils"/> + <File Path="$PSRCDIR/utils_1/imports/synth_1/ppu.dcp"> + <FileInfo> + <Attr Name="UsedIn" Val="synthesis"/> + <Attr Name="UsedIn" Val="implementation"/> + <Attr Name="UsedInSteps" Val="synth_1"/> + <Attr Name="AutoDcp" Val="1"/> + </FileInfo> + </File> <Config> <Option Name="TopAutoSet" Val="TRUE"/> </Config> @@ -337,7 +344,7 @@ </Simulator> </Simulators> <Runs Version="1" Minor="19"> - <Run Id="synth_1" Type="Ft3:Synth" SrcSet="sources_1" Part="xc7a35tcpg236-1" ConstrsSet="constrs_1" Description="Vivado Synthesis Defaults" AutoIncrementalCheckpoint="true" WriteIncrSynthDcp="false" State="current" Dir="$PRUNDIR/synth_1" IncludeInArchive="true" IsChild="false" AutoIncrementalDir="$PSRCDIR/utils_1/imports/synth_1" AutoRQSDir="$PSRCDIR/utils_1/imports/synth_1"> + <Run Id="synth_1" Type="Ft3:Synth" SrcSet="sources_1" Part="xc7a35tcpg236-1" ConstrsSet="constrs_1" Description="Vivado Synthesis Defaults" AutoIncrementalCheckpoint="true" IncrementalCheckpoint="$PSRCDIR/utils_1/imports/synth_1/ppu.dcp" WriteIncrSynthDcp="false" State="current" Dir="$PRUNDIR/synth_1" IncludeInArchive="true" IsChild="false" AutoIncrementalDir="$PSRCDIR/utils_1/imports/synth_1" AutoRQSDir="$PSRCDIR/utils_1/imports/synth_1"> <Strategy Version="1" Minor="2"> <StratHandle Name="Vivado Synthesis Defaults" Flow="Vivado Synthesis 2022"/> <Step Id="synth_design"/> @@ -359,9 +366,7 @@ </Run> <Run Id="ppu_bam_synth_1" Type="Ft3:Synth" SrcSet="ppu_bam" Part="xc7a35tcpg236-1" ConstrsSet="ppu_bam" Description="Vivado Synthesis Defaults" AutoIncrementalCheckpoint="false" WriteIncrSynthDcp="false" Dir="$PRUNDIR/ppu_bam_synth_1" IncludeInArchive="true" IsChild="false" AutoIncrementalDir="$PSRCDIR/utils_1/imports/ppu_bam_synth_1" AutoRQSDir="$PSRCDIR/utils_1/imports/ppu_bam_synth_1"> <Strategy Version="1" Minor="2"> - <StratHandle Name="Vivado Synthesis Defaults" Flow="Vivado Synthesis 2022"> - <Desc>Vivado Synthesis 
Defaults</Desc> - </StratHandle> + <StratHandle Name="Vivado Synthesis Defaults" Flow="Vivado Synthesis 2022"/> <Step Id="synth_design"/> </Strategy> <GeneratedRun Dir="$PRUNDIR" File="gen_run.xml"/> @@ -405,9 +410,7 @@ </Run> <Run Id="ppu_bam_impl_1" Type="Ft2:EntireDesign" Part="xc7a35tcpg236-1" ConstrsSet="ppu_bam" Description="Default settings for Implementation." AutoIncrementalCheckpoint="false" WriteIncrSynthDcp="false" SynthRun="ppu_bam_synth_1" IncludeInArchive="false" IsChild="false" GenFullBitstream="true" AutoIncrementalDir="$PSRCDIR/utils_1/imports/ppu_bam_impl_1" AutoRQSDir="$PSRCDIR/utils_1/imports/ppu_bam_impl_1"> <Strategy Version="1" Minor="2"> - <StratHandle Name="Vivado Implementation Defaults" Flow="Vivado Implementation 2022"> - <Desc>Default settings for Implementation.</Desc> - </StratHandle> + <StratHandle Name="Vivado Implementation Defaults" Flow="Vivado Implementation 2022"/> <Step Id="init_design"/> <Step Id="opt_design"/> <Step Id="power_opt_design"/> diff --git a/docs/architecture.md b/docs/architecture.md index 5001eed..9a77e57 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -68,6 +68,7 @@ Here's a list of features our PPU has: - 640x480 background canvas with scrolling - NO background scrolling splits - 128 total sprites on screen (NO scanline sprite limit) + - the first 16 foreground sprites have accurate background occlusion - sprites are always drawn on top of the background layer - PPU control using DMA (dual-port asynchronous RAM) - tiles can be flipped using FAM or BAM @@ -82,7 +83,9 @@ Notable differences: Since we're using VGA, we can't use custom resolutions without an upscaler/downscaler. This resolution was chosen because it's exactly half of - the lowest standard VGA resolution 640x480. + the lowest standard VGA resolution 640x480. The native resolution can't be + used due to the pipelined pixel fetch logic, which needs at least 5 clock + cycles to produce a stable color output. 
- No scanline sprite limit Unless not imposing any sprite limit makes the hardware implementation @@ -98,24 +101,24 @@ Notable differences: - Single 1024 sprite tilemap shared between foreground and background sprites The NES OAM registers contain a bit to select which tilemap to use (of two), - which effectively expands each tile's index address by one byte. Instead of + which effectively expands each tile's index address by one bit. Instead of creating the illusion of two separate memory areas for tiles, having one - large tilemap seems like a more sensible solution to indexed tiles. + large tilemap seems like a more sensible solution. - 8 total palettes, with 8 colors each - More colors is better. Increasing the total palette count is a very memory - intensive operation, while increasing the palette color count is likely slower + More colors is better. Increasing the palette color count is a very memory + intensive operation, while increasing the total amount of palettes is slower when looking up color values for each pixel on real hardware. - Sprites can be positioned partially off-screen on all screen edges using only the offset bits in the FAM register The NES has a separate PPUMASK register to control special color effects, and to shift sprites off the left and top screen edges, as the sprite offsets - count from 0. Our PPU's FAM sprite offset bits count from -15, so the sprite + count from 0. Our PPU's FAM sprite offset bits count from -16, so the sprite can shift past the top and left screen edges, as well as the standard bottom and right edges. 
-- No status line register, only V-sync and H-sync outputs are supplied back to - CPU +- No status line register, only vertical and horizontal blanking/sync outputs + are supplied back to CPU The NES status line register contains some handy lines, such as a buggy status line for reaching the max sprite count per scanline, and a status line @@ -126,7 +129,7 @@ Notable differences: - No background scrolling splits This feature allows only part of the background canvas to be scrolled, while - another portion stays still. This was used to draw HUD elements on the + another portion remains still. This was used to draw HUD elements on the background layer for displaying things like health bars or score counters. Since we are working with a higher foreground sprite limit, we'll use regular foreground sprites to display HUD elements. @@ -197,6 +200,7 @@ Important notes: the RAM in its own cache memory. The cache updates are fetched during the VBLANK time between each frame. +<!-- inaccurate and no longer needed ### Level 3 This diagram has several flaws, but a significant amount of time has already @@ -227,6 +231,24 @@ Important notes: CIDX signal based on the EN signal from the compositor. - All DATA and ADDR lines are shared between all RAM ports. WEN inputs are controlled by the address decoder. +--> + +## Pipeline stage reference + +This table describes which components use which lines during pipeline stages +1-5. The pipeline stages happen for every pixel, and are run on the system clock +(100 MHz). + +|Stage|Component|Action|To|Type| +|-|-|-|-|-| +|1|`ppu_sprite_bg`|write|BAM address|bus| +|2|`ppu_sprite_bg`|read|BAM data|bus| +|2|`ppu_sprite_fg`|write|TMM address|bus| +|3|`ppu_sprite_bg`|write|TMM address|bus| +|3|`ppu_sprite_fg`|read|TMM data|bus| +|4|`ppu_sprite_bg`|read|TMM data|bus| +|5|`ppu_pceg`|write|pixel done|flag| +|6|`ppu_pceg`|write|pixel ready|flag| ## Registers @@ -258,7 +280,7 @@ there is no address validity checking. 
discarded padding bit per word) - Pixel index order is from top-left to bottom-right in (English) reading order. -- Bits `14 downto 3` of the byte with the highest address for a given tile are +- Bits `14 downto 3` of the word with the highest address for a given tile are not used - To calculate TMM address $a$ for any given pixel $p$ of tile with index $t$, compute $a=52*t+\left\lfloor\frac{p}{5}\right\rfloor$ |