diff options
| -rw-r--r-- | basys3/basys3.srcs/ppu.vhd | 23 | ||||
| -rw-r--r-- | basys3/basys3.srcs/ppu_consts.vhd | 2 | ||||
| -rw-r--r-- | basys3/basys3.srcs/ppu_pceg.vhd | 18 | ||||
| -rw-r--r-- | basys3/basys3.srcs/ppu_pceg_tb.vhd | 10 | ||||
| -rw-r--r-- | basys3/basys3.srcs/ppu_sprite_bg.vhd | 17 | ||||
| -rw-r--r-- | basys3/basys3.srcs/ppu_sprite_fg.vhd | 148 | ||||
| -rw-r--r-- | basys3/basys3.xpr | 23 | ||||
| -rw-r--r-- | docs/architecture.md | 42 | 
8 files changed, 163 insertions, 120 deletions
| diff --git a/basys3/basys3.srcs/ppu.vhd b/basys3/basys3.srcs/ppu.vhd index 2425edc..c6dfe60 100644 --- a/basys3/basys3.srcs/ppu.vhd +++ b/basys3/basys3.srcs/ppu.vhd @@ -2,7 +2,6 @@ library ieee;  library work;  use ieee.std_logic_1164.all; ---use ieee.numeric_std.all;  use work.ppu_consts.all;  entity ppu is port( @@ -22,8 +21,8 @@ architecture Behavioral of ppu is  		CLK : in std_logic; -- system clock  		RESET : in std_logic; -- async reset  		SPRITE : out std_logic; -- sprite info fetch + sprite pixel fetch -		COMP_PAL : out std_logic; -- compositor + palette lookup -		DONE : out std_logic); -- last pipeline stage done +		DONE : out std_logic; -- last pipeline stage done +		READY : out std_logic); -- rgb buffer propagation ready  	end component;  	component ppu_addr_dec port( -- address decoder  		WEN : in std_logic; -- EXT write enable @@ -81,6 +80,7 @@ architecture Behavioral of ppu is  		-- inputs  		CLK : in std_logic; -- pipeline clock  		RESET : in std_logic; -- reset clock counter +		PL_RESET : in std_logic; -- reset pipeline clock counters  		OE : in std_logic; -- output enable (of CIDX)  		X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x  		Y : in std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y @@ -105,6 +105,7 @@ architecture Behavioral of ppu is  			-- inputs  			CLK : in std_logic; -- system clock  			RESET : in std_logic; -- reset internal memory and clock counters +			PL_RESET : in std_logic; -- reset pipeline clock counters  			OE : in std_logic; -- output enable (of CIDX)  			X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x  			Y : in std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y @@ -166,7 +167,7 @@ architecture Behavioral of ppu is  	-- signals  	signal SYSCLK, SYSRST : std_logic; -- system clock and reset -	signal PL_SPRITE, PL_COMP_PAL, PL_DONE : std_logic; -- pipeline stages +	signal PL_SPRITE, PL_DONE, PL_READY : std_logic; -- 
pipeline stages  	signal TMM_WEN, BAM_WEN, FAM_WEN, PAL_WEN, AUX_WEN : std_logic;  	signal TMM_W_ADDR, TMM_R_ADDR : std_logic_vector(PPU_TMM_ADDR_WIDTH-1 downto 0); -- read/write TMM addr (dual port)  	signal BAM_W_ADDR, BAM_R_ADDR : std_logic_vector(PPU_BAM_ADDR_WIDTH-1 downto 0); -- read/write BAM addr (dual port) @@ -181,7 +182,7 @@ architecture Behavioral of ppu is  	signal X : std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x  	signal Y : std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y  	signal UR,UG,UB : std_logic_vector(PPU_COLOR_OUTPUT_DEPTH-1 downto 0); -- unstable RGB (to be buffered) -	signal SR,SG,SB : std_logic_vector(PPU_COLOR_OUTPUT_DEPTH-1 downto 0); -- stable RGB (buffered until PL_COMP_PAL) +	signal SR,SG,SB : std_logic_vector(PPU_COLOR_OUTPUT_DEPTH-1 downto 0); -- stable RGB (buffered until PL_DONE)  	signal BG_SHIFT_X : std_logic_vector(PPU_POS_H_WIDTH-1 downto 0);  	signal BG_SHIFT_Y : std_logic_vector(PPU_POS_V_WIDTH-1 downto 0);  	signal FG_FETCH : std_logic; @@ -202,8 +203,8 @@ begin  		CLK => SYSCLK,  		RESET => SYSRST,  		SPRITE => PL_SPRITE, -		COMP_PAL => PL_COMP_PAL, -		DONE => PL_DONE); +		DONE => PL_DONE, +		READY => PL_READY);  	address_decoder : component ppu_addr_dec port map(  		WEN => WEN, @@ -255,6 +256,7 @@ begin  	background_sprite : component ppu_sprite_bg port map(  		CLK => PL_SPRITE,  		RESET => SYSRST, +		PL_RESET => PL_READY,  		OE => BG_EN,  		X => X,  		Y => Y, @@ -272,6 +274,7 @@ begin  			port map(  				CLK => SYSCLK,  				RESET => SYSRST, +				PL_RESET => PL_READY,  				OE => FG_EN(FG_IDX),  				X => X,  				Y => Y, @@ -303,13 +306,13 @@ begin  		B => UB);  	-- palette lookup output buffer (pipeline stage 5) -	process(PL_COMP_PAL, SYSRST) +	process(PL_DONE, SYSRST)  	begin  		if SYSRST = '1' then  			SR <= x"0";  			SG <= x"0";  			SB <= x"0"; -		elsif rising_edge(PL_COMP_PAL) then +		elsif rising_edge(PL_DONE) then  			SR <= UR;  			SG <= UG;  			SB <= UB; @@ -331,7 
+334,7 @@ begin  		RESET => SYSRST,  		X => X,  		Y => Y, -		PREADY => PL_DONE, +		PREADY => PL_READY,  		RI => SR,  		GI => SG,  		BI => SB, diff --git a/basys3/basys3.srcs/ppu_consts.vhd b/basys3/basys3.srcs/ppu_consts.vhd index 75b6168..c7786c4 100644 --- a/basys3/basys3.srcs/ppu_consts.vhd +++ b/basys3/basys3.srcs/ppu_consts.vhd @@ -44,6 +44,8 @@ package ppu_consts is  	constant PPU_TMM_CACHE_FETCH_C_COUNT : natural := PPU_SPRITE_WORD_COUNT + 1;  	constant PPU_TMM_CACHE_FETCH_A_COUNT : natural := PPU_TMM_CACHE_FETCH_C_COUNT * PPU_FG_SPRITE_COUNT; -- amount of clocks to fetch new TMM cache  	constant PPU_TMM_CACHE_FETCH_A_WIDTH : natural := ceil_log2(PPU_TMM_CACHE_FETCH_A_COUNT); +	constant PPU_ACCURATE_FG_SPRITE_COUNT : natural := 16; +	constant PPU_PL_TOTAL_STAGES : natural := 14;  end package ppu_consts;  package body ppu_consts is  	-- https://stackoverflow.com/questions/21783280/number-of-bits-to-represent-an-integer-in-vhdl diff --git a/basys3/basys3.srcs/ppu_pceg.vhd b/basys3/basys3.srcs/ppu_pceg.vhd index 1aaeee4..5d9f4d6 100644 --- a/basys3/basys3.srcs/ppu_pceg.vhd +++ b/basys3/basys3.srcs/ppu_pceg.vhd @@ -1,25 +1,23 @@  library ieee;  use ieee.std_logic_1164.all; ---use ieee.numeric_std.all; +use work.ppu_consts.all;  entity ppu_pceg is port(  	CLK : in std_logic; -- system clock  	RESET : in std_logic; -- async reset  	SPRITE : out std_logic; -- sprite info fetch + sprite pixel fetch -	COMP_PAL : out std_logic; -- compositor + palette lookup -	DONE : out std_logic); -- last pipeline stage done +	DONE : out std_logic; -- last pipeline stage done +	READY : out std_logic); -- rgb buffer propagation ready  end ppu_pceg;  architecture Behavioral of ppu_pceg is -	constant PPU_PL_TOTAL_STAGES : natural := 14; - -	type states is (PL_SPRITE, PL_COMP_PAL, PL_DONE); +	type states is (PL_SPRITE, PL_DONE, PL_READY);  	signal state : states := PL_SPRITE;  begin  	-- output drivers  	SPRITE <= CLK when RESET = '0' and state = PL_SPRITE else '0'; -	COMP_PAL <= CLK 
when RESET = '0' and state = PL_COMP_PAL else '0'; -	DONE <= '1' when RESET = '0' and state = PL_DONE else '0'; +	DONE <= CLK when RESET = '0' and state = PL_DONE else '0'; +	READY <= '1' when RESET = '0' and state = PL_READY else '0';  	process(CLK, RESET)  		variable CLK_IDX : natural range 0 to PPU_PL_TOTAL_STAGES+1 := 0; @@ -31,9 +29,9 @@ begin  			if CLK_IDX < 4 then  				state <= PL_SPRITE;  			elsif CLK_IDX < 5 then -				state <= PL_COMP_PAL; -			else  				state <= PL_DONE; +			else +				state <= PL_READY;  			end if;  			-- increment clock counter diff --git a/basys3/basys3.srcs/ppu_pceg_tb.vhd b/basys3/basys3.srcs/ppu_pceg_tb.vhd index 719ec06..86061a0 100644 --- a/basys3/basys3.srcs/ppu_pceg_tb.vhd +++ b/basys3/basys3.srcs/ppu_pceg_tb.vhd @@ -13,22 +13,22 @@ architecture behavioral of ppu_pceg_tb is  		CLK : in std_logic; -- system clock  		RESET : in std_logic; -- async reset  		SPRITE : out std_logic; -- sprite info fetch + sprite pixel fetch -		COMP_PAL : out std_logic; -- compositor + palette lookup -		DONE : out std_logic); -- last pipeline stage done +		DONE : out std_logic; -- last pipeline stage done +		READY : out std_logic); -- rgb buffer propagation ready  	end component;  	signal CLK : std_logic := '0';  	signal RESET : std_logic := '0';  	signal SPRITE : std_logic; -	signal COMP_PAL : std_logic;  	signal DONE : std_logic; +	signal READY : std_logic;  begin  	uut : ppu_pceg port map(  		CLK => CLK,  		RESET => RESET,  		SPRITE => SPRITE, -		COMP_PAL => COMP_PAL, -		DONE => DONE); +		DONE => DONE, +		READY => READY);  	tb : process  	begin diff --git a/basys3/basys3.srcs/ppu_sprite_bg.vhd b/basys3/basys3.srcs/ppu_sprite_bg.vhd index dba5b8e..1892694 100644 --- a/basys3/basys3.srcs/ppu_sprite_bg.vhd +++ b/basys3/basys3.srcs/ppu_sprite_bg.vhd @@ -11,6 +11,7 @@ entity ppu_sprite_bg is port(  	-- inputs  	CLK : in std_logic; -- pipeline clock  	RESET : in std_logic; -- reset clock counter +	PL_RESET : in std_logic; -- reset pipeline clock counters  
	OE : in std_logic; -- output enable (of CIDX)  	X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x  	Y : in std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y @@ -108,16 +109,18 @@ begin  		                    (others => '0') when others;  	-- state machine (pipeline stage counter) + sync r/w -	process(CLK, RESET) +	process(CLK, RESET, PL_RESET)  	begin -		if RESET = '1' then +		if RESET = '1' or PL_RESET = '1' then  			-- reset state  			state <= PL_BAM_ADDR; -			-- reset internal pipeline registers -			R_BAM_ADDR <= (others => '0'); -			R_BAM_DATA <= (others => '0'); -			R_TMM_ADDR <= (others => '0'); -			R_TMM_DATA <= (others => '0'); +			if RESET = '1' then +				-- reset internal pipeline registers +				R_BAM_ADDR <= (others => '0'); +				R_BAM_DATA <= (others => '0'); +				R_TMM_ADDR <= (others => '0'); +				R_TMM_DATA <= (others => '0'); +			end if;  		elsif rising_edge(CLK) then  			case state is  				when PL_BAM_ADDR => diff --git a/basys3/basys3.srcs/ppu_sprite_fg.vhd b/basys3/basys3.srcs/ppu_sprite_fg.vhd index af7cfa3..dd315d8 100644 --- a/basys3/basys3.srcs/ppu_sprite_fg.vhd +++ b/basys3/basys3.srcs/ppu_sprite_fg.vhd @@ -14,6 +14,7 @@ entity ppu_sprite_fg is -- foreground sprite  		-- inputs  		CLK : in std_logic; -- system clock  		RESET : in std_logic; -- reset internal memory and clock counters +		PL_RESET : in std_logic; -- reset pipeline clock counters  		OE : in std_logic; -- output enable (of CIDX)  		X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x  		Y : in std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y @@ -80,18 +81,7 @@ architecture Behavioral of ppu_sprite_fg is  	signal TRANS_TILE_PIXEL_IDX : integer := 0; -- index of pixel within tile (reading order)  	signal TILEMAP_WORD_OFFSET : integer := 0; -- word offset from tile start address in TMM  	signal TMM_DATA_PAL_IDX : std_logic_vector(PPU_PALETTE_COLOR_WIDTH-1 downto 0); -- color of 
palette - -	-- TMM cache lines -	signal TMM_CACHE_WEN, TMM_CACHE_UPDATE_TURN : std_logic := '0'; -	signal TMM_CACHE_DATA : std_logic_vector(PPU_TMM_DATA_WIDTH-1 downto 0) := (others => '0'); -	signal TMM_CACHE_ADDR : std_logic_vector(PPU_TMM_ADDR_WIDTH-1 downto 0) := (others => '0'); -	signal TMM_CACHE : std_logic_vector((PPU_SPRITE_WORD_COUNT * PPU_TMM_DATA_WIDTH)-1 downto 0);  begin -	-- output drivers -	CIDX <= T_CIDX when OE = '1' else (others => 'Z'); -	-- CIDX combination -	T_CIDX <= FAM_REG_COL_IDX & TMM_DATA_PAL_IDX; -  	-- FAM memory  	FAM : component er_ram  		generic map( @@ -107,11 +97,18 @@ begin  			DATA => FAM_DATA,  			REG => INT_FAM); +	-- output drivers +	CIDX <= T_CIDX when OE = '1' else (others => 'Z'); +	-- CIDX combination +	T_CIDX <= FAM_REG_COL_IDX & TMM_DATA_PAL_IDX; + +	T_TMM_DATA <= TMM_DATA; +  	-- pixel position within bounding box of sprite  	SPRITE_ACTIVE <= '1' when ((unsigned(X) + 16) >= unsigned(FAM_REG_POS_H)) and -	                          ((unsigned(X) + 16) < (unsigned(FAM_REG_POS_H) + to_unsigned(PPU_SPRITE_WIDTH, PPU_POS_H_WIDTH))) and -	                          ((unsigned(Y) + 16) >= unsigned(FAM_REG_POS_V)) and -	                          ((unsigned(Y) + 16) < (unsigned(FAM_REG_POS_V) + to_unsigned(PPU_SPRITE_HEIGHT, PPU_POS_V_WIDTH))) else '0'; +														((unsigned(X) + 16) < (unsigned(FAM_REG_POS_H) + to_unsigned(PPU_SPRITE_WIDTH, PPU_POS_H_WIDTH))) and +														((unsigned(Y) + 16) >= unsigned(FAM_REG_POS_V)) and +														((unsigned(Y) + 16) < (unsigned(FAM_REG_POS_V) + to_unsigned(PPU_SPRITE_HEIGHT, PPU_POS_V_WIDTH))) else '0';  	-- (sprite local) pixel coords  	TILE_PIDX_X <= resize(unsigned(X) + 16 - resize(unsigned(FAM_REG_POS_H), TILE_PIDX_X'length), TILE_PIDX_X'length); @@ -128,65 +125,80 @@ begin  	-- pixel index  	TRANS_TILE_PIXEL_IDX <= integer(PPU_SPRITE_WIDTH) * to_integer(TRANS_TILE_PIDX_Y) + to_integer(TRANS_TILE_PIDX_X); -	-- palette color at pixel -	TMM_DATA_PAL_IDX <= 
TMM_CACHE(TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH) + integer(PPU_PALETTE_COLOR_WIDTH)-1 downto TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH));  	-- if pixel in sprite hitbox and TMM_DATA_PAL_IDX > 0  	HIT <= SPRITE_ACTIVE and (nor TMM_DATA_PAL_IDX); -	-- FETCH LOGIC BELOW -	TMM_ADDR <= T_TMM_ADDR when TMM_CACHE_UPDATE_TURN else (others => 'Z'); -	T_TMM_DATA <= TMM_DATA; - -	-- TTM cache -	ttm_cache : component er_ram -		generic map( -			ADDR_W => PPU_TMM_ADDR_WIDTH, -			DATA_W => PPU_TMM_DATA_WIDTH, -			ADDR_LOW => 0, -			ADDR_RANGE => PPU_SPRITE_WORD_COUNT) -		port map( -			CLK => CLK, -			RST => RESET, -			WEN => TMM_CACHE_WEN, -			ADDR => TMM_CACHE_ADDR, -			DATA => TMM_CACHE_DATA, -			REG => TMM_CACHE); - -	-- fetch machine, should do the following (offset data read by one clock -> propagation/lookup delay): -	-- CLK[53 * IDX + 0] (addr = 0) -	-- CLK[53 * IDX + 1] (addr = 1, read data[0]) -	-- CLK[53 * IDX + 2] (addr = 2, read data[1]), etc -	-- a full tile is 52 words, but since the offset is 1 clock, a total copy takes 53 clock cycles -	process(CLK, RESET, FETCH) -		constant TMM_FETCH_CLK_RANGE_BEGIN : natural := PPU_TMM_CACHE_FETCH_C_COUNT * IDX; -- fetch CLK count for copying this module's sprite from TMM -		variable TMM_FETCH_CTR : unsigned(PPU_TMM_CACHE_FETCH_A_WIDTH-1 downto 0) := (others => '0'); -- CLK counter while FETCH=1 -		variable TMM_FETCH_CTR_REL : unsigned(PPU_TMM_CACHE_FETCH_A_WIDTH-1 downto 0) := (others => '0'); -- CLK counter relative for sprite[IDX] +	inaccurate_occlusion_shims: if IDX >= PPU_ACCURATE_FG_SPRITE_COUNT generate  	begin -		if RESET = '1' or FETCH = '0' then -			TMM_FETCH_CTR := (others => '0'); -			TMM_FETCH_CTR_REL := (others => '0'); -			TMM_CACHE_WEN <= '0'; -			TMM_CACHE_UPDATE_TURN <= '0'; -		elsif rising_edge(CLK) then -			TMM_FETCH_CTR := TMM_FETCH_CTR + 1; -			TMM_FETCH_CTR_REL := TMM_FETCH_CTR - TMM_FETCH_CLK_RANGE_BEGIN; - -			if TMM_FETCH_CTR >= TMM_FETCH_CLK_RANGE_BEGIN and -			   
TMM_FETCH_CTR < (TMM_FETCH_CLK_RANGE_BEGIN + PPU_TMM_CACHE_FETCH_C_COUNT) then -				TMM_CACHE_UPDATE_TURN <= '1'; -				if TMM_FETCH_CTR_REL < PPU_TMM_CACHE_FETCH_C_COUNT - 1 then -- calculate address until second to last clock -					T_TMM_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR - IDX, T_TMM_ADDR'length)); -					TMM_CACHE_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR_REL - 1, TMM_CACHE_ADDR'length)); -				end if; - -				if TMM_FETCH_CTR_REL > 0 then -- read offset -					TMM_CACHE_DATA <= T_TMM_DATA; -					TMM_CACHE_WEN <= '1'; -				end if; -			else +		-- palette color at pixel +		TMM_DATA_PAL_IDX <= (others => '0'); + +		TMM_ADDR <= (others => 'Z'); +	end generate; + +	accurate_occlusion_logic: if IDX < PPU_ACCURATE_FG_SPRITE_COUNT generate +		-- TMM cache lines +		signal TMM_CACHE_WEN, TMM_CACHE_UPDATE_TURN : std_logic := '0'; +		signal TMM_CACHE_DATA : std_logic_vector(PPU_TMM_DATA_WIDTH-1 downto 0) := (others => '0'); +		signal TMM_CACHE_ADDR : std_logic_vector(PPU_TMM_ADDR_WIDTH-1 downto 0) := (others => '0'); +		signal TMM_CACHE : std_logic_vector((PPU_SPRITE_WORD_COUNT * PPU_TMM_DATA_WIDTH)-1 downto 0); +	begin +		-- palette color at pixel +		TMM_DATA_PAL_IDX <= TMM_CACHE(TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH) + integer(PPU_PALETTE_COLOR_WIDTH)-1 downto TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH)); + +		TMM_ADDR <= T_TMM_ADDR when TMM_CACHE_UPDATE_TURN else (others => 'Z'); + +		-- TTM cache +		ttm_cache : component er_ram +			generic map( +				ADDR_W => PPU_TMM_ADDR_WIDTH, +				DATA_W => PPU_TMM_DATA_WIDTH, +				ADDR_LOW => 0, +				ADDR_RANGE => PPU_SPRITE_WORD_COUNT) +			port map( +				CLK => CLK, +				RST => RESET, +				WEN => TMM_CACHE_WEN, +				ADDR => TMM_CACHE_ADDR, +				DATA => TMM_CACHE_DATA, +				REG => TMM_CACHE); + +		-- fetch machine, should do the following (offset data read by one clock -> propagation/lookup delay): +		-- CLK[53 * IDX + 0] (addr = 0) +		-- CLK[53 * IDX + 1] (addr = 1, read data[0]) +		-- CLK[53 
* IDX + 2] (addr = 2, read data[1]), etc +		-- a full tile is 52 words, but since the offset is 1 clock, a total copy takes 53 clock cycles +		process(CLK, RESET, FETCH) +			constant TMM_FETCH_CLK_RANGE_BEGIN : natural := PPU_TMM_CACHE_FETCH_C_COUNT * IDX; -- fetch CLK count for copying this module's sprite from TMM +			variable TMM_FETCH_CTR : unsigned(PPU_TMM_CACHE_FETCH_A_WIDTH-1 downto 0) := (others => '0'); -- CLK counter while FETCH=1 +			variable TMM_FETCH_CTR_REL : unsigned(PPU_TMM_CACHE_FETCH_A_WIDTH-1 downto 0) := (others => '0'); -- CLK counter relative for sprite[IDX] +		begin +			if RESET = '1' or FETCH = '0' then +				TMM_FETCH_CTR := (others => '0'); +				TMM_FETCH_CTR_REL := (others => '0');  				TMM_CACHE_WEN <= '0';  				TMM_CACHE_UPDATE_TURN <= '0'; +			elsif rising_edge(CLK) then +				TMM_FETCH_CTR := TMM_FETCH_CTR + 1; +				TMM_FETCH_CTR_REL := TMM_FETCH_CTR - TMM_FETCH_CLK_RANGE_BEGIN; + +				if TMM_FETCH_CTR >= TMM_FETCH_CLK_RANGE_BEGIN and +					 TMM_FETCH_CTR < (TMM_FETCH_CLK_RANGE_BEGIN + PPU_TMM_CACHE_FETCH_C_COUNT) then +					TMM_CACHE_UPDATE_TURN <= '1'; +					if TMM_FETCH_CTR_REL < PPU_TMM_CACHE_FETCH_C_COUNT - 1 then -- calculate address until second to last clock +						T_TMM_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR - IDX, T_TMM_ADDR'length)); +						TMM_CACHE_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR_REL - 1, TMM_CACHE_ADDR'length)); +					end if; + +					if TMM_FETCH_CTR_REL > 0 then -- read offset +						TMM_CACHE_DATA <= T_TMM_DATA; +						TMM_CACHE_WEN <= '1'; +					end if; +				else +					TMM_CACHE_WEN <= '0'; +					TMM_CACHE_UPDATE_TURN <= '0'; +				end if;  			end if; -		end if; -	end process; +		end process; +	end generate;  end Behavioral; diff --git a/basys3/basys3.xpr b/basys3/basys3.xpr index 87ec9fe..22b1d66 100644 --- a/basys3/basys3.xpr +++ b/basys3/basys3.xpr @@ -61,7 +61,7 @@      <Option Name="IPStaticSourceDir" Val="$PIPUSERFILESDIR/ipstatic"/>      <Option Name="EnableBDX" Val="FALSE"/>      <Option 
Name="DSABoardId" Val="basys3"/> -    <Option Name="WTXSimLaunchSim" Val="118"/> +    <Option Name="WTXSimLaunchSim" Val="121"/>      <Option Name="WTModelSimLaunchSim" Val="0"/>      <Option Name="WTQuestaLaunchSim" Val="0"/>      <Option Name="WTIesLaunchSim" Val="0"/> @@ -268,12 +268,11 @@        </File>        <Config>          <Option Name="DesignMode" Val="RTL"/> -        <Option Name="TopModule" Val="ppu_addr_dec_tb"/> +        <Option Name="TopModule" Val="ppu_pceg_tb"/>          <Option Name="TopLib" Val="xil_defaultlib"/>          <Option Name="TransportPathDelay" Val="0"/>          <Option Name="TransportIntDelay" Val="0"/>          <Option Name="SelectedSimModel" Val="rtl"/> -        <Option Name="SimMode" Val="post-synthesis"/>          <Option Name="PamDesignTestbench" Val=""/>          <Option Name="PamDutBypassFile" Val="xil_dut_bypass"/>          <Option Name="PamSignalDriverFile" Val="xil_bypass_driver"/> @@ -283,6 +282,14 @@      </FileSet>      <FileSet Name="utils_1" Type="Utils" RelSrcDir="$PSRCDIR/utils_1" RelGenDir="$PGENDIR/utils_1">        <Filter Type="Utils"/> +      <File Path="$PSRCDIR/utils_1/imports/synth_1/ppu.dcp"> +        <FileInfo> +          <Attr Name="UsedIn" Val="synthesis"/> +          <Attr Name="UsedIn" Val="implementation"/> +          <Attr Name="UsedInSteps" Val="synth_1"/> +          <Attr Name="AutoDcp" Val="1"/> +        </FileInfo> +      </File>        <Config>          <Option Name="TopAutoSet" Val="TRUE"/>        </Config> @@ -337,7 +344,7 @@      </Simulator>    </Simulators>    <Runs Version="1" Minor="19"> -    <Run Id="synth_1" Type="Ft3:Synth" SrcSet="sources_1" Part="xc7a35tcpg236-1" ConstrsSet="constrs_1" Description="Vivado Synthesis Defaults" AutoIncrementalCheckpoint="true" WriteIncrSynthDcp="false" State="current" Dir="$PRUNDIR/synth_1" IncludeInArchive="true" IsChild="false" AutoIncrementalDir="$PSRCDIR/utils_1/imports/synth_1" AutoRQSDir="$PSRCDIR/utils_1/imports/synth_1"> +    <Run Id="synth_1" 
Type="Ft3:Synth" SrcSet="sources_1" Part="xc7a35tcpg236-1" ConstrsSet="constrs_1" Description="Vivado Synthesis Defaults" AutoIncrementalCheckpoint="true" IncrementalCheckpoint="$PSRCDIR/utils_1/imports/synth_1/ppu.dcp" WriteIncrSynthDcp="false" State="current" Dir="$PRUNDIR/synth_1" IncludeInArchive="true" IsChild="false" AutoIncrementalDir="$PSRCDIR/utils_1/imports/synth_1" AutoRQSDir="$PSRCDIR/utils_1/imports/synth_1">        <Strategy Version="1" Minor="2">          <StratHandle Name="Vivado Synthesis Defaults" Flow="Vivado Synthesis 2022"/>          <Step Id="synth_design"/> @@ -359,9 +366,7 @@      </Run>      <Run Id="ppu_bam_synth_1" Type="Ft3:Synth" SrcSet="ppu_bam" Part="xc7a35tcpg236-1" ConstrsSet="ppu_bam" Description="Vivado Synthesis Defaults" AutoIncrementalCheckpoint="false" WriteIncrSynthDcp="false" Dir="$PRUNDIR/ppu_bam_synth_1" IncludeInArchive="true" IsChild="false" AutoIncrementalDir="$PSRCDIR/utils_1/imports/ppu_bam_synth_1" AutoRQSDir="$PSRCDIR/utils_1/imports/ppu_bam_synth_1">        <Strategy Version="1" Minor="2"> -        <StratHandle Name="Vivado Synthesis Defaults" Flow="Vivado Synthesis 2022"> -          <Desc>Vivado Synthesis Defaults</Desc> -        </StratHandle> +        <StratHandle Name="Vivado Synthesis Defaults" Flow="Vivado Synthesis 2022"/>          <Step Id="synth_design"/>        </Strategy>        <GeneratedRun Dir="$PRUNDIR" File="gen_run.xml"/> @@ -405,9 +410,7 @@      </Run>      <Run Id="ppu_bam_impl_1" Type="Ft2:EntireDesign" Part="xc7a35tcpg236-1" ConstrsSet="ppu_bam" Description="Default settings for Implementation." 
AutoIncrementalCheckpoint="false" WriteIncrSynthDcp="false" SynthRun="ppu_bam_synth_1" IncludeInArchive="false" IsChild="false" GenFullBitstream="true" AutoIncrementalDir="$PSRCDIR/utils_1/imports/ppu_bam_impl_1" AutoRQSDir="$PSRCDIR/utils_1/imports/ppu_bam_impl_1">        <Strategy Version="1" Minor="2"> -        <StratHandle Name="Vivado Implementation Defaults" Flow="Vivado Implementation 2022"> -          <Desc>Default settings for Implementation.</Desc> -        </StratHandle> +        <StratHandle Name="Vivado Implementation Defaults" Flow="Vivado Implementation 2022"/>          <Step Id="init_design"/>          <Step Id="opt_design"/>          <Step Id="power_opt_design"/> diff --git a/docs/architecture.md b/docs/architecture.md index 5001eed..9a77e57 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -68,6 +68,7 @@ Here's a list of features our PPU has:  - 640x480 background canvas with scrolling  - NO background scrolling splits  - 128 total sprites on screen (NO scanline sprite limit) +  - the first 16 foreground sprites have accurate background occlusion  - sprites are always drawn on top of the background layer  - PPU control using DMA (dual-port asynchronous RAM)  - tiles can be flipped using FAM or BAM @@ -82,7 +83,9 @@ Notable differences:    Since we're using VGA, we can't use custom resolutions without an    upscaler/downscaler. This resolution was chosen because it's exactly half of -  the lowest standard VGA resolution 640x480. +  the lowest standard VGA resolution 640x480. The native resolution can't be +  used due to the pipelined pixel fetch logic, which needs at least 5 clock +  cycles to produce a stable color output.  
- No scanline sprite limit      Unless not imposing any sprite limit makes the hardware implementation @@ -98,24 +101,24 @@ Notable differences:  - Single 1024 sprite tilemap shared between foreground and background sprites    The NES OAM registers contain a bit to select which tilemap to use (of two), -  which effectively expands each tile's index address by one byte. Instead of +  which effectively expands each tile's index address by one bit. Instead of    creating the illusion of two separate memory areas for tiles, having one -  large tilemap seems like a more sensible solution to indexed tiles. +  large tilemap seems like a more sensible solution.  - 8 total palettes, with 8 colors each -  More colors is better. Increasing the total palette count is a very memory -  intensive operation, while increasing the palette color count is likely slower +  More colors is better. Increasing the palette color count is a very memory +  intensive operation, while increasing the total amount of palettes is slower    when looking up color values for each pixel on real hardware.  - Sprites can be positioned partially off-screen on all screen edges using only    the offset bits in the FAM register    The NES has a separate PPUMASK register to control special color effects, and    to shift sprites off the left and top screen edges, as the sprite offsets -  count from 0. Our PPU's FAM sprite offset bits count from -15, so the sprite +  count from 0. Our PPU's FAM sprite offset bits count from -16, so the sprite    can shift past the top and left screen edges, as well as the standard bottom    and right edges. 
-- No status line register, only V-sync and H-sync outputs are supplied back to -  CPU +- No status line register, only vertical and horizontal blanking/sync outputs +  are supplied back to CPU    The NES status line register contains some handy lines, such as a buggy    status line for reaching the max sprite count per scanline, and a status line @@ -126,7 +129,7 @@ Notable differences:  - No background scrolling splits    This feature allows only part of the background canvas to be scrolled, while -  another portion stays still. This was used to draw HUD elements on the +  another portion remains still. This was used to draw HUD elements on the    background layer for displaying things like health bars or score counters.    Since we are working with a higher foreground sprite limit, we'll use regular    foreground sprites to display HUD elements. @@ -197,6 +200,7 @@ Important notes:    the RAM in it's own cache memory. The cache updates are fetched during the    VBLANK time between each frame. +<!-- inaccurate and no longer needed  ### Level 3  This diagram has several flaws, but a significant amount of time has already @@ -227,6 +231,24 @@ Important notes:    CIDX signal based on the EN signal from the compositor.  - All DATA and ADDR lines are shared between all RAM ports. WEN inputs are    controlled by the address decoder. +--> + +## Pipeline stage reference + +This table describes which components use which lines during pipeline stages +1-5. The pipeline stages happen for every pixel, and are run on the system clock +(100 MHz). 
+ +|Stage|Component|Action|To|Type| +|-|-|-|-|-| +|1|`ppu_sprite_bg`|write|BAM address|bus| +|2|`ppu_sprite_bg`|read|BAM data|bus| +|2|`ppu_sprite_fg`|write|TMM address|bus| +|3|`ppu_sprite_bg`|write|TMM address|bus| +|3|`ppu_sprite_fg`|read|TMM data|bus| +|4|`ppu_sprite_bg`|read|TMM data|bus| +|5|`ppu_pceg`|write|pixel done|flag| +|6|`ppu_pceg`|write|pixel ready|flag|  ## Registers @@ -258,7 +280,7 @@ there is no address validity checking.    discarded padding bit per word)  - Pixel index order is from top-left to bottom-right in (English) reading    order. -- Bits `14 downto 3` of the byte with the highest address for a given tile are +- Bits `14 downto 3` of the word with the highest address for a given tile are    not used  - To calculate TMM address $a$ for any given pixel $p$ of tile with index $t$,    compute $a=52*t+\left\lfloor\frac{p}{5}\right\rfloor$ |