From 7d316cce9af0e724c6f95fa997cd32a680fdede7 Mon Sep 17 00:00:00 2001 From: lonkaars Date: Sat, 4 Mar 2023 14:09:08 +0100 Subject: foreground sprite optimization (untested) done --- basys3/basys3.srcs/ppu.vhd | 12 ++++-- basys3/basys3.srcs/ppu_pceg.vhd | 30 ++++++++------- basys3/basys3.srcs/ppu_pceg_tb.vhd | 25 ++++++++---- basys3/basys3.srcs/ppu_sprite_fg.vhd | 73 ++++++++++++++++++++++++++++-------- basys3/basys3.xpr | 2 +- 5 files changed, 100 insertions(+), 42 deletions(-) diff --git a/basys3/basys3.srcs/ppu.vhd b/basys3/basys3.srcs/ppu.vhd index c6dfe60..9e869d5 100644 --- a/basys3/basys3.srcs/ppu.vhd +++ b/basys3/basys3.srcs/ppu.vhd @@ -20,7 +20,8 @@ architecture Behavioral of ppu is component ppu_pceg port( -- pipeline clock edge generator CLK : in std_logic; -- system clock RESET : in std_logic; -- async reset - SPRITE : out std_logic; -- sprite info fetch + sprite pixel fetch + SPRITE_BG : out std_logic; -- sprite info fetch + sprite pixel fetch + SPRITE_FG : out std_logic; -- sprite pixel fetch DONE : out std_logic; -- last pipeline stage done READY : out std_logic); -- rgb buffer propagation ready end component; @@ -105,6 +106,7 @@ architecture Behavioral of ppu is -- inputs CLK : in std_logic; -- system clock RESET : in std_logic; -- reset internal memory and clock counters + PL_CLK : in std_logic; -- pipeline clock PL_RESET : in std_logic; -- reset pipeline clock counters OE : in std_logic; -- output enable (of CIDX) X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x @@ -167,7 +169,7 @@ architecture Behavioral of ppu is -- signals signal SYSCLK, SYSRST : std_logic; -- system clock and reset - signal PL_SPRITE, PL_DONE, PL_READY : std_logic; -- pipeline stages + signal PL_SPRITE_FG, PL_SPRITE_BG, PL_DONE, PL_READY : std_logic; -- pipeline stages signal TMM_WEN, BAM_WEN, FAM_WEN, PAL_WEN, AUX_WEN : std_logic; signal TMM_W_ADDR, TMM_R_ADDR : std_logic_vector(PPU_TMM_ADDR_WIDTH-1 downto 0); -- read/write TMM addr (dual port) 
signal BAM_W_ADDR, BAM_R_ADDR : std_logic_vector(PPU_BAM_ADDR_WIDTH-1 downto 0); -- read/write BAM addr (dual port) @@ -202,7 +204,8 @@ begin pipeline_clock_edge_generator : component ppu_pceg port map( CLK => SYSCLK, RESET => SYSRST, - SPRITE => PL_SPRITE, + SPRITE_FG => PL_SPRITE_FG, + SPRITE_BG => PL_SPRITE_BG, DONE => PL_DONE, READY => PL_READY); @@ -254,7 +257,7 @@ begin FG_FETCH => FG_FETCH); background_sprite : component ppu_sprite_bg port map( - CLK => PL_SPRITE, + CLK => PL_SPRITE_BG, RESET => SYSRST, PL_RESET => PL_READY, OE => BG_EN, @@ -274,6 +277,7 @@ begin port map( CLK => SYSCLK, RESET => SYSRST, + PL_CLK => PL_SPRITE_FG, PL_RESET => PL_READY, OE => FG_EN(FG_IDX), X => X, diff --git a/basys3/basys3.srcs/ppu_pceg.vhd b/basys3/basys3.srcs/ppu_pceg.vhd index 5d9f4d6..d53d86a 100644 --- a/basys3/basys3.srcs/ppu_pceg.vhd +++ b/basys3/basys3.srcs/ppu_pceg.vhd @@ -5,34 +5,36 @@ use work.ppu_consts.all; entity ppu_pceg is port( CLK : in std_logic; -- system clock RESET : in std_logic; -- async reset - SPRITE : out std_logic; -- sprite info fetch + sprite pixel fetch + SPRITE_BG : out std_logic; -- gated CLK for background sprite info fetch + sprite pixel fetch + SPRITE_FG : out std_logic; -- gated CLK for foreground sprite pixel fetch DONE : out std_logic; -- last pipeline stage done READY : out std_logic); -- rgb buffer propagation ready end ppu_pceg; architecture Behavioral of ppu_pceg is - type states is (PL_SPRITE, PL_DONE, PL_READY); - signal state : states := PL_SPRITE; + signal PL_SPRITE_BG, PL_SPRITE_FG, PL_DONE, PL_READY : boolean := false; begin -- output drivers - SPRITE <= CLK when RESET = '0' and state = PL_SPRITE else '0'; - DONE <= CLK when RESET = '0' and state = PL_DONE else '0'; - READY <= '1' when RESET = '0' and state = PL_READY else '0'; + SPRITE_BG <= CLK when RESET = '0' and PL_SPRITE_BG else '0'; -- passes CLK through only while the PL_SPRITE_BG flag is set + SPRITE_FG <= CLK when RESET = '0' and PL_SPRITE_FG else '0'; -- passes CLK through only while the PL_SPRITE_FG flag is set + DONE <= CLK when RESET = '0' and PL_DONE else '0'; -- passes CLK through only while the PL_DONE flag is set + READY <= '1' when RESET = '0' and PL_READY else '0'; process(CLK, RESET)
variable CLK_IDX : natural range 0 to PPU_PL_TOTAL_STAGES+1 := 0; begin if RESET = '1' then - state <= PL_SPRITE; + CLK_IDX := 0; + PL_SPRITE_BG <= false; + PL_SPRITE_FG <= false; + PL_DONE <= false; + PL_READY <= false; elsif rising_edge(CLK) then -- clock counter ranges - if CLK_IDX < 4 then - state <= PL_SPRITE; - elsif CLK_IDX < 5 then - state <= PL_DONE; - else - state <= PL_READY; - end if; + PL_SPRITE_BG <= true when CLK_IDX >= 0 and CLK_IDX <= 3 else false; + PL_SPRITE_FG <= true when CLK_IDX >= 1 and CLK_IDX <= 2 else false; + PL_DONE <= true when CLK_IDX = 4 else false; + PL_READY <= true when CLK_IDX >= 5 else false; -- increment clock counter CLK_IDX := CLK_IDX + 1; diff --git a/basys3/basys3.srcs/ppu_pceg_tb.vhd b/basys3/basys3.srcs/ppu_pceg_tb.vhd index 86061a0..1c2c855 100644 --- a/basys3/basys3.srcs/ppu_pceg_tb.vhd +++ b/basys3/basys3.srcs/ppu_pceg_tb.vhd @@ -12,13 +12,15 @@ architecture behavioral of ppu_pceg_tb is component ppu_pceg port( CLK : in std_logic; -- system clock RESET : in std_logic; -- async reset - SPRITE : out std_logic; -- sprite info fetch + sprite pixel fetch + SPRITE_BG : out std_logic; -- sprite info fetch + sprite pixel fetch + SPRITE_FG : out std_logic; -- sprite pixel fetch DONE : out std_logic; -- last pipeline stage done READY : out std_logic); -- rgb buffer propagation ready end component; signal CLK : std_logic := '0'; signal RESET : std_logic := '0'; - signal SPRITE : std_logic; + signal SPRITE_BG : std_logic; + signal SPRITE_FG : std_logic; signal DONE : std_logic; signal READY : std_logic; @@ -26,17 +28,14 @@ begin uut : ppu_pceg port map( CLK => CLK, RESET => RESET, - SPRITE => SPRITE, + SPRITE_BG => SPRITE_BG, + SPRITE_FG => SPRITE_FG, DONE => DONE, READY => READY); tb : process begin for i in 0 to 32 loop - if i > 20 then - RESET <= '1'; - end if; - wait for 5 ns; CLK <= '1'; wait for 5 ns; @@ -44,4 +43,16 @@ begin end loop; wait; -- stop for simulator end process; + + gert : process + begin + RESET <= '1'; + wait 
for 1 ns; + RESET <= '0'; + wait for 100 ns; + RESET <= '1'; + wait for 5 ns; + RESET <= '0'; + wait; + end process; end; diff --git a/basys3/basys3.srcs/ppu_sprite_fg.vhd b/basys3/basys3.srcs/ppu_sprite_fg.vhd index dd315d8..3b4d2c6 100644 --- a/basys3/basys3.srcs/ppu_sprite_fg.vhd +++ b/basys3/basys3.srcs/ppu_sprite_fg.vhd @@ -14,6 +14,7 @@ entity ppu_sprite_fg is -- foreground sprite -- inputs CLK : in std_logic; -- system clock RESET : in std_logic; -- reset internal memory and clock counters + PL_CLK : in std_logic; -- pipeline clock PL_RESET : in std_logic; -- reset pipeline clock counters OE : in std_logic; -- output enable (of CIDX) X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x @@ -58,9 +59,9 @@ architecture Behavioral of ppu_sprite_fg is REG : out std_logic_vector((ADDR_RANGE*DATA_W)-1 downto 0)); -- exposed register output end component; - -- FAM and TMM in/out lines - signal T_TMM_ADDR : std_logic_vector(PPU_TMM_ADDR_WIDTH-1 downto 0) := (others => '0'); - signal T_TMM_DATA : std_logic_vector(PPU_TMM_DATA_WIDTH-1 downto 0) := (others => '0'); + -- TMM in/out temp + registers + signal T_TMM_ADDR, R_TMM_ADDR : std_logic_vector(PPU_TMM_ADDR_WIDTH-1 downto 0) := (others => '0'); + signal T_TMM_DATA, R_TMM_DATA : std_logic_vector(PPU_TMM_DATA_WIDTH-1 downto 0) := (others => '0'); -- auxiliary signals (temp variables) signal T_CIDX : std_logic_vector(PPU_PALETTE_CIDX_WIDTH-1 downto 0) := (others => '0'); -- output color buffer/register @@ -76,11 +77,14 @@ architecture Behavioral of ppu_sprite_fg is signal SPRITE_ACTIVE : std_logic := '0'; -- is pixel in bounding box of sprite signal PIXEL_ABS_X, PIXEL_ABS_Y : integer := 0; -- absolute pixel position (relative to FG canvas instead of viewport) + signal PIXEL_BIT_OFFSET : integer := 0; -- pixel index within word of TMM signal TILE_PIDX_X, TRANS_TILE_PIDX_X : unsigned(PPU_SPRITE_POS_H_WIDTH-1 downto 0) := (others => '0'); -- xy position of pixel within tile (local tile coords) 
signal TILE_PIDX_Y, TRANS_TILE_PIDX_Y : unsigned(PPU_SPRITE_POS_V_WIDTH-1 downto 0) := (others => '0'); -- xy position of pixel within tile (local tile coords) - signal TRANS_TILE_PIXEL_IDX : integer := 0; -- index of pixel within tile (reading order) + signal TRANS_TILE_PIDX : integer := 0; -- index of pixel within tile (reading order) + signal TILEMAP_WORD : unsigned(PPU_TMM_ADDR_WIDTH-1 downto 0) := (others => '0'); signal TILEMAP_WORD_OFFSET : integer := 0; -- word offset from tile start address in TMM signal TMM_DATA_PAL_IDX : std_logic_vector(PPU_PALETTE_COLOR_WIDTH-1 downto 0); -- color of palette + begin -- FAM memory FAM : component er_ram @@ -97,11 +101,11 @@ begin DATA => FAM_DATA, REG => INT_FAM); - -- output drivers - CIDX <= T_CIDX when OE = '1' else (others => 'Z'); -- CIDX combination T_CIDX <= FAM_REG_COL_IDX & TMM_DATA_PAL_IDX; - + -- output drivers + CIDX <= T_CIDX when OE = '1' else (others => 'Z'); + -- TMM memory T_TMM_DATA <= TMM_DATA; -- pixel position within bounding box of sprite @@ -124,16 +128,51 @@ begin YO => TRANS_TILE_PIDX_Y); -- pixel index - TRANS_TILE_PIXEL_IDX <= integer(PPU_SPRITE_WIDTH) * to_integer(TRANS_TILE_PIDX_Y) + to_integer(TRANS_TILE_PIDX_X); - -- if pixel in sprite hitbox and TMM_DATA_PAL_IDX > 0 - HIT <= SPRITE_ACTIVE and (nor TMM_DATA_PAL_IDX); + TRANS_TILE_PIDX <= integer(PPU_SPRITE_WIDTH) * to_integer(TRANS_TILE_PIDX_Y) + to_integer(TRANS_TILE_PIDX_X); + TILEMAP_WORD <= resize(unsigned(FAM_REG_TILE_IDX) * PPU_SPRITE_WORD_COUNT, TILEMAP_WORD'length); -- TMM sprite starting word + TILEMAP_WORD_OFFSET <= TRANS_TILE_PIDX / PPU_PIXELS_PER_TILE_WORD; -- word offset from starting word of sprite + PIXEL_BIT_OFFSET <= TRANS_TILE_PIDX mod PPU_PIXELS_PER_TILE_WORD; -- pixel bit offset inaccurate_occlusion_shims: if IDX >= PPU_ACCURATE_FG_SPRITE_COUNT generate + -- state machine for synchronizing pipeline stages + type states is (PL_TMM_ADDR, PL_TMM_DATA); + signal state : states := PL_TMM_ADDR; begin - -- palette color at 
pixel - TMM_DATA_PAL_IDX <= (others => '0'); - - TMM_ADDR <= (others => 'Z'); + HIT <= SPRITE_ACTIVE; + -- only fetch if OE is high, and during the second pipeline stage + TMM_ADDR <= R_TMM_ADDR when OE = '1' and state = PL_TMM_ADDR else (others => 'Z'); -- NOTE(review): the comment above says second stage, but PL_TMM_ADDR is the reset state; confirm which stage should drive the bus + T_TMM_ADDR <= std_logic_vector(TILEMAP_WORD + to_unsigned(TILEMAP_WORD_OFFSET, PPU_TMM_ADDR_WIDTH)); -- TMM address + + -- TMM DATA + with PIXEL_BIT_OFFSET select + TMM_DATA_PAL_IDX <= R_TMM_DATA(2 downto 0) when 0, + R_TMM_DATA(5 downto 3) when 1, + R_TMM_DATA(8 downto 6) when 2, + R_TMM_DATA(11 downto 9) when 3, + R_TMM_DATA(14 downto 12) when 4, + (others => '0') when others; + + process(PL_CLK, RESET, PL_RESET) + begin + if RESET = '1' or PL_RESET = '1' then + -- reset state + state <= PL_TMM_ADDR; + if RESET = '1' then + -- reset internal pipeline registers + R_TMM_ADDR <= (others => '0'); + R_TMM_DATA <= (others => '0'); + end if; + elsif rising_edge(PL_CLK) then -- clocked by PL_CLK, the clock named in the sensitivity list + case state is + when PL_TMM_ADDR => -- first edge: register the computed TMM address + state <= PL_TMM_DATA; + R_TMM_ADDR <= T_TMM_ADDR; + when PL_TMM_DATA => -- second edge: register the TMM data word + state <= PL_TMM_ADDR; + R_TMM_DATA <= T_TMM_DATA; + end case; + end if; + end process; end generate; accurate_occlusion_logic: if IDX < PPU_ACCURATE_FG_SPRITE_COUNT generate @@ -143,8 +182,10 @@ begin signal TMM_CACHE_ADDR : std_logic_vector(PPU_TMM_ADDR_WIDTH-1 downto 0) := (others => '0'); signal TMM_CACHE : std_logic_vector((PPU_SPRITE_WORD_COUNT * PPU_TMM_DATA_WIDTH)-1 downto 0); begin + HIT <= SPRITE_ACTIVE and (or TMM_DATA_PAL_IDX); -- hit only when the pixel is inside the sprite and its palette index is non-zero + -- palette color at pixel - TMM_DATA_PAL_IDX <= TMM_CACHE(TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH) + integer(PPU_PALETTE_COLOR_WIDTH)-1 downto TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH)); + TMM_DATA_PAL_IDX <= TMM_CACHE(TRANS_TILE_PIDX * integer(PPU_PALETTE_COLOR_WIDTH) + integer(PPU_PALETTE_COLOR_WIDTH)-1 downto TRANS_TILE_PIDX * integer(PPU_PALETTE_COLOR_WIDTH)); TMM_ADDR <= T_TMM_ADDR when TMM_CACHE_UPDATE_TURN else (others => 'Z'); @@ -186,7 +227,7 @@ begin TMM_FETCH_CTR
< (TMM_FETCH_CLK_RANGE_BEGIN + PPU_TMM_CACHE_FETCH_C_COUNT) then TMM_CACHE_UPDATE_TURN <= '1'; if TMM_FETCH_CTR_REL < PPU_TMM_CACHE_FETCH_C_COUNT - 1 then -- calculate address until second to last clock - T_TMM_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR - IDX, T_TMM_ADDR'length)); + T_TMM_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR - IDX, T_TMM_ADDR'length)); -- -IDX to correct for each fetch cycle taking 1 extra clock cycle TMM_CACHE_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR_REL - 1, TMM_CACHE_ADDR'length)); end if; diff --git a/basys3/basys3.xpr b/basys3/basys3.xpr index 22b1d66..a253b15 100644 --- a/basys3/basys3.xpr +++ b/basys3/basys3.xpr @@ -61,7 +61,7 @@