From df8902fba3a6e97ca3c5fdedb70999faac713815 Mon Sep 17 00:00:00 2001 From: lonkaars Date: Fri, 3 Mar 2023 17:30:09 +0100 Subject: WIP fg sprite optimilization --- basys3/basys3.srcs/ppu.vhd | 23 +++--- basys3/basys3.srcs/ppu_consts.vhd | 2 + basys3/basys3.srcs/ppu_pceg.vhd | 18 ++--- basys3/basys3.srcs/ppu_pceg_tb.vhd | 10 +-- basys3/basys3.srcs/ppu_sprite_bg.vhd | 17 ++-- basys3/basys3.srcs/ppu_sprite_fg.vhd | 148 +++++++++++++++++++---------------- basys3/basys3.xpr | 23 +++--- 7 files changed, 131 insertions(+), 110 deletions(-) (limited to 'basys3') diff --git a/basys3/basys3.srcs/ppu.vhd b/basys3/basys3.srcs/ppu.vhd index 2425edc..c6dfe60 100644 --- a/basys3/basys3.srcs/ppu.vhd +++ b/basys3/basys3.srcs/ppu.vhd @@ -2,7 +2,6 @@ library ieee; library work; use ieee.std_logic_1164.all; ---use ieee.numeric_std.all; use work.ppu_consts.all; entity ppu is port( @@ -22,8 +21,8 @@ architecture Behavioral of ppu is CLK : in std_logic; -- system clock RESET : in std_logic; -- async reset SPRITE : out std_logic; -- sprite info fetch + sprite pixel fetch - COMP_PAL : out std_logic; -- compositor + palette lookup - DONE : out std_logic); -- last pipeline stage done + DONE : out std_logic; -- last pipeline stage done + READY : out std_logic); -- rgb buffer propagation ready end component; component ppu_addr_dec port( -- address decoder WEN : in std_logic; -- EXT write enable @@ -81,6 +80,7 @@ architecture Behavioral of ppu is -- inputs CLK : in std_logic; -- pipeline clock RESET : in std_logic; -- reset clock counter + PL_RESET : in std_logic; -- reset pipeline clock counters OE : in std_logic; -- output enable (of CIDX) X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x Y : in std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y @@ -105,6 +105,7 @@ architecture Behavioral of ppu is -- inputs CLK : in std_logic; -- system clock RESET : in std_logic; -- reset internal memory and clock counters + PL_RESET : in std_logic; -- reset pipeline clock counters OE : in std_logic; -- output enable (of CIDX) X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x Y : in std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y @@ -166,7 +167,7 @@ architecture Behavioral of ppu is -- signals signal SYSCLK, SYSRST : std_logic; -- system clock and reset - signal PL_SPRITE, PL_COMP_PAL, PL_DONE : std_logic; -- pipeline stages + signal PL_SPRITE, PL_DONE, PL_READY : std_logic; -- pipeline stages signal TMM_WEN, BAM_WEN, FAM_WEN, PAL_WEN, AUX_WEN : std_logic; signal TMM_W_ADDR, TMM_R_ADDR : std_logic_vector(PPU_TMM_ADDR_WIDTH-1 downto 0); -- read/write TMM addr (dual port) signal BAM_W_ADDR, BAM_R_ADDR : std_logic_vector(PPU_BAM_ADDR_WIDTH-1 downto 0); -- read/write BAM addr (dual port) @@ -181,7 +182,7 @@ architecture Behavioral of ppu is signal X : std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x signal Y : std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y signal UR,UG,UB : std_logic_vector(PPU_COLOR_OUTPUT_DEPTH-1 downto 0); -- unstable RGB (to be buffered) - signal SR,SG,SB : std_logic_vector(PPU_COLOR_OUTPUT_DEPTH-1 downto 0); -- stable RGB (buffered until PL_COMP_PAL) + signal SR,SG,SB : std_logic_vector(PPU_COLOR_OUTPUT_DEPTH-1 downto 0); -- stable RGB (buffered until PL_DONE) signal BG_SHIFT_X : std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); signal BG_SHIFT_Y : std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); signal FG_FETCH : std_logic; @@ -202,8 +203,8 @@ begin CLK => SYSCLK, RESET => SYSRST, SPRITE => PL_SPRITE, - COMP_PAL => PL_COMP_PAL, - DONE => PL_DONE); + DONE => PL_DONE, + READY => PL_READY); address_decoder : component ppu_addr_dec port map( WEN => WEN, @@ -255,6 +256,7 @@ begin background_sprite : component ppu_sprite_bg port map( CLK => PL_SPRITE, RESET => SYSRST, + PL_RESET => PL_READY, OE => BG_EN, X => X, Y => Y, @@ -272,6 +274,7 @@ begin port map( CLK => SYSCLK, RESET => SYSRST, + PL_RESET => PL_READY, OE => FG_EN(FG_IDX), X => X, Y => Y, @@ -303,13 +306,13 @@ begin B => UB); -- palette lookup output buffer (pipeline stage 5) - process(PL_COMP_PAL, SYSRST) + process(PL_DONE, SYSRST) begin if SYSRST = '1' then SR <= x"0"; SG <= x"0"; SB <= x"0"; - elsif rising_edge(PL_COMP_PAL) then + elsif rising_edge(PL_DONE) then SR <= UR; SG <= UG; SB <= UB; @@ -331,7 +334,7 @@ begin RESET => SYSRST, X => X, Y => Y, - PREADY => PL_DONE, + PREADY => PL_READY, RI => SR, GI => SG, BI => SB, diff --git a/basys3/basys3.srcs/ppu_consts.vhd b/basys3/basys3.srcs/ppu_consts.vhd index 75b6168..c7786c4 100644 --- a/basys3/basys3.srcs/ppu_consts.vhd +++ b/basys3/basys3.srcs/ppu_consts.vhd @@ -44,6 +44,8 @@ package ppu_consts is constant PPU_TMM_CACHE_FETCH_C_COUNT : natural := PPU_SPRITE_WORD_COUNT + 1; constant PPU_TMM_CACHE_FETCH_A_COUNT : natural := PPU_TMM_CACHE_FETCH_C_COUNT * PPU_FG_SPRITE_COUNT; -- amount of clocks to fetch new TMM cache constant PPU_TMM_CACHE_FETCH_A_WIDTH : natural := ceil_log2(PPU_TMM_CACHE_FETCH_A_COUNT); + constant PPU_ACCURATE_FG_SPRITE_COUNT : natural := 16; + constant PPU_PL_TOTAL_STAGES : natural := 14; end package ppu_consts; package body ppu_consts is -- https://stackoverflow.com/questions/21783280/number-of-bits-to-represent-an-integer-in-vhdl diff --git a/basys3/basys3.srcs/ppu_pceg.vhd b/basys3/basys3.srcs/ppu_pceg.vhd index 1aaeee4..5d9f4d6 100644 --- a/basys3/basys3.srcs/ppu_pceg.vhd +++ b/basys3/basys3.srcs/ppu_pceg.vhd @@ -1,25 +1,23 @@ library ieee; use ieee.std_logic_1164.all; ---use ieee.numeric_std.all; +use work.ppu_consts.all; entity ppu_pceg is port( CLK : in std_logic; -- system clock RESET : in std_logic; -- async reset SPRITE : out std_logic; -- sprite info fetch + sprite pixel fetch - COMP_PAL : out std_logic; -- compositor + palette lookup - DONE : out std_logic); -- last pipeline stage done + DONE : out std_logic; -- last pipeline stage done + READY : out std_logic); -- rgb buffer propagation ready end ppu_pceg; architecture Behavioral of ppu_pceg is - constant PPU_PL_TOTAL_STAGES : natural := 14; - - type states is (PL_SPRITE, PL_COMP_PAL, PL_DONE); + type states is (PL_SPRITE, PL_DONE, PL_READY); signal state : states := PL_SPRITE; begin -- output drivers SPRITE <= CLK when RESET = '0' and state = PL_SPRITE else '0'; - COMP_PAL <= CLK when RESET = '0' and state = PL_COMP_PAL else '0'; - DONE <= '1' when RESET = '0' and state = PL_DONE else '0'; + DONE <= CLK when RESET = '0' and state = PL_DONE else '0'; + READY <= '1' when RESET = '0' and state = PL_READY else '0'; process(CLK, RESET) variable CLK_IDX : natural range 0 to PPU_PL_TOTAL_STAGES+1 := 0; @@ -31,9 +29,9 @@ begin if CLK_IDX < 4 then state <= PL_SPRITE; elsif CLK_IDX < 5 then - state <= PL_COMP_PAL; - else state <= PL_DONE; + else + state <= PL_READY; end if; -- increment clock counter diff --git a/basys3/basys3.srcs/ppu_pceg_tb.vhd b/basys3/basys3.srcs/ppu_pceg_tb.vhd index 719ec06..86061a0 100644 --- a/basys3/basys3.srcs/ppu_pceg_tb.vhd +++ b/basys3/basys3.srcs/ppu_pceg_tb.vhd @@ -13,22 +13,22 @@ architecture behavioral of ppu_pceg_tb is CLK : in std_logic; -- system clock RESET : in std_logic; -- async reset SPRITE : out std_logic; -- sprite info fetch + sprite pixel fetch - COMP_PAL : out std_logic; -- compositor + palette lookup - DONE : out std_logic); -- last pipeline stage done + DONE : out std_logic; -- last pipeline stage done + READY : out std_logic); -- rgb buffer propagation ready end component; signal CLK : std_logic := '0'; signal RESET : std_logic := '0'; signal SPRITE : std_logic; - signal COMP_PAL : std_logic; signal DONE : std_logic; + signal READY : std_logic; begin uut : ppu_pceg port map( CLK => CLK, RESET => RESET, SPRITE => SPRITE, - COMP_PAL => COMP_PAL, - DONE => DONE); + DONE => DONE, + READY => READY); tb : process begin diff --git a/basys3/basys3.srcs/ppu_sprite_bg.vhd b/basys3/basys3.srcs/ppu_sprite_bg.vhd index dba5b8e..1892694 100644 --- a/basys3/basys3.srcs/ppu_sprite_bg.vhd +++ b/basys3/basys3.srcs/ppu_sprite_bg.vhd @@ -11,6 +11,7 @@ entity ppu_sprite_bg is port( -- inputs CLK : in std_logic; -- pipeline clock RESET : in std_logic; -- reset clock counter + PL_RESET : in std_logic; -- reset pipeline clock counters OE : in std_logic; -- output enable (of CIDX) X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x Y : in std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y @@ -108,16 +109,18 @@ begin (others => '0') when others; -- state machine (pipeline stage counter) + sync r/w - process(CLK, RESET) + process(CLK, RESET, PL_RESET) begin - if RESET = '1' then + if RESET = '1' or PL_RESET = '1' then -- reset state state <= PL_BAM_ADDR; - -- reset internal pipeline registers - R_BAM_ADDR <= (others => '0'); - R_BAM_DATA <= (others => '0'); - R_TMM_ADDR <= (others => '0'); - R_TMM_DATA <= (others => '0'); + if RESET = '1' then + -- reset internal pipeline registers + R_BAM_ADDR <= (others => '0'); + R_BAM_DATA <= (others => '0'); + R_TMM_ADDR <= (others => '0'); + R_TMM_DATA <= (others => '0'); + end if; elsif rising_edge(CLK) then case state is when PL_BAM_ADDR => diff --git a/basys3/basys3.srcs/ppu_sprite_fg.vhd b/basys3/basys3.srcs/ppu_sprite_fg.vhd index af7cfa3..dd315d8 100644 --- a/basys3/basys3.srcs/ppu_sprite_fg.vhd +++ b/basys3/basys3.srcs/ppu_sprite_fg.vhd @@ -14,6 +14,7 @@ entity ppu_sprite_fg is -- foreground sprite -- inputs CLK : in std_logic; -- system clock RESET : in std_logic; -- reset internal memory and clock counters + PL_RESET : in std_logic; -- reset pipeline clock counters OE : in std_logic; -- output enable (of CIDX) X : in std_logic_vector(PPU_POS_H_WIDTH-1 downto 0); -- current screen pixel x Y : in std_logic_vector(PPU_POS_V_WIDTH-1 downto 0); -- current screen pixel y @@ -80,18 +81,7 @@ architecture Behavioral of ppu_sprite_fg is signal TRANS_TILE_PIXEL_IDX : integer := 0; -- index of pixel within tile (reading order) signal TILEMAP_WORD_OFFSET : integer := 0; -- word offset from tile start address in TMM signal TMM_DATA_PAL_IDX : std_logic_vector(PPU_PALETTE_COLOR_WIDTH-1 downto 0); -- color of palette - - -- TMM cache lines - signal TMM_CACHE_WEN, TMM_CACHE_UPDATE_TURN : std_logic := '0'; - signal TMM_CACHE_DATA : std_logic_vector(PPU_TMM_DATA_WIDTH-1 downto 0) := (others => '0'); - signal TMM_CACHE_ADDR : std_logic_vector(PPU_TMM_ADDR_WIDTH-1 downto 0) := (others => '0'); - signal TMM_CACHE : std_logic_vector((PPU_SPRITE_WORD_COUNT * PPU_TMM_DATA_WIDTH)-1 downto 0); begin - -- output drivers - CIDX <= T_CIDX when OE = '1' else (others => 'Z'); - -- CIDX combination - T_CIDX <= FAM_REG_COL_IDX & TMM_DATA_PAL_IDX; - -- FAM memory FAM : component er_ram generic map( @@ -107,11 +97,18 @@ begin DATA => FAM_DATA, REG => INT_FAM); + -- output drivers + CIDX <= T_CIDX when OE = '1' else (others => 'Z'); + -- CIDX combination + T_CIDX <= FAM_REG_COL_IDX & TMM_DATA_PAL_IDX; + + T_TMM_DATA <= TMM_DATA; + -- pixel position within bounding box of sprite SPRITE_ACTIVE <= '1' when ((unsigned(X) + 16) >= unsigned(FAM_REG_POS_H)) and - ((unsigned(X) + 16) < (unsigned(FAM_REG_POS_H) + to_unsigned(PPU_SPRITE_WIDTH, PPU_POS_H_WIDTH))) and - ((unsigned(Y) + 16) >= unsigned(FAM_REG_POS_V)) and - ((unsigned(Y) + 16) < (unsigned(FAM_REG_POS_V) + to_unsigned(PPU_SPRITE_HEIGHT, PPU_POS_V_WIDTH))) else '0'; + ((unsigned(X) + 16) < (unsigned(FAM_REG_POS_H) + to_unsigned(PPU_SPRITE_WIDTH, PPU_POS_H_WIDTH))) and + ((unsigned(Y) + 16) >= unsigned(FAM_REG_POS_V)) and + ((unsigned(Y) + 16) < (unsigned(FAM_REG_POS_V) + to_unsigned(PPU_SPRITE_HEIGHT, PPU_POS_V_WIDTH))) else '0'; -- (sprite local) pixel coords TILE_PIDX_X <= resize(unsigned(X) + 16 - resize(unsigned(FAM_REG_POS_H), TILE_PIDX_X'length), TILE_PIDX_X'length); @@ -128,65 +125,80 @@ begin -- pixel index TRANS_TILE_PIXEL_IDX <= integer(PPU_SPRITE_WIDTH) * to_integer(TRANS_TILE_PIDX_Y) + to_integer(TRANS_TILE_PIDX_X); - -- palette color at pixel - TMM_DATA_PAL_IDX <= TMM_CACHE(TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH) + integer(PPU_PALETTE_COLOR_WIDTH)-1 downto TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH)); -- if pixel in sprite hitbox and TMM_DATA_PAL_IDX > 0 HIT <= SPRITE_ACTIVE and (nor TMM_DATA_PAL_IDX); - -- FETCH LOGIC BELOW - TMM_ADDR <= T_TMM_ADDR when TMM_CACHE_UPDATE_TURN else (others => 'Z'); - T_TMM_DATA <= TMM_DATA; - - -- TTM cache - ttm_cache : component er_ram - generic map( - ADDR_W => PPU_TMM_ADDR_WIDTH, - DATA_W => PPU_TMM_DATA_WIDTH, - ADDR_LOW => 0, - ADDR_RANGE => PPU_SPRITE_WORD_COUNT) - port map( - CLK => CLK, - RST => RESET, - WEN => TMM_CACHE_WEN, - ADDR => TMM_CACHE_ADDR, - DATA => TMM_CACHE_DATA, - REG => TMM_CACHE); - - -- fetch machine, should do the following (offset data read by one clock -> propagation/lookup delay): - -- CLK[53 * IDX + 0] (addr = 0) - -- CLK[53 * IDX + 1] (addr = 1, read data[0]) - -- CLK[53 * IDX + 2] (addr = 2, read data[1]), etc - -- a full tile is 52 words, but since the offset is 1 clock, a total copy takes 53 clock cycles - process(CLK, RESET, FETCH) - constant TMM_FETCH_CLK_RANGE_BEGIN : natural := PPU_TMM_CACHE_FETCH_C_COUNT * IDX; -- fetch CLK count for copying this module's sprite from TMM - variable TMM_FETCH_CTR : unsigned(PPU_TMM_CACHE_FETCH_A_WIDTH-1 downto 0) := (others => '0'); -- CLK counter while FETCH=1 - variable TMM_FETCH_CTR_REL : unsigned(PPU_TMM_CACHE_FETCH_A_WIDTH-1 downto 0) := (others => '0'); -- CLK counter relative for sprite[IDX] + inaccurate_occlusion_shims: if IDX >= PPU_ACCURATE_FG_SPRITE_COUNT generate begin - if RESET = '1' or FETCH = '0' then - TMM_FETCH_CTR := (others => '0'); - TMM_FETCH_CTR_REL := (others => '0'); - TMM_CACHE_WEN <= '0'; - TMM_CACHE_UPDATE_TURN <= '0'; - elsif rising_edge(CLK) then - TMM_FETCH_CTR := TMM_FETCH_CTR + 1; - TMM_FETCH_CTR_REL := TMM_FETCH_CTR - TMM_FETCH_CLK_RANGE_BEGIN; - - if TMM_FETCH_CTR >= TMM_FETCH_CLK_RANGE_BEGIN and - TMM_FETCH_CTR < (TMM_FETCH_CLK_RANGE_BEGIN + PPU_TMM_CACHE_FETCH_C_COUNT) then - TMM_CACHE_UPDATE_TURN <= '1'; - if TMM_FETCH_CTR_REL < PPU_TMM_CACHE_FETCH_C_COUNT - 1 then -- calculate address until second to last clock - T_TMM_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR - IDX, T_TMM_ADDR'length)); - TMM_CACHE_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR_REL - 1, TMM_CACHE_ADDR'length)); - end if; - - if TMM_FETCH_CTR_REL > 0 then -- read offset - TMM_CACHE_DATA <= T_TMM_DATA; - TMM_CACHE_WEN <= '1'; - end if; - else + -- palette color at pixel + TMM_DATA_PAL_IDX <= (others => '0'); + + TMM_ADDR <= (others => 'Z'); + end generate; + + accurate_occlusion_logic: if IDX < PPU_ACCURATE_FG_SPRITE_COUNT generate + -- TMM cache lines + signal TMM_CACHE_WEN, TMM_CACHE_UPDATE_TURN : std_logic := '0'; + signal TMM_CACHE_DATA : std_logic_vector(PPU_TMM_DATA_WIDTH-1 downto 0) := (others => '0'); + signal TMM_CACHE_ADDR : std_logic_vector(PPU_TMM_ADDR_WIDTH-1 downto 0) := (others => '0'); + signal TMM_CACHE : std_logic_vector((PPU_SPRITE_WORD_COUNT * PPU_TMM_DATA_WIDTH)-1 downto 0); + begin + -- palette color at pixel + TMM_DATA_PAL_IDX <= TMM_CACHE(TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH) + integer(PPU_PALETTE_COLOR_WIDTH)-1 downto TRANS_TILE_PIXEL_IDX * integer(PPU_PALETTE_COLOR_WIDTH)); + + TMM_ADDR <= T_TMM_ADDR when TMM_CACHE_UPDATE_TURN else (others => 'Z'); + + -- TTM cache + ttm_cache : component er_ram + generic map( + ADDR_W => PPU_TMM_ADDR_WIDTH, + DATA_W => PPU_TMM_DATA_WIDTH, + ADDR_LOW => 0, + ADDR_RANGE => PPU_SPRITE_WORD_COUNT) + port map( + CLK => CLK, + RST => RESET, + WEN => TMM_CACHE_WEN, + ADDR => TMM_CACHE_ADDR, + DATA => TMM_CACHE_DATA, + REG => TMM_CACHE); + + -- fetch machine, should do the following (offset data read by one clock -> propagation/lookup delay): + -- CLK[53 * IDX + 0] (addr = 0) + -- CLK[53 * IDX + 1] (addr = 1, read data[0]) + -- CLK[53 * IDX + 2] (addr = 2, read data[1]), etc + -- a full tile is 52 words, but since the offset is 1 clock, a total copy takes 53 clock cycles + process(CLK, RESET, FETCH) + constant TMM_FETCH_CLK_RANGE_BEGIN : natural := PPU_TMM_CACHE_FETCH_C_COUNT * IDX; -- fetch CLK count for copying this module's sprite from TMM + variable TMM_FETCH_CTR : unsigned(PPU_TMM_CACHE_FETCH_A_WIDTH-1 downto 0) := (others => '0'); -- CLK counter while FETCH=1 + variable TMM_FETCH_CTR_REL : unsigned(PPU_TMM_CACHE_FETCH_A_WIDTH-1 downto 0) := (others => '0'); -- CLK counter relative for sprite[IDX] + begin + if RESET = '1' or FETCH = '0' then + TMM_FETCH_CTR := (others => '0'); + TMM_FETCH_CTR_REL := (others => '0'); TMM_CACHE_WEN <= '0'; TMM_CACHE_UPDATE_TURN <= '0'; + elsif rising_edge(CLK) then + TMM_FETCH_CTR := TMM_FETCH_CTR + 1; + TMM_FETCH_CTR_REL := TMM_FETCH_CTR - TMM_FETCH_CLK_RANGE_BEGIN; + + if TMM_FETCH_CTR >= TMM_FETCH_CLK_RANGE_BEGIN and + TMM_FETCH_CTR < (TMM_FETCH_CLK_RANGE_BEGIN + PPU_TMM_CACHE_FETCH_C_COUNT) then + TMM_CACHE_UPDATE_TURN <= '1'; + if TMM_FETCH_CTR_REL < PPU_TMM_CACHE_FETCH_C_COUNT - 1 then -- calculate address until second to last clock + T_TMM_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR - IDX, T_TMM_ADDR'length)); + TMM_CACHE_ADDR <= std_logic_vector(resize(TMM_FETCH_CTR_REL - 1, TMM_CACHE_ADDR'length)); + end if; + + if TMM_FETCH_CTR_REL > 0 then -- read offset + TMM_CACHE_DATA <= T_TMM_DATA; + TMM_CACHE_WEN <= '1'; + end if; + else + TMM_CACHE_WEN <= '0'; + TMM_CACHE_UPDATE_TURN <= '0'; + end if; end if; - end if; - end process; + end process; + end generate; end Behavioral; diff --git a/basys3/basys3.xpr b/basys3/basys3.xpr index 87ec9fe..22b1d66 100644 --- a/basys3/basys3.xpr +++ b/basys3/basys3.xpr @@ -61,7 +61,7 @@