From 6c831d90661030a4cef2f507bd0e2f465cf761a6 Mon Sep 17 00:00:00 2001 From: Nils Wistoff Date: Fri, 11 Aug 2023 11:13:39 +0200 Subject: [PATCH] coherence: Pack Valid/Dirty SRAM (#36) * sram: Replace by sram_pulp Does not introduce cuts and allows for parametric byte width Signed-off-by: Nils Wistoff * std_cache: Pack valid/dirty SRAM Signed-off-by: Nils Wistoff * std_cache: Decouple Valid/Dirty WE from Data WE Signed-off-by: Nils Wistoff --------- Signed-off-by: Nils Wistoff --- Bender.yml | 2 +- common/local/util/sram_pulp.sv | 90 ++++++++++++++++++++++++++++ core/Flist.cva6 | 2 +- core/cache_subsystem/cache_ctrl.sv | 13 ++-- core/cache_subsystem/miss_handler.sv | 19 +++--- core/cache_subsystem/std_nbdcache.sv | 26 +++----- core/include/std_cache_pkg.sv | 7 ++- corev_apu/tb/ariane_tb.cpp | 4 +- corev_apu/tb/ariane_tb.sv | 2 +- 9 files changed, 128 insertions(+), 37 deletions(-) create mode 100644 common/local/util/sram_pulp.sv diff --git a/Bender.yml b/Bender.yml index 774929d4ba4..c84fcf4c559 100644 --- a/Bender.yml +++ b/Bender.yml @@ -201,7 +201,7 @@ sources: - include_dirs: - common/local/util files: - - common/local/util/sram.sv + - common/local/util/sram_pulp.sv - target: not(all(fpga, xilinx)) include_dirs: diff --git a/common/local/util/sram_pulp.sv b/common/local/util/sram_pulp.sv new file mode 100644 index 00000000000..f3c4773bf43 --- /dev/null +++ b/common/local/util/sram_pulp.sv @@ -0,0 +1,90 @@ +// Copyright 2018 ETH Zurich and University of Bologna. +// Copyright and related rights are licensed under the Solderpad Hardware +// License, Version 0.51 (the "License"); you may not use this file except in +// compliance with the License. You may obtain a copy of the License at +// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +// or agreed to in writing, software, hardware and materials distributed under +// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. +// +// Author: Florian Zaruba , ETH Zurich +// Michael Schaffner , ETH Zurich +// Nils Wistoff , ETH Zurich +// Date: 15.08.2018 +// Description: generic tc_sram wrapper for CVA6 +// +// Note: the wrapped module contains two different implementations for +// ALTERA and XILINX tools, since these follow different coding styles for +// inferrable RAMS with byte enable. define `FPGA_TARGET_XILINX or +// `FPGA_TARGET_ALTERA in your build environment (default is ALTERA) + +module sram #( + parameter DATA_WIDTH = 64, + parameter BYTE_WIDTH = 8, + parameter USER_WIDTH = 1, + parameter USER_EN = 0, + parameter NUM_WORDS = 1024, + parameter SIM_INIT = "none", + parameter OUT_REGS = 0 // enables output registers in FPGA macro (read lat = 2) +)( + input logic clk_i, + input logic rst_ni, + input logic req_i, + input logic we_i, + input logic [$clog2(NUM_WORDS)-1:0] addr_i, + input logic [USER_WIDTH-1:0] wuser_i, + input logic [DATA_WIDTH-1:0] wdata_i, + input logic [(DATA_WIDTH+BYTE_WIDTH-1)/BYTE_WIDTH-1:0] be_i, + output logic [USER_WIDTH-1:0] ruser_o, + output logic [DATA_WIDTH-1:0] rdata_o +); + + tc_sram #( + .NumWords ( NUM_WORDS ), + .DataWidth ( DATA_WIDTH ), + .ByteWidth ( BYTE_WIDTH ), + .NumPorts ( 32'd1 ), + .Latency ( 32'd1 ), + .SimInit ( SIM_INIT ), + .PrintSimCfg ( 1'b0 ) + ) i_tc_sram ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .req_i ( req_i ), + .we_i ( we_i ), + .be_i ( be_i ), + .wdata_i ( wdata_i ), + .addr_i ( addr_i ), + .rdata_o ( rdata_o ) + ); + + if (USER_EN > 0) begin : gen_mem_user + tc_sram #( + .NumWords ( NUM_WORDS ), + .DataWidth ( DATA_WIDTH ), + .ByteWidth ( BYTE_WIDTH ), + .NumPorts ( 32'd1 ), + .Latency ( 32'd1 ), + .SimInit ( SIM_INIT ), + .PrintSimCfg ( 1'b0 ) + ) i_tc_sram_user ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .req_i ( req_i ), + .we_i ( we_i ), + .be_i ( be_i ), + .wdata_i ( wuser_i ), + .addr_i ( addr_i ), + .rdata_o ( ruser_o ) + ); + + if (USER_WIDTH != DATA_WIDTH) begin : gen_err_data_user_width + $fatal(1, "sram_pulp: USER_WIDTH needs to be equal to DATA_WIDTH (if USER_EN is set)."); + end + + end else begin + assign ruser_o = '0; + end + +endmodule : sram diff --git a/core/Flist.cva6 b/core/Flist.cva6 index daf4b28a673..622ae838c58 100644 --- a/core/Flist.cva6 +++ b/core/Flist.cva6 @@ -180,7 +180,7 @@ ${CVA6_REPO_DIR}/common/local/util/instr_tracer_if.sv ${CVA6_REPO_DIR}/common/local/util/instr_tracer.sv ${CVA6_REPO_DIR}/common/local/util/tc_sram_wrapper.sv ${CVA6_REPO_DIR}/vendor/pulp-platform/tech_cells_generic/src/rtl/tc_sram.sv -${CVA6_REPO_DIR}/common/local/util/sram.sv +${CVA6_REPO_DIR}/common/local/util/sram_pulp.sv // MMU Sv39 ${CVA6_REPO_DIR}/core/mmu_sv39/mmu.sv diff --git a/core/cache_subsystem/cache_ctrl.sv b/core/cache_subsystem/cache_ctrl.sv index 9c7cf28ce82..3b876c4e52b 100644 --- a/core/cache_subsystem/cache_ctrl.sv +++ b/core/cache_subsystem/cache_ctrl.sv @@ -298,14 +298,15 @@ module cache_ctrl // two memory look-ups on a single-ported SRAM and therefore is non-atomic if (!mshr_index_matches_i) begin // store data, write dirty bit - req_o = hit_way_q; - addr_o = mem_req_q.index; - we_o = 1'b1; - - be_o.vldrty = hit_way_q; + req_o = hit_way_q; + addr_o = mem_req_q.index; + we_o = 1'b1; // set the correct byte enable - be_o.data[cl_offset>>3+:8] = mem_req_q.be; + be_o.data[cl_offset>>3+:8] = mem_req_q.be; + for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin + if (hit_way_q[i]) be_o.vldrty[i] = '{valid: 1, dirty: be_o.data}; + end data_o.data[cl_offset+:64] = mem_req_q.wdata; // ~> change the state data_o.dirty[cl_offset>>3+:8] = mem_req_q.be; diff --git a/core/cache_subsystem/miss_handler.sv b/core/cache_subsystem/miss_handler.sv index cfafb50cb8a..0312510173d 100644 --- a/core/cache_subsystem/miss_handler.sv +++ b/core/cache_subsystem/miss_handler.sv @@ -299,11 +299,14 @@ module miss_handler // we've got a valid response from refill unit if (valid_miss_fsm) begin - addr_o = mshr_q.addr[DCACHE_INDEX_WIDTH-1:0]; - req_o = evict_way_q; - we_o = 1'b1; - be_o = '1; - be_o.vldrty = evict_way_q; + addr_o = mshr_q.addr[DCACHE_INDEX_WIDTH-1:0]; + req_o = evict_way_q; + we_o = 1'b1; + be_o.tag = '1; + be_o.data = '1; + for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin + if (evict_way_q[i]) be_o.vldrty[i] = '1; + end data_o.tag = mshr_q.addr[DCACHE_TAG_WIDTH+DCACHE_INDEX_WIDTH-1:DCACHE_INDEX_WIDTH]; data_o.data = data_miss_fsm; data_o.valid = 1'b1; @@ -350,9 +353,11 @@ module miss_handler we_o = 1'b1; data_o.valid = INVALIDATE_ON_FLUSH ? 1'b0 : 1'b1; // invalidate - be_o.vldrty = evict_way_q; + for (int unsigned i = 0; i < DCACHE_SET_ASSOC; i++) begin + if (evict_way_q[i]) be_o.vldrty[i] = '1; + end // go back to handling the miss or flushing, depending on where we came from - state_d = (state_q == WB_CACHELINE_MISS) ? MISS : FLUSH_REQ_STATUS; + state_d = (state_q == WB_CACHELINE_MISS) ? MISS : FLUSH_REQ_STATUS; end end diff --git a/core/cache_subsystem/std_nbdcache.sv b/core/cache_subsystem/std_nbdcache.sv index 0cc9647c3f3..5d904cf998b 100644 --- a/core/cache_subsystem/std_nbdcache.sv +++ b/core/cache_subsystem/std_nbdcache.sv @@ -222,30 +222,20 @@ module std_nbdcache // Valid/Dirty Regs // ---------------- - // align each valid/dirty bit pair to a byte boundary in order to leverage byte enable signals. - // note: if you have an SRAM that supports flat bit enables for your target technology, - // you can use it here to save the extra 17x overhead introduced by this workaround. - logic [(DCACHE_LINE_WIDTH+8)*DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata; + vldrty_t [DCACHE_SET_ASSOC-1:0] dirty_wdata, dirty_rdata; for (genvar i = 0; i < DCACHE_SET_ASSOC; i++) begin - for (genvar j = 0; j < DCACHE_LINE_WIDTH / 8; j++) begin - // dirty bits assignment - assign dirty_wdata[(DCACHE_LINE_WIDTH+8)*i+8*j] = wdata_ram.dirty[j]; - assign rdata_ram[i].dirty[j] = dirty_rdata[(DCACHE_LINE_WIDTH+8)*i+8*j]; - end - // valid bit assignment - assign dirty_wdata[DCACHE_LINE_WIDTH+(DCACHE_LINE_WIDTH+8)*i] = wdata_ram.valid; - assign rdata_ram[i].valid = dirty_rdata[DCACHE_LINE_WIDTH+(DCACHE_LINE_WIDTH+8)*i]; - end - - // be construction for valid_dirty_sram - for (genvar i = 0; i < DCACHE_SET_ASSOC; i++) begin - assign be_valid_dirty_ram[i*(DCACHE_LINE_WIDTH/8+1)+:(DCACHE_LINE_WIDTH/8+1)] = {be_ram.vldrty[i], be_ram.data} & {(DCACHE_LINE_WIDTH/8+1){be_ram.vldrty[i]}}; + assign dirty_wdata[i] = '{dirty: wdata_ram.dirty, valid: wdata_ram.valid}; + assign rdata_ram[i].dirty = dirty_rdata[i].dirty; + assign rdata_ram[i].valid = dirty_rdata[i].valid; + assign be_valid_dirty_ram[i].valid = be_ram.vldrty[i].valid; + assign be_valid_dirty_ram[i].dirty = be_ram.vldrty[i].dirty; end sram #( .USER_WIDTH(1), - .DATA_WIDTH((DCACHE_LINE_WIDTH + 8) * DCACHE_SET_ASSOC), + .DATA_WIDTH(DCACHE_SET_ASSOC * $bits(vldrty_t)), + .BYTE_WIDTH(1), .NUM_WORDS (DCACHE_NUM_WORDS) ) valid_dirty_sram ( .clk_i (clk_i), diff --git a/core/include/std_cache_pkg.sv b/core/include/std_cache_pkg.sv index 7513cc866bf..626a2b4d8b7 100644 --- a/core/include/std_cache_pkg.sv +++ b/core/include/std_cache_pkg.sv @@ -62,6 +62,11 @@ package std_cache_pkg; } bypass_rsp_t; typedef struct packed { + logic [ariane_pkg::DCACHE_LINE_WIDTH/8-1:0] dirty; + logic valid; + } vldrty_t; + + typedef struct packed { logic [ariane_pkg::DCACHE_TAG_WIDTH-1:0] tag; // tag array logic [ariane_pkg::DCACHE_LINE_WIDTH-1:0] data; // data array logic valid; // state array @@ -72,7 +77,7 @@ package std_cache_pkg; typedef struct packed { logic [(ariane_pkg::DCACHE_TAG_WIDTH+7)/8-1:0] tag; // byte enable into tag array logic [(ariane_pkg::DCACHE_LINE_WIDTH+7)/8-1:0] data; // byte enable into data array - logic [ariane_pkg::DCACHE_SET_ASSOC-1:0] vldrty; // bit enable into state array (valid for a pair of dirty/valid bits) + vldrty_t [ariane_pkg::DCACHE_SET_ASSOC-1:0] vldrty; // bit enable into state array } cl_be_t; // convert one hot to bin for -> needed for cache replacement diff --git a/corev_apu/tb/ariane_tb.cpp b/corev_apu/tb/ariane_tb.cpp index 0e4ff8fa775..5c03f1b7e2a 100644 --- a/corev_apu/tb/ariane_tb.cpp +++ b/corev_apu/tb/ariane_tb.cpp @@ -333,10 +333,10 @@ int main(int argc, char **argv) { // Preload memory. #if (VERILATOR_VERSION_INTEGER >= 5000000) // Verilator v5: Use rootp pointer and .data() accessor. -#define MEM top->rootp->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__i_tc_sram_wrapper__DOT__i_tc_sram__DOT__sram.m_storage +#define MEM top->rootp->ariane_testharness__DOT__i_sram__DOT__i_tc_sram__DOT__sram.m_storage #else // Verilator v4 -#define MEM top->ariane_testharness__DOT__i_sram__DOT__gen_cut__BRA__0__KET____DOT__i_tc_sram_wrapper__DOT__i_tc_sram__DOT__sram +#define MEM top->ariane_testharness__DOT__i_sram__DOT__i_tc_sram__DOT__sram #endif long long addr; long long len; diff --git a/corev_apu/tb/ariane_tb.sv b/corev_apu/tb/ariane_tb.sv index 711b08b0c1f..46421c6b470 100644 --- a/corev_apu/tb/ariane_tb.sv +++ b/corev_apu/tb/ariane_tb.sv @@ -19,7 +19,7 @@ import uvm_pkg::*; `include "uvm_macros.svh" -`define MAIN_MEM(P) dut.i_sram.gen_cut[0].i_tc_sram_wrapper.i_tc_sram.init_val[(``P``)] +`define MAIN_MEM(P) dut.i_sram.i_tc_sram.init_val[(``P``)] // `define USER_MEM(P) dut.i_sram.gen_cut[0].gen_mem.gen_mem_user.i_tc_sram_wrapper_user.i_tc_sram.init_val[(``P``)] import "DPI-C" function read_elf(input string filename);