Skip to content

Commit

Permalink
added warmup instruction support; updated all sample configuration fi…
Browse files Browse the repository at this point in the history
…les with 100 million warmup instructions; added 3200 data rate option for DDR4; fixed some command scheduling bugs that were leading to activate-precharge without read or write command; changed the default scheduler to FRFCFS_Cap;
  • Loading branch information
Hasan Hassan committed Dec 12, 2017
1 parent 7ce65d0 commit cd96ed6
Show file tree
Hide file tree
Showing 24 changed files with 228 additions and 53 deletions.
1 change: 1 addition & 0 deletions configs/ALDRAM-config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# early_exit = on, off (default value is on)
# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
expected_limit_insts = 200000000
warmup_insts = 100000000
cache = no
# cache = no, L1L2, L3, all (default value is no)
translation = None
Expand Down
1 change: 1 addition & 0 deletions configs/DDR3-config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# early_exit = on, off (default value is on)
# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
expected_limit_insts = 200000000
warmup_insts = 100000000
cache = no
# cache = no, L1L2, L3, all (default value is no)
translation = None
Expand Down
1 change: 1 addition & 0 deletions configs/DDR4-config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# early_exit = on, off (default value is on)
# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
expected_limit_insts = 200000000
warmup_insts = 100000000
cache = no
# cache = no, L1L2, L3, all (default value is no)
translation = None
Expand Down
1 change: 1 addition & 0 deletions configs/DSARP-config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
# early_exit = on, off (default value is on)
# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
expected_limit_insts = 200000000
warmup_insts = 100000000
cache = no
# cache = no, L1L2, L3, all (default value is no)
translation = None
Expand Down
1 change: 1 addition & 0 deletions configs/GDDR5-config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# early_exit = on, off (default value is on)
# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
expected_limit_insts = 200000000
warmup_insts = 100000000
cache = no
# cache = no, L1L2, L3, all (default value is no)
translation = None
Expand Down
1 change: 1 addition & 0 deletions configs/HBM-config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# early_exit = on, off (default value is on)
# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
expected_limit_insts = 200000000
warmup_insts = 100000000
cache = no
# cache = no, L1L2, L3, all (default value is no)
translation = None
Expand Down
1 change: 1 addition & 0 deletions configs/LPDDR3-config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# early_exit = on, off (default value is on)
# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
expected_limit_insts = 200000000
warmup_insts = 100000000
cache = no
# cache = no, L1L2, L3, all (default value is no)
translation = None
Expand Down
1 change: 1 addition & 0 deletions configs/LPDDR4-config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# early_exit = on, off (default value is on)
# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
expected_limit_insts = 200000000
warmup_insts = 100000000
cache = no
# cache = no, L1L2, L3, all (default value is no)
translation = None
Expand Down
1 change: 1 addition & 0 deletions configs/SALP-config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
# early_exit = on, off (default value is on)
# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
expected_limit_insts = 200000000
warmup_insts = 100000000
cache = no
# cache = no, L1L2, L3, all (default value is no)
translation = None
Expand Down
1 change: 1 addition & 0 deletions configs/TLDRAM-config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
# early_exit = on, off (default value is on)
# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
expected_limit_insts = 200000000
warmup_insts = 100000000
cache = no
# cache = no, L1L2, L3, all (default value is no)
translation = None
Expand Down
1 change: 1 addition & 0 deletions configs/WideIO-config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# early_exit = on, off (default value is on)
# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
expected_limit_insts = 200000000
warmup_insts = 100000000
cache = no
# cache = no, L1L2, L3, all (default value is no)
translation = None
Expand Down
1 change: 1 addition & 0 deletions configs/WideIO2-config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# early_exit = on, off (default value is on)
# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
expected_limit_insts = 200000000
warmup_insts = 100000000
cache = no
# cache = no, L1L2, L3, all (default value is no)
translation = None
Expand Down
2 changes: 2 additions & 0 deletions src/Config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ void Config::parse(const string& fname)
mem_tick = atoi(tokens[1].c_str());
} else if (tokens[0] == "expected_limit_insts") {
expected_limit_insts = atoi(tokens[1].c_str());
} else if (tokens[0] == "warmup_insts") {
warmup_insts = atoi(tokens[1].c_str());
}
}
file.close();
Expand Down
3 changes: 3 additions & 0 deletions src/Config.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class Config {
int mem_tick;
int core_num = 0;
long expected_limit_insts = 0;
long warmup_insts = 0;

public:
Config() {}
Expand Down Expand Up @@ -60,6 +61,8 @@ class Config {
int get_mem_tick() const {return mem_tick;}
int get_core_num() const {return core_num;}
long get_expected_limit_insts() const {return expected_limit_insts;}
long get_warmup_insts() const {return warmup_insts;}

bool has_l3_cache() const {
if (options.find("cache") != options.end()) {
const std::string& cache_option = (options.find("cache"))->second;
Expand Down
2 changes: 1 addition & 1 deletion src/Controller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ void Controller<TLDRAM>::tick(){
/*** 3. Should we schedule writes? ***/
if (!write_mode) {
// yes -- write queue is almost full or read queue is empty
if (writeq.size() >= int(0.8 * writeq.max) || readq.size() == 0)
if (writeq.size() >= int(0.8 * writeq.max) /*|| readq.size() == 0*/)
write_mode = true;
}
else {
Expand Down
51 changes: 46 additions & 5 deletions src/Controller.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ using namespace std;
namespace ramulator
{

extern bool warmup_complete;

template <typename T>
class Controller
{
Expand All @@ -42,6 +44,7 @@ class Controller
VectorStat write_row_hits;
VectorStat write_row_misses;
VectorStat write_row_conflicts;
ScalarStat useless_activates;

ScalarStat read_latency_avg;
ScalarStat read_latency_sum;
Expand Down Expand Up @@ -80,6 +83,11 @@ class Controller

Queue readq; // queue for read requests
Queue writeq; // queue for write requests
Queue actq; // read and write requests for which activate was issued are moved to
// actq, which has higher priority than readq and writeq.
// This is an optimization
// for avoiding useless activations (i.e., PRECHARGE
// after ACTIVATE w/o READ or WRITE command)
Queue otherq; // queue for all "other" requests (e.g., refresh)

deque<Request> pending; // read requests that are about to receive data from DRAM
Expand Down Expand Up @@ -170,6 +178,12 @@ class Controller
.precision(0)
;

useless_activates
.name("useless_activates_"+to_string(channel->id)+ "_core")
.desc("Number of useless activations. E.g, ACT -> PRE w/o RD or WR")
.precision(0)
;

read_transaction_bytes
.name("read_transaction_bytes_"+to_string(channel->id))
.desc("The total byte of read transaction per channel")
Expand Down Expand Up @@ -340,7 +354,10 @@ class Controller
/*** 3. Should we schedule writes? ***/
if (!write_mode) {
// yes -- write queue is almost full or read queue is empty
if (writeq.size() >= int(0.8 * writeq.max) || readq.size() == 0)

This comment has been minimized.

Copy link
@lucjaulmes

lucjaulmes Jul 2, 2018

This change (repeated in Controller.cpp) obviously causes the simulation to hang forever if a program is waiting on a few writes and no reads.

This comment has been minimized.

Copy link
@arthasSin

arthasSin Jul 2, 2018

Member

Thanks for the feedback. As a quick solution, you may implement a counter to track for how many cycles the readq is empty and drain the writes if the counter exceeds some threshold, even if there are not too many write requests in writeq.

if (writeq.size() >= int(0.8 * writeq.max)
/*|| readq.size() == 0*/) // Hasan: Switching to write mode when there are just a few
// write requests, even if the read queue is empty, incurs a lot of overhead.
// Commented out the read request queue empty condition
write_mode = true;
}
else {
Expand All @@ -350,11 +367,21 @@ class Controller
}

/*** 4. Find the best command to schedule, if any ***/
Queue* queue = !write_mode ? &readq : &writeq;
if (otherq.size())
queue = &otherq; // "other" requests are rare, so we give them precedence over reads/writes

// First check the actq (which has higher priority) to see if there
// are requests available to service in this cycle
Queue* queue = &actq;

auto req = scheduler->get_head(queue->q);
if (req == queue->q.end() || !is_ready(req)) {
queue = !write_mode ? &readq : &writeq;

if (otherq.size())
queue = &otherq; // "other" requests are rare, so we give them precedence over reads/writes

req = scheduler->get_head(queue->q);
}

if (req == queue->q.end() || !is_ready(req)) {
// we couldn't find a command to schedule -- let's try to be speculative
auto cmd = T::Command::PRE;
Expand Down Expand Up @@ -404,8 +431,15 @@ class Controller
issue_cmd(cmd, get_addr_vec(cmd, req));

// check whether this is the last command (which finishes the request)
if (cmd != channel->spec->translate[int(req->type)])
if (cmd != channel->spec->translate[int(req->type)]){
if(channel->spec->is_opening(cmd)) {
// promote the request that caused issuing activation to actq
actq.q.push_back(*req);
queue->q.erase(req);
}

return;
}

// set a future completion time for read requests
if (req->type == Request::Type::READ) {
Expand Down Expand Up @@ -492,6 +526,13 @@ class Controller
{
assert(is_ready(cmd, addr_vec));
channel->update(cmd, addr_vec.data(), clk);

if(cmd == T::Command::PRE){
if(rowtable->get_hits(addr_vec, true) == 0){
useless_activates++;
}
}

rowtable->update(cmd, addr_vec, clk);
if (record_cmd_trace){
// select rank
Expand Down
54 changes: 28 additions & 26 deletions src/DDR4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ map<string, enum DDR4::Speed> DDR4::speed_map = {
{"DDR4_1866M", DDR4::Speed::DDR4_1866M}, {"DDR4_1866N", DDR4::Speed::DDR4_1866N},
{"DDR4_2133P", DDR4::Speed::DDR4_2133P}, {"DDR4_2133R", DDR4::Speed::DDR4_2133R},
{"DDR4_2400R", DDR4::Speed::DDR4_2400R}, {"DDR4_2400U", DDR4::Speed::DDR4_2400U},
{"DDR4_3200", DDR4::Speed::DDR4_3200},
};


Expand Down Expand Up @@ -52,40 +53,40 @@ void DDR4::set_rank_number(int rank) {

void DDR4::init_speed()
{
const static int RRDS_TABLE[2][4] = {
{4, 4, 4, 4},
{5, 5, 6, 7}
const static int RRDS_TABLE[2][5] = {
{4, 4, 4, 4, 4},
{5, 5, 6, 7, 9}
};
const static int RRDL_TABLE[2][4] = {
{5, 5, 6, 6},
{6, 6, 7, 8}
const static int RRDL_TABLE[2][5] = {
{5, 5, 6, 6, 8},
{6, 6, 7, 8, 11}
};
const static int FAW_TABLE[3][4] = {
{16, 16, 16, 16},
{20, 22, 23, 26},
{28, 28, 32, 36}
const static int FAW_TABLE[3][5] = {
{16, 16, 16, 16, 16},
{20, 22, 23, 26, 34},
{28, 28, 32, 36, 48}
};
const static int RFC_TABLE[int(RefreshMode::MAX)][3][4] = {{
{128, 150, 171, 192},
{208, 243, 278, 312},
{280, 327, 374, 420}
const static int RFC_TABLE[int(RefreshMode::MAX)][3][5] = {{
{128, 150, 171, 192, 256},
{208, 243, 278, 312, 416},
{280, 327, 374, 420, 560}
},{
{88, 103, 118, 132},
{128, 150, 171, 192},
{208, 243, 278, 312}
{88, 103, 118, 132, 176},
{128, 150, 171, 192, 256},
{208, 243, 278, 312, 416}
},{
{72, 84, 96, 108},
{88, 103, 118, 132},
{128, 150, 171, 192}
{72, 84, 96, 108, 144},
{88, 103, 118, 132, 176},
{128, 150, 171, 192, 256}
}
};
const static int REFI_TABLE[4] = {
6240, 7280, 8320, 9360
const static int REFI_TABLE[5] = {
6240, 7280, 8320, 9360, 12480
};
const static int XS_TABLE[3][4] = {
{136, 159, 182, 204},
{216, 252, 288, 324},
{288, 336, 384, 432}
const static int XS_TABLE[3][5] = {
{136, 159, 182, 204, 272},
{216, 252, 288, 324, 532},

This comment has been minimized.

Copy link
@RSpliet

RSpliet Jul 11, 2018

Contributor

Should be
{216, 252, 288, 324, 432},

{288, 336, 384, 432, 576}
};

int speed = 0, density = 0;
Expand All @@ -94,6 +95,7 @@ void DDR4::init_speed()
case 1866: speed = 1; break;
case 2133: speed = 2; break;
case 2400: speed = 3; break;
case 3200: speed = 4; break;
default: assert(false);
};
switch (org_entry.size >> 10){
Expand Down
5 changes: 4 additions & 1 deletion src/DDR4.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ class DDR4
DDR4_1866M, DDR4_1866N,
DDR4_2133P, DDR4_2133R,
DDR4_2400R, DDR4_2400U,
DDR4_3200,
MAX
};

Expand Down Expand Up @@ -192,7 +193,9 @@ class DDR4
{2133, (400.0/3)*8, (3/0.4)/8, 4, 4, 6, 2, 15, 15, 15, 11, 36, 51, 8, 3, 8, 16, 0, 0, 0, 0, 0, 6, 7, 0, 7, 0, 0},
{2133, (400.0/3)*8, (3/0.4)/8, 4, 4, 6, 2, 16, 16, 16, 11, 36, 52, 8, 3, 8, 16, 0, 0, 0, 0, 0, 6, 7, 0, 7, 0, 0},
{2400, (400.0/3)*9, (3/0.4)/9, 4, 4, 6, 2, 16, 16, 16, 12, 39, 55, 9, 3, 9, 18, 0, 0, 0, 0, 0, 6, 8, 0, 7, 0, 0},
{2400, (400.0/3)*9, (3/0.4)/9, 4, 4, 6, 2, 18, 18, 18, 12, 39, 57, 9, 3, 9, 18, 0, 0, 0, 0, 0, 6, 8, 0, 7, 0, 0}
{2400, (400.0/3)*9, (3/0.4)/9, 4, 4, 6, 2, 18, 18, 18, 12, 39, 57, 9, 3, 9, 18, 0, 0, 0, 0, 0, 6, 8, 0, 7, 0, 0},
{3200, 1600, 0.625, prefetch_size/2/*DDR*/, 4, 10, 2, 22, 22, 22, 16, 56, 78, 12, 4, 12, 24, 8, 10, 40, 0, 0, 8, 10, 0, 8, 0, 0}
//rate, freq, tCK, nBL, nCCDS nCCDL nRTRS nCL nRCD nRP nCWL nRAS nRC nRTP nWTRS nWTRL nWR nRRDS nRRDL nFAW nRFC nREFI nPD nXP nXPDLL nCKESR nXS nXSDLL
}, speed_entry;

int read_latency;
Expand Down
Loading

0 comments on commit cd96ed6

Please sign in to comment.