Two dicts tester was added, Readme updated #31

Open
wants to merge 15 commits into base: master
5 changes: 0 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,15 +1,10 @@
<<<<<<< HEAD
*.html
*~
src/third_party/open-vcdiff/
src/gtest


=======
.idea/
.svn/
*~
.DS_Store
src/third_party/open-vcdiff/src/config.h
src/third_party/open-vcdiff/src/stamp-h1
>>>>>>> upstream/master
4 changes: 0 additions & 4 deletions CMakeLists.txt
@@ -2,11 +2,7 @@ cmake_minimum_required(VERSION 2.8.11)
project(SInGe)
#set(CMAKE_VERBOSE_MAKEFILE ON)

<<<<<<< HEAD
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra")
=======
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++1y -Wall -Wextra")
>>>>>>> upstream/master
set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)

8 changes: 0 additions & 8 deletions README.md
@@ -2,12 +2,4 @@

[![Build Status](https://travis-ci.org/cscenter/SInGe.svg?branch=master)](https://travis-ci.org/cscenter/SInGe)

## How to build first time:
1) go to src/third_party/open-vcdiff
2) ./autogen.sh
3) ./configure


After that use Cmake as usual

SDCH Dictionary Incremental Generator
7 changes: 2 additions & 5 deletions src/CMakeLists.txt
@@ -1,10 +1,7 @@
add_subdirectory(third_party/)
add_subdirectory(gtest)
add_subdirectory(dict_builder)
add_subdirectory(incremental_updater/)
<<<<<<< HEAD
add_subdirectory(incremental_tester/)
=======
>>>>>>> upstream/master

add_subdirectory(incremental_updater/)
add_subdirectory(two_dicts_tester/)

11 changes: 0 additions & 11 deletions src/dict_builder/CMakeLists.txt
@@ -1,6 +1,3 @@
<<<<<<< HEAD
add_library( dictgen
=======
find_package(Protobuf REQUIRED)

set (PROTO_SOURCES
@@ -10,15 +7,12 @@ set (PROTO_SOURCES
PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS ${PROTO_SOURCES})

add_library(dictgen
>>>>>>> upstream/master
dictionary.cpp
dictionary.hpp
node.cpp
node.hpp
suffix_automaton.cpp
suffix_automaton.hpp
<<<<<<< HEAD
=======
${PROTO_SRCS}
${PROTO_HDRS}
)
@@ -30,7 +24,6 @@ target_include_directories (dictgen PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}
${PROTOBUF_INCLUDE_DIRS}
${CMAKE_CURRENT_BINARY_DIR}
>>>>>>> upstream/master
)

add_executable(pzip
@@ -42,16 +35,12 @@ add_executable(dict_builder_tests
node_test.cpp
dictionary_test.cpp
suffix_automaton_test.cpp
<<<<<<< HEAD
=======
serialization_tests.cpp
>>>>>>> upstream/master
)

target_link_libraries(dict_builder_tests
gtest_main
dictgen
)
target_include_directories (dictgen PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

add_test(NAME dict_builder_tests COMMAND dict_builder_tests)
22 changes: 22 additions & 0 deletions src/dict_builder/README.md
@@ -22,3 +22,25 @@ So we can calculate `DocsOccursIn` for each node. We are going to solve the very
In the last part of the solution we simply sort all surviving substrings by their rating and pick them until we reach the limit on the dictionary size.

Time complexity is `O(sum_length_documents)` with a relatively small constant factor.
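The final selection step can be sketched as follows. This is a minimal self-contained sketch, not the project's actual code; the `Candidate` type and the `FillDictionary` name are illustrative:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Illustrative stand-in for a surviving substring: its rating and length.
struct Candidate {
  double score;
  std::size_t length;
};

// Greedily take the highest-rated substrings while the total length
// still fits into the dictionary size limit, as described above.
inline std::size_t FillDictionary(std::vector<Candidate> candidates,
                                  std::size_t max_dict_size,
                                  std::vector<Candidate>* picked) {
  std::sort(candidates.begin(), candidates.end(),
            [](const Candidate& a, const Candidate& b) {
              return a.score > b.score;  // best rating first
            });
  std::size_t used = 0;
  for (const Candidate& c : candidates) {
    if (used + c.length > max_dict_size) continue;  // too long, try a shorter one
    used += c.length;
    picked->push_back(c);
  }
  return used;
}
```

The real `Dictionary::BuildDict` proceeds the same way, but pulls the candidates and their scores out of the suffix automaton.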

**Further improvements**

We can make our algorithm online, that is, process documents one by one (or batch by batch) and obtain an updated dictionary after each operation. We should also keep in mind that the newest documents should have a bigger influence on the resulting dictionary than the old ones. This is done in the following way: after each update operation the score of every existing string is multiplied by some constant ![equation](http://www.sciweavers.org/upload/Tex2Img_1418126860/eqn.png). A smaller alpha corresponds to a higher sensitivity to new documents. One can see that almost the same effect can be achieved by multiplying the newest document's scores by ![equation](http://www.sciweavers.org/upload/Tex2Img_1418126773/render.png).

Another modification reduces the automaton's size when it becomes too big. In that case we repeatedly delete the node with no outgoing edges and the smallest score (a leaf of the automaton) until we reach the desired number of nodes.

The remaining part of our model is still the same.
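The decay scheme above can be sketched with a plain score map in place of the automaton. The `AddDocumentScores` helper and the map representation are assumptions for illustration; `alpha` plays the role of the constant from the formula:

```cpp
#include <map>
#include <string>

// After each update every existing string's score decays by alpha
// (0 < alpha < 1), so older documents gradually lose influence.
inline void AddDocumentScores(std::map<std::string, double>* scores,
                              const std::map<std::string, double>& doc_scores,
                              double alpha) {
  for (auto& entry : *scores) {
    entry.second *= alpha;  // decay everything seen so far
  }
  for (const auto& entry : doc_scores) {
    (*scores)[entry.first] += entry.second;  // add the newest document's weight
  }
}
```

Equivalently, one can leave old scores untouched and multiply the k-th document's scores by alpha to the power -k; the resulting ranking is the same, which is the equivalence mentioned above.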

**Usage**

We use this tool via objects of the class `Dictionary`. One can pass the following parameters to the constructor: the maximum size of the dictionary, the minimum length of a string in the dictionary, the stop symbol (say, `#`), the maximum size of the automaton, and the coefficient ![equation](http://www.sciweavers.org/upload/Tex2Img_1418126860/eqn.png).

There are also several useful methods:
`Dictionary::AddDocument` corresponds to the operation `whole_string += document`,
`Dictionary::AddDocumentViaStopSymbol` corresponds to `whole_string += stop_symbol + document`,
`Dictionary::BuildDict` builds the dictionary from the current whole string,
`Dictionary::GetDict` returns the dictionary obtained via the latest call of `BuildDict`.

Note that if one does not call `BuildDict` and just adds tons of documents, the dictionary returned by `GetDict` will be empty!

We suggest calling `BuildDict` only when one really needs the current dictionary.
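The intended call sequence (and the caveat above) can be illustrated with a toy stand-in. The method names mirror the real `Dictionary` class from this PR, but the class below is a simplified mock, not the actual implementation:

```cpp
#include <cstddef>
#include <string>

// Toy mock mirroring the Dictionary call pattern:
// AddDocumentViaStopSymbol -> BuildDict -> GetDict.
// The real class lives in src/dict_builder/dictionary.hpp.
class ToyDictionary {
 public:
  ToyDictionary(std::size_t max_dict, char stop_symbol)
      : max_dict_(max_dict), stop_symbol_(stop_symbol) {}

  // whole_string += stop_symbol + document
  void AddDocumentViaStopSymbol(const std::string& doc) {
    whole_string_ += stop_symbol_ + doc;
  }

  // Rebuild the dictionary from the current whole string. Here it is just
  // a truncated copy: this illustrates the call order only, not the
  // actual suffix-automaton algorithm.
  void BuildDict() { dict_ = whole_string_.substr(0, max_dict_); }

  // Returns whatever the latest BuildDict produced; empty before that.
  std::string GetDict() const { return dict_; }

 private:
  std::size_t max_dict_;
  char stop_symbol_;
  std::string whole_string_;
  std::string dict_;
};
```

The real constructor additionally takes the minimum string length, the maximum automaton size, and the coefficient, as in this PR's `dictionary_test.cpp`: `Dictionary dict(100, 3, '#', 1000, 1.0);`.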
108 changes: 33 additions & 75 deletions src/dict_builder/dictionary.cpp
@@ -1,36 +1,24 @@
<<<<<<< HEAD
=======
#include <map>
>>>>>>> upstream/master
#include <fstream>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <algorithm>
#include <fstream>
#include <iostream>
<<<<<<< HEAD
=======
#include <queue>
#include <cassert>
>>>>>>> upstream/master
#include <map>
#include <queue>

#include "dictionary.hpp"
#include "suffix_automaton.hpp"

using std::vector;
using std::string;
using std::pair;
using std::make_pair;
<<<<<<< HEAD
using std::endl;
using std::cout;
=======
using std::cerr;
using std::endl;
using std::cout;
using std::queue;
using std::endl;
using std::make_pair;
using std::map;
>>>>>>> upstream/master
using std::pair;
using std::queue;
using std::string;
using std::vector;

namespace {
const double kEps = 1e-10;
@@ -44,18 +32,28 @@ namespace {
}
};

<<<<<<< HEAD
Dictionary::Dictionary() : kMaxDict(1 << 20), kMinLen(20), kMinDocsOccursIn(2) {}

Dictionary::Dictionary(size_t kMaxDict, size_t kMinLen, char kStopSymbol, size_t kMaxAutomatonSize, double kAutomatonCoef) : kMaxDict(kMaxDict), kMinLen(kMinLen), kMinDocsOccursIn(2), automaton_all_(SuffixAutomaton(kStopSymbol, kMaxAutomatonSize, kAutomatonCoef)) {
Dictionary::Dictionary(size_t kMaxDict
, size_t kMinLen
, char kStopSymbol
, size_t kMaxAutomatonSize
, double kAutomatonCoef)
: kMaxDict(kMaxDict)
, kMinLen(kMinLen)
, kMinDocsOccursIn(2)
, automaton_all_(
SuffixAutomaton(kStopSymbol, kMaxAutomatonSize, kAutomatonCoef)) {
}
=======
const size_t Dictionary::kMaxDict = 1 << 16;
const size_t Dictionary::kMinLen = 3;
const size_t Dictionary::kMinDocsOccursIn = 2;

Dictionary::Dictionary() {}
>>>>>>> upstream/master
Dictionary::Dictionary(size_t kMaxDict
, size_t kMinLen
, SuffixAutomaton& automaton)
: kMaxDict(kMaxDict)
, kMinLen(kMinLen)
, kMinDocsOccursIn(2)
, automaton_all_(automaton) {
}

Dictionary::~Dictionary() {}

@@ -101,45 +99,20 @@ void Dictionary::BuildDict() {
ResetLastDocument();
dict_.clear();

<<<<<<< HEAD
=======
cout << "automaton size = " << automaton_all_.AmountAliveNodes() << endl;
/*
for (size_t id : automaton_all_) {
cout << "occurs " << GetNode(id)->docs_occurs_in << " " << GetNode(id)->len_within_document << endl;
}
*/
cout << "building dictionary..." << endl;

>>>>>>> upstream/master
vector<size_t> substrings;
CollectGoodSubstrings(&substrings);

sort(substrings.begin(), substrings.end(), [&] (int id1, int id2) { return DoubleLess(automaton_all_.GetScore(id2), automaton_all_.GetScore(id1)); });

<<<<<<< HEAD
=======
cout << "good substrings have been collected and sorted" << endl;

>>>>>>> upstream/master
size_t length_dict = 0;
for (size_t i = 0; i < substrings.size() && length_dict + kMinLen <= kMaxDict; ++i) {
auto* node = GetNode(substrings[i]);
if (length_dict + node->len_within_document > kMaxDict) {
continue;
}
<<<<<<< HEAD
length_dict += node->len_within_document;
dict_.push_back(substrings[i]);
}
=======
// printf("occurs = %d, len = %d\n", node->docs_occurs_in, node->len_within_document);
length_dict += node->len_within_document;
dict_.push_back(substrings[i]);
}

cout << "dict's length = " << length_dict << endl;
>>>>>>> upstream/master
}

vector<pair<string, size_t> > Dictionary::GetDictSubstringsList() {
@@ -161,6 +134,10 @@ string Dictionary::GetDict() {
return dict_str;
}

SuffixAutomaton& Dictionary::GetAutomaton() {
return automaton_all_;
}

void Dictionary::OutputDictTo(string path) {
std::ofstream file(path);
file << GetDict();
@@ -171,11 +148,8 @@ void Dictionary::ResetLastDocument() {
return;
}

<<<<<<< HEAD
// cout << "calculate occurences for document with length " << last_document_.size() << endl;
=======
cout << "calculate occurences for document with length " << last_document_.size() << endl;
>>>>>>> upstream/master

size_t cur_hash = (rand() << 16) ^ rand();
size_t id = automaton_all_.root();
size_t pos = 0;
@@ -200,20 +174,8 @@ void Dictionary::CollectGoodSubstrings(vector <size_t>* substrings) {
vector<double> max_score_substring(nodes, -1e20);
vector<double> max_score_upstring(nodes, -1e20);
vector<char> can_to_dict(nodes, true);
<<<<<<< HEAD
vector<size_t> order = automaton_all_.GetNodesInOrder();

=======
vector<size_t> order;
order.reserve(nodes - 1);

for (size_t id : automaton_all_) {
order.push_back(id);
}

sort(order.begin(), order.end(), [&] (size_t id1, size_t id2) { return GetNode(id1)->len_actual < GetNode(id2)->len_actual; } );

>>>>>>> upstream/master
// calc max_score_substring
for (size_t id : order) {
double max_score = -1e20;
@@ -291,7 +253,3 @@ void Dictionary::CollectGoodSubstrings(vector <size_t>* substrings) {
bool Dictionary::CanAffordSubstringFrom(Node* node) const {
return node->len_within_document >= kMinLen && node->docs_occurs_in >= kMinDocsOccursIn;
}
<<<<<<< HEAD

=======
>>>>>>> upstream/master
4 changes: 4 additions & 0 deletions src/dict_builder/dictionary.hpp
@@ -20,6 +20,8 @@ class Dictionary {

Dictionary(size_t kMaxDict, size_t kMinLen, char kStopSymbol, size_t kMaxAutomatonSize, double kAutomatonCoef);

Dictionary(size_t kMaxDict, size_t kMinLen, SuffixAutomaton& automaton);

~Dictionary();

void AddDocument(std::string& doc);
@@ -36,6 +38,8 @@ class Dictionary {

std::string GetDict();

SuffixAutomaton& GetAutomaton();

void OutputDictTo(std::string path);

void ResetLastDocument();
4 changes: 0 additions & 4 deletions src/dict_builder/dictionary_test.cpp
@@ -25,11 +25,7 @@ TEST(DictionaryTest, MainDictionaryTest) {
std::string s2 = "qwecabarty";
std::string s3 = "caba_cabaqwe";

<<<<<<< HEAD
Dictionary dict(100, 3, '#', 1000, 1.0);
=======
Dictionary dict;
>>>>>>> upstream/master
dict.AddDocumentViaStopSymbol(s1);
dict.AddDocumentViaStopSymbol(s2);
dict.AddDocumentViaStopSymbol(s3);
18 changes: 3 additions & 15 deletions src/dict_builder/node.cpp
@@ -74,14 +74,9 @@ bool Node::AddEdge(char ch, size_t to) {
}

bool Node::AddRevEdge(char ch, size_t from) {
<<<<<<< HEAD
for (auto& it : rev_edges_) {
if (it.second == from) {
it.first = ch;
=======
for (auto it : rev_edges_) {
if (it == make_pair(ch, from)) {
>>>>>>> upstream/master
return false;
}
}
@@ -103,11 +98,11 @@ void Node::SortEdges() {
std::sort(edges_.begin(), edges_.end());
}

size_t Node::InDegree() {
size_t Node::InDegree() const {
return rev_edges_.size();
}

size_t Node::OutDegree() {
size_t Node::OutDegree() const {
return edges_.size();
}

@@ -132,11 +127,7 @@ bool Node::DeleteRevEdge(size_t from) {
if (rev_edges_[i].second == from) {
pos = i;
break;
<<<<<<< HEAD
}
=======
}
>>>>>>> upstream/master
}
if (pos < rev_edges_.size()) {
rev_edges_.erase(rev_edges_.begin() + pos);
@@ -159,11 +150,9 @@ bool Node::DeleteRevLink(size_t from) {
}
return false;
}
<<<<<<< HEAD
=======

std::unique_ptr<ProtoNode> Node::GetProtoNode() const {
auto proto_node = std::make_unique<ProtoNode>();
auto proto_node = std::unique_ptr<ProtoNode>(new ProtoNode());
auto *proto_repeated_ptrs_edges = proto_node->mutable_edges();
proto_repeated_ptrs_edges->Reserve(edges_.size());
for (const auto &edge : edges_) {
@@ -224,4 +213,3 @@ Node::Node(const ProtoNode& proto_node) : Node() {
rev_links_.emplace_back(rev_link);
}
}
>>>>>>> upstream/master