diff --git a/.github/workflows/timbl.yml b/.github/workflows/timbl.yml index dd86342..97fc632 100644 --- a/.github/workflows/timbl.yml +++ b/.github/workflows/timbl.yml @@ -2,7 +2,7 @@ name: C/C++ CI on: push: - branches: [master] + branches: [master, valueclass] paths: - 'src/**' - 'include/**' @@ -32,6 +32,7 @@ jobs: ${{ github.actor }} started a build of ${{ github.event.repository.name }} [${{ steps.extract_branch.outputs.branch }}] + build: runs-on: ${{ matrix.os }} needs: notification diff --git a/configure.ac b/configure.ac index 63a6a8b..ecf91bc 100644 --- a/configure.ac +++ b/configure.ac @@ -1,12 +1,12 @@ # -*- Autoconf -*- # Process this file with autoconf to produce a configure script. -AC_PREREQ(2.61) +AC_PREREQ([2.69]) AC_INIT([timbl],[6.8],[lamasoftware@science.ru.nl]) #also adapt in codemeta.json! AM_INIT_AUTOMAKE AC_CONFIG_SRCDIR([.]) AC_CONFIG_MACRO_DIR([m4]) -AC_CONFIG_HEADER([config.h]) +AC_CONFIG_HEADERS([config.h]) if test x"${CXXFLAGS+set}" = xset; then # the user set CXXFLAGS; don't override it. diff --git a/demos/api_test6.cxx b/demos/api_test6.cxx index 02c3478..763bae7 100755 --- a/demos/api_test6.cxx +++ b/demos/api_test6.cxx @@ -34,12 +34,12 @@ using namespace Timbl; int main(){ TimblAPI My_Experiment( "-a IB1 +vDI+DB -G 0 -k3", "test6" ); My_Experiment.Learn( "dimin.train" ); - const ValueDistribution *vd; + const ClassDistribution *vd; const TargetValue *tv = My_Experiment.Classify( std::string("-,=,O,m,+,h,K,=,-,n,I,N,K"), vd ); cout << "resulting target: " << tv << endl; cout << "resulting Distribution: " << vd << endl; - ValueDistribution::dist_iterator it=vd->begin(); + ClassDistribution::dist_iterator it=vd->begin(); while ( it != vd->end() ){ cout << it->second << " OR "; cout << it->second->Value() << " " << it->second->Weight() << endl; @@ -48,7 +48,7 @@ int main(){ cout << "the same with neighborSets" << endl; const neighborSet *nb = My_Experiment.classifyNS( "-,=,O,m,+,h,K,=,-,n,I,N,K" ); - WValueDistribution *vd2 = nb->bestDistribution(); + WClassDistribution *vd2 = nb->bestDistribution(); vd2->Normalize(); cout << "default answer " << vd2 << endl; decayStruct *dc = new expDecay(0.3); diff --git a/include/timbl/BestArray.h b/include/timbl/BestArray.h index c385714..fa22787 100644 --- a/include/timbl/BestArray.h +++ b/include/timbl/BestArray.h @@ -38,15 +38,15 @@ namespace Timbl { friend std::ostream& operator<< ( std::ostream&, const BestRec * ); public: BestRec(); + BestRec( const BestRec& ) = delete; // forbid copies + BestRec& operator=( const BestRec& ) = delete; // forbid copies ~BestRec(); size_t totalBests() const { return aggregateDist.totalSize(); }; double bestDistance; - ValueDistribution aggregateDist; - std::vector<ValueDistribution *> bestDistributions; + ClassDistribution aggregateDist; + std::vector<ClassDistribution *> bestDistributions; std::vector bestInstances; private: - BestRec( const BestRec& ); - BestRec& operator=( const BestRec& ); }; class BestArray { @@ -61,7 +61,7 @@ namespace Timbl { ~BestArray(); void init( unsigned int, unsigned int, bool, bool, bool ); double addResult( double, - const ValueDistribution *, + const ClassDistribution *, const icu::UnicodeString& ); void initNeighborSet( neighborSet& ) const; void addToNeighborSet( neighborSet& , size_t ) const; diff --git a/include/timbl/Choppers.h b/include/timbl/Choppers.h index f41b284..b05c766 100644 --- a/include/timbl/Choppers.h +++ b/include/timbl/Choppers.h @@ -34,8 +34,13 @@ namespace Timbl{ class Chopper { public: + Chopper(): + vSize(0) + {}; virtual ~Chopper() {}; virtual bool chop( const
icu::UnicodeString&, size_t ) = 0; + const icu::UnicodeString& operator[]( int i ) const { + return choppedInput[i]; } const icu::UnicodeString& getField( size_t i ) const { return choppedInput[i]; }; @@ -68,24 +73,32 @@ namespace Timbl{ class ExChopper: public virtual Chopper { public: - double getExW() const { return exW; }; + ExChopper(): + Chopper(), + exW(-1.0) + {}; + double getExW() const override { return exW; }; protected: - void init( const icu::UnicodeString&, size_t, bool ); + void init( const icu::UnicodeString&, size_t, bool ) override; double exW; }; class OccChopper: public virtual Chopper { public: - int getOcc() const { return occ; }; + OccChopper(): + Chopper(), + occ(-1) + {}; + int getOcc() const override { return occ; }; protected: - void init( const icu::UnicodeString&, size_t, bool ); + void init( const icu::UnicodeString&, size_t, bool ) override; int occ; }; class C45_Chopper : public virtual Chopper { public: - bool chop( const icu::UnicodeString&, size_t ); - icu::UnicodeString getString() const; + bool chop( const icu::UnicodeString&, size_t ) override; + icu::UnicodeString getString() const override; }; class C45_ExChopper : public C45_Chopper, public ExChopper { @@ -96,7 +109,7 @@ namespace Timbl{ class ARFF_Chopper : public C45_Chopper { public: - bool chop( const icu::UnicodeString&, size_t ); + bool chop( const icu::UnicodeString&, size_t ) override; }; class ARFF_ExChopper : public C45_ExChopper { @@ -107,8 +120,8 @@ namespace Timbl{ class Bin_Chopper : public virtual Chopper { public: - bool chop( const icu::UnicodeString&, size_t ); - icu::UnicodeString getString() const; + bool chop( const icu::UnicodeString&, size_t ) override; + icu::UnicodeString getString() const override; }; class Bin_ExChopper : public Bin_Chopper, public ExChopper { @@ -120,8 +133,8 @@ namespace Timbl{ class Compact_Chopper : public virtual Chopper { public: explicit Compact_Chopper( int L ): fLen(L){}; - bool chop( const icu::UnicodeString&, size_t ); - icu::UnicodeString getString() const; + bool chop( const icu::UnicodeString&, size_t ) override; + icu::UnicodeString getString() const override; private: int fLen; Compact_Chopper(); @@ -143,8 +156,8 @@ namespace Timbl{ class Columns_Chopper : public virtual Chopper { public: - bool chop( const icu::UnicodeString&, size_t ); - icu::UnicodeString getString() const; + bool chop( const icu::UnicodeString&, size_t ) override; + icu::UnicodeString getString() const override; }; class Columns_ExChopper : public Columns_Chopper, public ExChopper { @@ -155,8 +168,8 @@ namespace Timbl{ class Tabbed_Chopper : public virtual Chopper { public: - bool chop( const icu::UnicodeString&, size_t ); - icu::UnicodeString getString() const; + bool chop( const icu::UnicodeString&, size_t ) override; + icu::UnicodeString getString() const override; }; class Tabbed_ExChopper : public Tabbed_Chopper, public ExChopper { @@ -168,8 +181,8 @@ namespace Timbl{ class Sparse_Chopper : public virtual Chopper { public: - bool chop( const icu::UnicodeString&, size_t ); - icu::UnicodeString getString() const; + bool chop( const icu::UnicodeString&, size_t ) override; + icu::UnicodeString getString() const override; }; class Sparse_ExChopper : public Sparse_Chopper, public ExChopper { diff --git a/include/timbl/Common.h b/include/timbl/Common.h index 7b00c85..4f2a5a6 100644 --- a/include/timbl/Common.h +++ b/include/timbl/Common.h @@ -29,12 +29,13 @@ #define TIMBL_COMMON_H #include +#include #include -#include #include namespace Common { - const double Epsilon = 
DBL_EPSILON; // smallest x so that 1+x != 1 + const double Epsilon = std::numeric_limits<double>::epsilon(); + // smallest x so that 1+x != 1 const int DEFAULT_MAX_FEATS = 2500; // default maximun number of Features std::string Version(); diff --git a/include/timbl/Features.h b/include/timbl/Features.h new file mode 100644 index 0000000..4febb8c --- /dev/null +++ b/include/timbl/Features.h @@ -0,0 +1,222 @@ +/* + Copyright (c) 1998 - 2023 + ILK - Tilburg University + CLST - Radboud University + CLiPS - University of Antwerp + + This file is part of timbl + + timbl is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + timbl is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. + + For questions and suggestions, see: + https://github.com/LanguageMachines/timbl/issues + or send mail to: + lamasoftware (at ) science.ru.nl + +*/ +#ifndef TIMBL_FEATURES_H +#define TIMBL_FEATURES_H +
+#include +#include +#include +#include "timbl/MsgClass.h" +#include "timbl/Matrices.h" +#include "ticcutils/Unicode.h" +
+namespace Hash { + class UnicodeHash; +} +
+namespace Timbl { +
+ class ValueClass; + class TargetValue; + class Targets; + class metricClass; +
+ class SparseValueProbClass { + friend std::ostream& operator<< ( std::ostream&, SparseValueProbClass * ); + public: + typedef std::map< size_t, double > IDmaptype; + typedef IDmaptype::const_iterator IDiterator; + explicit SparseValueProbClass( size_t d ): dimension(d) {}; + void Assign( const size_t i, const double d ) { vc_map[i] = d; }; + void Clear() { vc_map.clear(); }; + IDiterator begin() const { return vc_map.begin(); }; + IDiterator end() const { return vc_map.end(); }; + private: + IDmaptype vc_map; + size_t dimension; + }; +
+ enum FeatVal_Stat { + Unknown, + Singleton, + SingletonNumeric, + NumericValue, + NotNumeric + }; +
+ class FeatureValue: public ValueClass { + friend class Feature; + friend class Feature_List; + friend struct D_D; + public: + explicit FeatureValue( const icu::UnicodeString& ); + FeatureValue( const icu::UnicodeString&, size_t ); + ~FeatureValue(); + void ReconstructDistribution( const ClassDistribution& vd ) { + TargetDist.Merge( vd ); + _frequency = TargetDist.totalSize(); + }; + bool isUnknown() const { return _index == 0; }; + SparseValueProbClass *valueClassProb() const { return ValueClassProb; }; + private: + SparseValueProbClass *ValueClassProb; + ClassDistribution TargetDist; + }; +
+
+ class Feature: public MsgClass { + friend class MBLClass; + friend class Feature_List; + public: + explicit Feature( Hash::UnicodeHash *T ); + ~Feature(); + bool Ignore() const { return ignore; }; + void Ignore( const bool val ){ ignore = val; }; + bool setMetricType( const MetricType ); + MetricType getMetricType() const; + double Weight() const { return weight; }; + void SetWeight( const double w ) { weight = w; }; + double InfoGain() const { return info_gain; }; + void InfoGain( const double w ){ info_gain = w; }; + double SplitInfo() const { return split_info; }; + void SplitInfo( const double w ){ split_info = w; }; + double GainRatio() const { return gain_ratio; }; + void
GainRatio( const double w ){ gain_ratio = w; }; + double ChiSquare() const { return chi_square; }; + void ChiSquare( const double w ){ chi_square = w; }; + double SharedVariance() const { return shared_variance; }; + void SharedVariance( const double w ){ shared_variance = w; }; + double StandardDeviation() const { return standard_deviation; }; + void StandardDeviation( const double w ){ standard_deviation = w; }; + double Min() const { return n_min; }; + void Min( const double val ){ n_min = val; }; + double Max() const { return n_max; }; + void Max( const double val ){ n_max = val; }; + double fvDistance( FeatureValue *, FeatureValue *, size_t=1 ) const; + FeatureValue *add_value( const icu::UnicodeString&, TargetValue *, int=1 ); + FeatureValue *add_value( size_t, TargetValue *, int=1 ); + FeatureValue *Lookup( const icu::UnicodeString& ) const; + bool decrement_value( FeatureValue *, TargetValue * ); + bool increment_value( FeatureValue *, TargetValue * ); + size_t EffectiveValues() const; + size_t TotalValues() const; + bool isNumerical() const; + bool isStorableMetric() const; + bool AllocSparseArrays( size_t ); + void InitSparseArrays(); + bool ArrayRead(){ return vcpb_read; }; + bool matrixPresent( bool& ) const; + size_t matrix_byte_size() const; + bool store_matrix( int = 1 ); + void clear_matrix(); + bool fill_matrix( std::istream& ); + void print_matrix( std::ostream&, bool = false ) const; + void print_vc_pb_array( std::ostream& ) const; + bool read_vc_pb_array( std::istream & ); + FeatVal_Stat prepare_numeric_stats(); + void Statistics( double, const Targets&, bool ); + void NumStatistics( double, const Targets&, int, bool ); + void ClipFreq( size_t f ){ matrix_clip_freq = f; }; + size_t ClipFreq() const { return matrix_clip_freq; }; + SparseSymetricMatrix<double> *metric_matrix; + private: + Feature( const Feature& ); + Feature& operator=( const Feature& ); + Hash::UnicodeHash *TokenTree; + metricClass *metric; + bool ignore; + bool numeric; + bool vcpb_read; + enum ps_stat{ ps_undef, ps_failed, ps_ok, ps_read }; + enum ps_stat PrestoreStatus; + MetricType Prestored_metric; + void delete_matrix(); + double entropy; + double info_gain; + double split_info; + double gain_ratio; + double chi_square; + double shared_variance; + double standard_deviation; + size_t matrix_clip_freq; + std::vector<long int> n_dot_j; + std::vector<long int> n_i_dot; + double n_min; + double n_max; + size_t SaveSize; + size_t SaveNum; + double weight; + void Statistics( double ); + void NumStatistics( std::vector&, double ); + void ChiSquareStatistics( std::vector&, const Targets& ); + void ChiSquareStatistics( const Targets& ); + void SharedVarianceStatistics( const Targets&, int ); + void StandardDeviationStatistics(); + std::vector<FeatureValue *> values_array; + std::unordered_map< size_t, FeatureValue *> reverse_values; + bool is_reference; + }; + + class Feature_List: public MsgClass { + friend class MBLClass; + public: + Feature_List(): + _eff_feats(0), + _num_of_feats(0), + _num_of_num_feats(0), + _feature_hash(0), + _is_reference(false) + { + } + explicit Feature_List( Hash::UnicodeHash *hash ): + Feature_List() + { + _feature_hash = hash; + } + Feature_List &operator=( const Feature_List& ); + ~Feature_List(); + void init( size_t, const std::vector& ); + Hash::UnicodeHash *hash() const { return _feature_hash; }; + size_t effective_feats(){ return _eff_feats; }; + Feature *operator[]( size_t i ) const { return feats[i]; }; + void write_permutation( std::ostream & ) const; + void calculate_permutation( const std::vector& ); + size_t
_eff_feats; + size_t _num_of_feats; + size_t _num_of_num_feats; + std::vector feats; + std::vector perm_feats; + std::vector permutation; + private: + Hash::UnicodeHash *_feature_hash; + bool _is_reference; + }; + +} // namespace Timbl + +#endif // TIMBL_FEATURES_H diff --git a/include/timbl/GetOptClass.h b/include/timbl/GetOptClass.h index e8ebcf8..e9bad8b 100644 --- a/include/timbl/GetOptClass.h +++ b/include/timbl/GetOptClass.h @@ -31,12 +31,15 @@ #include #include +namespace TiCC { + class CL_Options; +} namespace Timbl { class TimblExperiment; - class GetOptClass: public MsgClass { public: explicit GetOptClass( const TiCC::CL_Options& ); + GetOptClass& operator=( const GetOptClass& ) = delete; // forbid copies virtual ~GetOptClass(); GetOptClass *Clone( std::ostream * = 0 ) const; bool parse_options( const TiCC::CL_Options&, const int=0 ); @@ -46,6 +49,7 @@ namespace Timbl { int MaxFeatures() const { return MaxFeats; }; VerbosityFlags getVerbosity() { return myVerbosity; }; private: + GetOptClass( const GetOptClass& ); AlgorithmType local_algo; MetricType local_metric; OrdeningType local_order; @@ -93,14 +97,12 @@ namespace Timbl { std::string inPath; std::string outPath; int occIn; - void Error( const std::string& ) const; + void Error( const std::string& ) const override; inline bool parse_range( std::string&, std::string::iterator&, MetricType ); inline bool parse_metrics( const std::string&, MetricType& ); - GetOptClass( const GetOptClass& ); - GetOptClass& operator=( const GetOptClass& ); }; } diff --git a/include/timbl/IBtree.h b/include/timbl/IBtree.h index 24a5e2d..3438dd1 100644 --- a/include/timbl/IBtree.h +++ b/include/timbl/IBtree.h @@ -48,10 +48,11 @@ namespace Timbl { class Feature; class FeatureValue; class Instance; - class Target; + class Feature_List; + class Targets; class TargetValue; - class ValueDistribution; - class WValueDistribution; + class ClassDistribution; + class WClassDistribution; class IBtree { friend class InstanceBase_base; @@ -68,12 +69,14 @@ namespace Timbl { private: FeatureValue *FValue; const TargetValue *TValue; - ValueDistribution *TDistribution; + ClassDistribution *TDistribution; IBtree *link; IBtree *next; IBtree(); explicit IBtree( FeatureValue * ); + IBtree( const IBtree& ) = delete; // forbid copies + IBtree& operator=( const IBtree& ) = delete; // forbid copies ~IBtree(); IBtree *Reduce( const TargetValue *, unsigned long&, long ); #ifdef IBSTATS @@ -81,7 +84,7 @@ namespace Timbl { #else static inline IBtree *add_feat_val( FeatureValue *, IBtree **, unsigned long& ); #endif - inline ValueDistribution *sum_distributions( bool ); + inline ClassDistribution *sum_distributions( bool ); inline IBtree *make_unique( const TargetValue *, unsigned long& ); void cleanDistributions(); void re_assign_defaults( bool, bool ); @@ -90,11 +93,9 @@ namespace Timbl { void countBranches( unsigned int, std::vector&, std::vector& ); - const ValueDistribution *exact_match( const Instance& ) const; + const ClassDistribution *exact_match( const Instance& ) const; protected: const IBtree *search_node( FeatureValue * ) const; - IBtree( const IBtree& ); - IBtree& operator=( const IBtree& ); }; typedef std::unordered_map FI_map; @@ -103,8 +104,8 @@ namespace Timbl { friend class IG_InstanceBase; friend class TRIBL_InstanceBase; friend class TRIBL2_InstanceBase; - InstanceBase_base( const InstanceBase_base& ); - InstanceBase_base& operator=( const InstanceBase_base& ); + InstanceBase_base( const InstanceBase_base& ) = delete; // forbid copies + InstanceBase_base& 
operator=( const InstanceBase_base& ) = delete; // forbid copies friend std::ostream& operator<<( std::ostream &os, const InstanceBase_base& ); friend std::ostream& operator<<( std::ostream &os, @@ -119,23 +120,23 @@ namespace Timbl { void summarizeNodes( std::vector&, std::vector& ); virtual bool MergeSub( InstanceBase_base * ); - const ValueDistribution *ExactMatch( const Instance& I ) const { + const ClassDistribution *ExactMatch( const Instance& I ) const { return InstBase->exact_match( I ); }; - virtual const ValueDistribution *InitGraphTest( std::vector&, + virtual const ClassDistribution *InitGraphTest( std::vector&, const std::vector *, - size_t, - size_t ); - virtual const ValueDistribution *NextGraphTest( std::vector&, + const size_t, + const size_t ); + virtual const ClassDistribution *NextGraphTest( std::vector&, size_t& ); unsigned long int GetDistSize( ) const { return NumOfTails; }; - virtual const ValueDistribution *IG_test( const Instance& , size_t&, bool&, + virtual const ClassDistribution *IG_test( const Instance& , size_t&, bool&, const TargetValue *& ); virtual IB_InstanceBase *TRIBL_test( const Instance& , size_t, const TargetValue *&, - const ValueDistribution *&, + const ClassDistribution *&, size_t& ); virtual IB_InstanceBase *TRIBL2_test( const Instance& , - const ValueDistribution *&, + const ClassDistribution *&, size_t& ); bool read_hash( std::istream&, Hash::UnicodeHash&, @@ -151,20 +152,18 @@ namespace Timbl { void toXML( std::ostream& ); void printStatsTree( std::ostream&, unsigned int startLevel ); virtual bool ReadIB( std::istream&, - std::vector&, - Target&, - int ); - virtual bool ReadIB( std::istream&, - std::vector&, - Target&, - Hash::UnicodeHash&, - Hash::UnicodeHash&, + Feature_List&, + Targets&, int ); + virtual bool ReadIB_hashed( std::istream&, + Feature_List&, + Targets&, + int ); virtual void Prune( const TargetValue *, long = 0 ); virtual bool IsPruned() const { return false; }; void CleanPartition( bool ); unsigned long int GetSizeInfo( unsigned long int&, double & ) const; - const ValueDistribution *TopDist() const { return TopDistribution; }; + const ClassDistribution *TopDist() const { return TopDistribution; }; bool HasDistributions() const; const TargetValue *TopTarget( bool & ); bool PersistentD() const { return PersistentDistributions; }; @@ -181,47 +180,46 @@ namespace Timbl { bool Random; bool PersistentDistributions; int Version; - ValueDistribution *TopDistribution; - WValueDistribution *WTop; + ClassDistribution *TopDistribution; + WClassDistribution *WTop; const TargetValue *TopT; FI_map fast_index; bool tiedTop; IBtree *InstBase; IBtree *LastInstBasePos; - const IBtree **RestartSearch; - const IBtree **SkipSearch; - const IBtree **InstPath; + std::vector RestartSearch; + std::vector SkipSearch; + std::vector InstPath; unsigned long int& ibCount; size_t Depth; unsigned long int NumOfTails; - IBtree *read_list( std::istream &, - std::vector&, - Target&, + IBtree *read_list( std::istream&, + Feature_List&, + Targets&, int ); - IBtree *read_local( std::istream &, - std::vector&, - Target&, + IBtree *read_local( std::istream&, + Feature_List&, + Targets&, int ); - IBtree *read_list_hashed( std::istream &, - std::vector&, - Target&, + IBtree *read_list_hashed( std::istream&, + Feature_List&, + Targets&, int ); - IBtree *read_local_hashed( std::istream &, - std::vector&, - Target&, + IBtree *read_local_hashed( std::istream&, + Feature_List&, + Targets&, int ); void write_tree( std::ostream &os, const IBtree * ) const; void 
write_tree_hashed( std::ostream &os, const IBtree * ) const; bool read_IB( std::istream&, - std::vector&, - Target&, + Feature_List& , + Targets&, int ); - bool read_IB( std::istream&, - std::vector&, - Target&, - Hash::UnicodeHash&, - Hash::UnicodeHash&, int ); + bool read_IB_hashed( std::istream&, + Feature_List& , + Targets&, + int ); void fill_index(); const IBtree *fast_search_node( FeatureValue * ); }; @@ -234,17 +232,15 @@ namespace Timbl { effFeat(0), testInst(0) {}; - IB_InstanceBase *Copy() const; - IB_InstanceBase *clone() const; - const ValueDistribution *InitGraphTest( std::vector&, + IB_InstanceBase *Copy() const override; + IB_InstanceBase *clone() const override; + const ClassDistribution *InitGraphTest( std::vector&, const std::vector *, - size_t, - size_t ); - const ValueDistribution *NextGraphTest( std::vector&, - size_t& ); + const size_t, + const size_t ) override; + const ClassDistribution *NextGraphTest( std::vector&, + size_t& ) override; private: - IB_InstanceBase( const IB_InstanceBase& ); // inhibit copy - IB_InstanceBase& operator=( const IB_InstanceBase& ); // inhibit copy size_t offSet; size_t effFeat; const std::vector *testInst; @@ -255,24 +251,24 @@ namespace Timbl { IG_InstanceBase( size_t size, unsigned long& cnt, bool rand, bool pruned, bool keep_dists ): InstanceBase_base( size, cnt, rand, keep_dists ), Pruned( pruned ) {}; - IG_InstanceBase *clone() const; - IG_InstanceBase *Copy() const; - void Prune( const TargetValue *, long = 0 ); + IG_InstanceBase *clone() const override; + IG_InstanceBase *Copy() const override; + void Prune( const TargetValue *, long = 0 ) override; void specialPrune( const TargetValue * ); - bool IsPruned() const { return Pruned; }; - const ValueDistribution *IG_test( const Instance& , size_t&, bool&, - const TargetValue *& ); - bool ReadIB( std::istream&, - std::vector&, - Target&, - int ); + bool IsPruned() const override { return Pruned; }; + const ClassDistribution *IG_test( const Instance& , + size_t&, + bool&, + const TargetValue *& ) override; bool ReadIB( std::istream&, - std::vector&, - Target&, - Hash::UnicodeHash&, - Hash::UnicodeHash&, - int ); - bool MergeSub( InstanceBase_base * ); + Feature_List&, + Targets&, + int ) override; + bool ReadIB_hashed( std::istream&, + Feature_List&, + Targets&, + int ) override; + bool MergeSub( InstanceBase_base * ) override; protected: bool Pruned; }; @@ -282,13 +278,13 @@ namespace Timbl { TRIBL_InstanceBase( size_t size, unsigned long& cnt, bool rand, bool keep_dists ): InstanceBase_base( size, cnt, rand, keep_dists ), Threshold(0) {}; - TRIBL_InstanceBase *clone() const; - TRIBL_InstanceBase *Copy() const; + TRIBL_InstanceBase *clone() const override; + TRIBL_InstanceBase *Copy() const override; IB_InstanceBase *TRIBL_test( const Instance&, size_t, const TargetValue *&, - const ValueDistribution *&, - size_t& ); + const ClassDistribution *&, + size_t& ) override; private: IB_InstanceBase *IBPartition( IBtree * ) const; void AssignDefaults( size_t ); @@ -301,11 +297,11 @@ namespace Timbl { bool rand, bool keep_dists ): InstanceBase_base( size, cnt, rand, keep_dists ) { }; - TRIBL2_InstanceBase *clone() const; - TRIBL2_InstanceBase *Copy() const; + TRIBL2_InstanceBase *clone() const override; + TRIBL2_InstanceBase *Copy() const override; IB_InstanceBase *TRIBL2_test( const Instance& , - const ValueDistribution *&, - size_t& ); + const ClassDistribution *&, + size_t& ) override; private: IB_InstanceBase *IBPartition( IBtree * ) const; }; diff --git a/include/timbl/Instance.h 
b/include/timbl/Instance.h index cc903f4..145016e 100644 --- a/include/timbl/Instance.h +++ b/include/timbl/Instance.h @@ -28,17 +28,9 @@ #ifndef TIMBL_INSTANCE_H #define TIMBL_INSTANCE_H -#include -#include -#include -#include -#include -#include "unicode/unistr.h" -#include "timbl/MsgClass.h" #include "ticcutils/Unicode.h" - -template -class SparseSymetricMatrix; +#include "timbl/Targets.h" +#include "timbl/Features.h" namespace Hash { class UnicodeHash; @@ -46,308 +38,17 @@ namespace Hash { namespace Timbl { - enum FeatVal_Stat { Unknown, Singleton, SingletonNumeric, NumericValue, - NotNumeric }; - class TargetValue; - - class Vfield{ - friend class ValueDistribution; - friend class WValueDistribution; - friend std::ostream& operator<<( std::ostream&, const Vfield& ); - friend std::ostream& operator<<( std::ostream&, const Vfield * ); - public: - Vfield( const TargetValue *val, int freq, double w ): - value(val), frequency(freq), weight(w) {}; - Vfield( const Vfield& in ): - value(in.value), frequency(in.frequency), weight(in.weight) {}; - ~Vfield(){}; - std::ostream& put( std::ostream& ) const; - const TargetValue *Value() const { return value; }; - void Value( const TargetValue *t ){ value = t; }; - size_t Freq() const { return frequency; }; - void IncFreq( int inc=1 ) { frequency += inc; }; - void AddFreq( int f ) { frequency += f; weight += f; }; - void DecFreq() { frequency -= 1; }; - double Weight() const { return weight; }; - void SetWeight( double w ){ weight = w; }; - size_t Index(); - protected: - const TargetValue *value; - size_t frequency; - double weight; - private: - Vfield& operator=( const Vfield& ); - }; - - class Target; - - class WValueDistribution; - - class ValueDistribution{ - friend std::ostream& operator<<( std::ostream&, const ValueDistribution& ); - friend std::ostream& operator<<( std::ostream&, const ValueDistribution * ); - friend class WValueDistribution; - public: - typedef std::map VDlist; - typedef VDlist::const_iterator dist_iterator; - ValueDistribution( ): total_items(0) {}; - ValueDistribution( const ValueDistribution& ); - virtual ~ValueDistribution(){ clear(); }; - size_t totalSize() const{ return total_items; }; - size_t size() const{ return distribution.size(); }; - bool empty() const{ return distribution.empty(); }; - void clear(); - dist_iterator begin() const { return distribution.begin(); }; - dist_iterator end() const { return distribution.end(); }; - virtual const TargetValue* BestTarget( bool&, bool = false ) const; - void Merge( const ValueDistribution& ); - virtual void SetFreq( const TargetValue *, int, double=1.0 ); - virtual bool IncFreq( const TargetValue *, size_t, double=1.0 ); - void DecFreq( const TargetValue * ); - static ValueDistribution *read_distribution( std::istream&, - Target& , bool ); - static ValueDistribution *read_distribution_hashed( std::istream&, - Target& , bool ); - const std::string DistToString() const; - const std::string DistToStringW( int ) const; - double Confidence( const TargetValue * ) const; - virtual const std::string SaveHashed() const; - virtual const std::string Save() const; - bool ZeroDist() const { return total_items == 0; }; - double Entropy() const; - ValueDistribution *to_VD_Copy( ) const; - virtual WValueDistribution *to_WVD_Copy() const; - protected: - virtual void DistToString( std::string&, double=0 ) const; - virtual void DistToStringWW( std::string&, int ) const; - const TargetValue* BestTargetN( bool &, bool = false ) const; - const TargetValue* BestTargetW( bool &, bool = false ) 
const; - virtual ValueDistribution *clone( ) const { - return new ValueDistribution(); }; - size_t total_items; - VDlist distribution; - }; - - class WValueDistribution: public ValueDistribution { - public: - WValueDistribution(): ValueDistribution() {}; - const TargetValue* BestTarget( bool &, bool = false ) const; - void SetFreq( const TargetValue *, int, double ); - bool IncFreq( const TargetValue *, size_t, double ); - WValueDistribution *to_WVD_Copy( ) const; - const std::string SaveHashed() const; - const std::string Save() const; - void Normalize(); - void Normalize_1( double, const Target * ); - void Normalize_2(); - void MergeW( const ValueDistribution&, double ); - private: - void DistToString( std::string&, double=0 ) const; - void DistToStringWW( std::string&, int ) const; - WValueDistribution *clone() const { - return new WValueDistribution; }; - }; - - class ValueClass { - public: - ValueClass( const icu::UnicodeString& n, size_t i ): - name( n ), index( i ), Frequency( 1 ) {}; - virtual ~ValueClass() {}; - void ValFreq( size_t f ){ Frequency = f; }; - void IncValFreq( int f ){ Frequency += f; }; - size_t ValFreq( ) const { return Frequency; }; - void incr_val_freq(){ Frequency++; }; - void decr_val_freq(){ Frequency--; }; - size_t Index() const { return index; }; - const icu::UnicodeString& name_u() const { return name; }; - const std::string Name() const { return TiCC::UnicodeToUTF8(name); }; - friend std::ostream& operator<<( std::ostream& os, ValueClass const *vc ); - protected: - const icu::UnicodeString& name; - size_t index; - size_t Frequency; - ValueClass( const ValueClass& ); - ValueClass& operator=( const ValueClass& ); - }; - - class TargetValue: public ValueClass { - public: - TargetValue( const icu::UnicodeString&, size_t ); - }; - - class SparseValueProbClass { - friend std::ostream& operator<< ( std::ostream&, SparseValueProbClass * ); - public: - typedef std::map< size_t, double > IDmaptype; - typedef IDmaptype::const_iterator IDiterator; - explicit SparseValueProbClass( size_t d ): dimension(d) {}; - void Assign( const size_t i, const double d ) { vc_map[i] = d; }; - void Clear() { vc_map.clear(); }; - IDiterator begin() const { return vc_map.begin(); }; - IDiterator end() const { return vc_map.end(); }; - private: - IDmaptype vc_map; - size_t dimension; - }; - - class FeatureValue: public ValueClass { - friend class Feature; - friend struct D_D; - public: - explicit FeatureValue( const icu::UnicodeString& ); - FeatureValue( const icu::UnicodeString&, size_t ); - ~FeatureValue(); - void ReconstructDistribution( const ValueDistribution& vd ) { - TargetDist.Merge( vd ); - Frequency = TargetDist.totalSize(); - }; - bool isUnknown() const { return index == 0; }; - SparseValueProbClass *valueClassProb() const { return ValueClassProb; }; - private: - SparseValueProbClass *ValueClassProb; - ValueDistribution TargetDist; - FeatureValue( const FeatureValue& ); // inhibit copies - FeatureValue& operator=( const FeatureValue& ); // inhibit copies - }; - - class BaseFeatTargClass: public MsgClass { - public: - explicit BaseFeatTargClass( Hash::UnicodeHash * ); - virtual ~BaseFeatTargClass(); - virtual size_t EffectiveValues() const = 0; - virtual size_t TotalValues() const = 0; - virtual ValueClass *Lookup( const icu::UnicodeString& ) const = 0; - Hash::UnicodeHash *hash() const { return TokenTree; }; - protected: - Hash::UnicodeHash *TokenTree; - BaseFeatTargClass( const BaseFeatTargClass& ); - private: - BaseFeatTargClass& operator=( const BaseFeatTargClass& ); - }; - 
- class Target: public BaseFeatTargClass { - friend class MBLClass; - friend class WValueDistribution; - friend class ConfusionMatrix; - public: - explicit Target( Hash::UnicodeHash *T ): BaseFeatTargClass(T) {}; - ~Target(); - TargetValue *add_value( const icu::UnicodeString&, int freq = 1 ); - TargetValue *add_value( size_t, int freq = 1 ); - TargetValue *Lookup( const icu::UnicodeString& ) const override; - TargetValue *ReverseLookup( size_t ) const; - bool decrement_value( TargetValue * ); - bool increment_value( TargetValue * ); - TargetValue *MajorityClass() const; - size_t EffectiveValues() const override; - size_t TotalValues() const override; - size_t num_of_values() const { return values_array.size(); }; - private: - std::vector values_array; - std::unordered_map< size_t, TargetValue *> reverse_values; - }; - - class metricClass; - - class Feature: public BaseFeatTargClass { - friend class MBLClass; - public: - explicit Feature( Hash::UnicodeHash *T ); - ~Feature(); - bool Ignore() const { return ignore; }; - void Ignore( const bool val ){ ignore = val; }; - bool setMetricType( const MetricType ); - MetricType getMetricType() const; - double Weight() const { return weight; }; - void SetWeight( const double w ) { weight = w; }; - double InfoGain() const { return info_gain; }; - void InfoGain( const double w ){ info_gain = w; }; - double SplitInfo() const { return split_info; }; - void SplitInfo( const double w ){ split_info = w; }; - double GainRatio() const { return gain_ratio; }; - void GainRatio( const double w ){ gain_ratio = w; }; - double ChiSquare() const { return chi_square; }; - void ChiSquare( const double w ){ chi_square = w; }; - double SharedVariance() const { return shared_variance; }; - void SharedVariance( const double w ){ shared_variance = w; }; - double StandardDeviation() const { return standard_deviation; }; - void StandardDeviation( const double w ){ standard_deviation = w; }; - double Min() const { return n_min; }; - void Min( const double val ){ n_min = val; }; - double Max() const { return n_max; }; - void Max( const double val ){ n_max = val; }; - double fvDistance( FeatureValue *, FeatureValue *, size_t=1 ) const; - FeatureValue *add_value( const icu::UnicodeString&, TargetValue *, int=1 ); - FeatureValue *add_value( size_t, TargetValue *, int=1 ); - FeatureValue *Lookup( const icu::UnicodeString& ) const override; - bool decrement_value( FeatureValue *, TargetValue * ); - bool increment_value( FeatureValue *, TargetValue * ); - size_t EffectiveValues() const override; - size_t TotalValues() const override; - bool isNumerical() const; - bool isStorableMetric() const; - bool AllocSparseArrays( size_t ); - void InitSparseArrays(); - bool ArrayRead(){ return vcpb_read; }; - bool matrixPresent( bool& ) const; - size_t matrix_byte_size() const; - bool store_matrix( int = 1 ); - void clear_matrix(); - bool fill_matrix( std::istream& ); - void print_matrix( std::ostream&, bool = false ) const; - void print_vc_pb_array( std::ostream& ) const; - bool read_vc_pb_array( std::istream & ); - FeatVal_Stat prepare_numeric_stats(); - void Statistics( double, Target *, bool ); - void NumStatistics( double, Target *, int, bool ); - void ClipFreq( size_t f ){ matrix_clip_freq = f; }; - size_t ClipFreq() const { return matrix_clip_freq; }; - SparseSymetricMatrix *metric_matrix; - private: - metricClass *metric; - bool ignore; - bool numeric; - bool vcpb_read; - enum ps_stat{ ps_undef, ps_failed, ps_ok, ps_read }; - enum ps_stat PrestoreStatus; - MetricType Prestored_metric; 
- void delete_matrix(); - double entropy; - double info_gain; - double split_info; - double gain_ratio; - double chi_square; - double shared_variance; - double standard_deviation; - size_t matrix_clip_freq; - long int *n_dot_j; - long int* n_i_dot; - double n_min; - double n_max; - size_t SaveSize; - size_t SaveNum; - double weight; - void Statistics( double ); - void NumStatistics( std::vector&, double ); - void ChiSquareStatistics( std::vector&, Target * ); - void ChiSquareStatistics( Target * ); - void SharedVarianceStatistics( Target *, int ); - void StandardDeviationStatistics(); - Feature( const Feature& ); - Feature& operator=( const Feature& ); - std::vector values_array; - std::unordered_map< size_t, FeatureValue *> reverse_values; - bool is_reference; - }; + class FeatureValue; class Instance { friend std::ostream& operator<<(std::ostream&, const Instance& ); friend std::ostream& operator<<(std::ostream&, const Instance * ); public: Instance(); - explicit Instance( size_t s ): TV(NULL), sample_weight(0.0), occ(1) - { Init( s ); }; + explicit Instance( size_t s ): Instance() { Init( s ); }; + Instance( const Instance& ) = delete; // inhibit copies + Instance& operator=( const Instance& ) = delete; // inhibit copies ~Instance(); void Init( size_t ); void clear(); @@ -357,12 +58,8 @@ namespace Timbl { void Occurrences( const int o ) { occ = o; }; size_t size() const { return FV.size(); }; std::vector FV; - void permute( const std::vector& ); // Obsolete - // NO implementation provided, so the linker will punish us! TargetValue *TV; private: - Instance( const Instance& ); // inhibit copies - Instance& operator=( const Instance& ); // inhibit copies double sample_weight; // relative weight int occ; }; diff --git a/include/timbl/MBLClass.h b/include/timbl/MBLClass.h index ed60783..4a9b5b9 100644 --- a/include/timbl/MBLClass.h +++ b/include/timbl/MBLClass.h @@ -47,7 +47,7 @@ namespace Timbl { class Chopper; class neighborSet; - class MBLClass { + class MBLClass: public MsgClass { public: bool SetOption( const std::string& ); xmlNode *settingsToXml() const; @@ -60,9 +60,10 @@ namespace Timbl { bool MBLInit() const { return MBL_init; }; void MBLInit( bool b ) { MBL_init = b; }; bool ExpInvalid( bool b = true ) const { - if ( err_count > 0 ){ - if ( b ) + if ( err_cnt > 0 ){ + if ( b ){ InvalidMessage(); + } return true; } else @@ -77,6 +78,7 @@ namespace Timbl { int getOcc() const { return doOcc; }; protected: explicit MBLClass( const std::string& = "" ); + void init_options_table( size_t ); MBLClass& operator=( const MBLClass& ); enum PhaseValue { TrainWords, LearnWords, TestWords, TrainLearnWords }; friend std::ostream& operator<< ( std::ostream&, const PhaseValue& ); @@ -94,8 +96,7 @@ namespace Timbl { void writePermutation( std::ostream& ) const; void LearningInfo( std::ostream& ); virtual ~MBLClass(); - void InitClass( const size_t ); - void Initialize( size_t = 0 ); + void Initialize( size_t ); bool PutInstanceBase( std::ostream& ) const; VerbosityFlags get_verbosity() const { return verbosity; }; void set_verbosity( VerbosityFlags v ) { verbosity = v; }; @@ -104,7 +105,7 @@ namespace Timbl { bool HideInstance( const Instance& ); bool UnHideInstance( const Instance& ); icu::UnicodeString formatInstance( const std::vector&, - std::vector&, + const std::vector&, size_t, size_t ) const; bool setInputFormat( const InputFormatType ); size_t countFeatures( const icu::UnicodeString&, @@ -116,18 +117,18 @@ namespace Timbl { InstanceBase_base * = NULL, size_t = 0 ); icu::UnicodeString 
get_org_input( ) const; - const ValueDistribution *ExactMatch( const Instance& ) const; + const ClassDistribution *ExactMatch( const Instance& ) const; void fillNeighborSet( neighborSet& ) const; void addToNeighborSet( neighborSet& ns, size_t n ) const; double getBestDistance() const; - WValueDistribution *getBestDistribution( unsigned int =0 ); + WClassDistribution *getBestDistribution( unsigned int =0 ); IB_Stat IBStatus() const; bool get_ranges( const std::string& ); - bool get_IB_Info( std::istream&, bool&, int&, bool&, std::string& ); - size_t NumOfFeatures() const { return num_of_features; }; + size_t get_IB_Info( std::istream&, bool&, int&, bool&, std::string& ); + size_t NumOfFeatures() const { return features._num_of_feats; }; size_t targetPos() const { return target_pos; }; - size_t NumNumFeatures() const { return num_of_num_features; }; - size_t EffectiveFeatures() const { return effective_feats; }; + size_t NumNumFeatures() const { return features._num_of_num_feats; }; + size_t EffectiveFeatures() const { return features._eff_feats; }; void IBInfo( std::ostream& os ) const; void MatrixInfo( std::ostream& ) const; int RandomSeed() const { return random_seed; }; @@ -139,10 +140,8 @@ namespace Timbl { int Progress() const { return progress; }; void Progress( int p ){ progress = p; }; std::string extract_limited_m( size_t ); - Target *Targets; - std::vector Features; - std::vector PermFeatures; - std::vector permutation; + Targets targets; + Feature_List features; InstanceBase_base *InstanceBase; std::ostream *mylog; std::ostream *myerr; @@ -165,6 +164,9 @@ namespace Timbl { void set_order(void); void calculatePermutation( const std::vector& ); void calculate_fv_entropy( bool ); + bool recalculate_stats( Feature_List&, + std::vector&, + bool ); OptionTableClass Options; PhaseValue runningPhase; WeightType Weighting; @@ -173,8 +175,6 @@ namespace Timbl { size_t num_of_neighbors; bool dynamic_neighbors; DecayType decay_flag; - Hash::UnicodeHash *TargetStrings; - Hash::UnicodeHash *FeatureStrings; std::string exp_name; Instance CurrInst; BestArray bestArray; @@ -205,11 +205,7 @@ namespace Timbl { std::vector UserOptions; InputFormatType input_format; VerbosityFlags verbosity; - mutable int err_count; - size_t num_of_features; - size_t num_of_num_features; size_t target_pos; - size_t effective_feats; int clip_factor; int Bin_Size; int progress; @@ -243,9 +239,8 @@ namespace Timbl { return false; } }; - void fill_table(); void InvalidMessage() const ; - double calculate_db_entropy( Target * ); + void do_numeric_statistics( ); void test_instance( const Instance& , diff --git a/include/timbl/Makefile.am b/include/timbl/Makefile.am index bcba8f8..c3bd077 100644 --- a/include/timbl/Makefile.am +++ b/include/timbl/Makefile.am @@ -2,7 +2,8 @@ # $URL$ pkginclude_HEADERS = Common.h GetOptClass.h IBtree.h Matrices.h \ - Instance.h MBLClass.h MsgClass.h BestArray.h \ + Features.h Targets.h Instance.h \ + MBLClass.h MsgClass.h BestArray.h \ StringOps.h TimblAPI.h Options.h \ TimblExperiment.h Types.h neighborSet.h Statistics.h \ Choppers.h Testers.h Metrics.h diff --git a/include/timbl/Metrics.h b/include/timbl/Metrics.h index 61ca7ae..a223367 100644 --- a/include/timbl/Metrics.h +++ b/include/timbl/Metrics.h @@ -60,98 +60,128 @@ namespace Timbl{ public: explicit distanceMetricClass( MetricType m ): metricClass(m){}; virtual ~distanceMetricClass() {}; - bool isSimilarityMetric() const { return false; }; + bool isSimilarityMetric() const override { return false; }; }; class OverlapMetric: public 
distanceMetricClass { public: OverlapMetric(): distanceMetricClass( Overlap ){}; - bool isNumerical() const { return false; }; - bool isStorable() const { return false; }; - double distance( FeatureValue *, FeatureValue *, size_t, double ) const; + bool isNumerical() const override { return false; }; + bool isStorable() const override { return false; }; + double distance( FeatureValue *, + FeatureValue *, + size_t, + double ) const override; }; class NumericMetricClass: public distanceMetricClass { public: explicit NumericMetricClass( MetricType m ): distanceMetricClass( m ){}; virtual ~NumericMetricClass() {}; - bool isNumerical() const { return true; }; - bool isStorable() const { return false; }; + bool isNumerical() const override { return true; }; + bool isStorable() const override { return false; }; }; class NumericMetric: public NumericMetricClass { public: NumericMetric(): NumericMetricClass( Numeric ){}; - double distance( FeatureValue *, FeatureValue *, size_t, double ) const; + double distance( FeatureValue *, + FeatureValue *, + size_t, + double ) const override; }; class EuclideanMetric: public NumericMetricClass { public: EuclideanMetric(): NumericMetricClass( Euclidean ){}; - double distance( FeatureValue *, FeatureValue *, size_t, double ) const; + double distance( FeatureValue *, + FeatureValue *, + size_t, + double ) const override; }; class ValueDiffMetric: public distanceMetricClass { public: ValueDiffMetric(): distanceMetricClass( ValueDiff ){}; - bool isNumerical() const { return false; }; - bool isStorable() const { return true; }; - double distance( FeatureValue *, FeatureValue *, size_t, double ) const; + bool isNumerical() const override { return false; }; + bool isStorable() const override { return true; }; + double distance( FeatureValue *, + FeatureValue *, + size_t, + double ) const override; }; class DiceMetric: public distanceMetricClass { public: DiceMetric(): distanceMetricClass( Dice ){}; - bool isNumerical() const { return false; }; - bool isStorable() const { return true; }; - double distance( FeatureValue *, FeatureValue *, size_t, double ) const; + bool isNumerical() const override { return false; }; + bool isStorable() const override { return true; }; + double distance( FeatureValue *, + FeatureValue *, + size_t, + double ) const override; }; class JeffreyMetric: public distanceMetricClass { public: JeffreyMetric(): distanceMetricClass( JeffreyDiv ){}; - bool isNumerical() const { return false; }; - bool isStorable() const { return true; }; - double distance( FeatureValue *, FeatureValue *, size_t, double ) const; + bool isNumerical() const override{ return false; }; + bool isStorable() const override { return true; }; + double distance( FeatureValue *, + FeatureValue *, + size_t, + double ) const override; }; class JSMetric: public distanceMetricClass { public: JSMetric(): distanceMetricClass( JSDiv ){}; - bool isNumerical() const { return false; }; - bool isStorable() const { return true; }; - double distance( FeatureValue *, FeatureValue *, size_t, double ) const; + bool isNumerical() const override { return false; }; + bool isStorable() const override { return true; }; + double distance( FeatureValue *, + FeatureValue *, + size_t, + double ) const override; }; class LevenshteinMetric: public distanceMetricClass { public: LevenshteinMetric(): distanceMetricClass( Levenshtein ){}; - bool isNumerical() const { return false; }; - bool isStorable() const { return true; }; - double distance( FeatureValue *, FeatureValue *, size_t, double ) const; + 
bool isNumerical() const override { return false; }; + bool isStorable() const override { return true; }; + double distance( FeatureValue *, + FeatureValue *, + size_t, + double ) const override; }; class similarityMetricClass: public metricClass { public: explicit similarityMetricClass( MetricType m ): metricClass( m ){}; virtual ~similarityMetricClass() {}; - bool isSimilarityMetric() const { return true; }; - bool isNumerical() const { return true; }; - bool isStorable() const { return false; }; + bool isSimilarityMetric() const override { return true; }; + bool isNumerical() const override { return true; }; + bool isStorable() const override { return false; }; }; class CosineMetric: public similarityMetricClass { public: CosineMetric(): similarityMetricClass( Cosine ){}; - double distance( FeatureValue *, FeatureValue *, size_t, double ) const; - double get_max_similarity() const { return 1.0; }; + double distance( FeatureValue *, + FeatureValue *, + size_t, + double ) const override; + double get_max_similarity() const override { return 1.0; }; }; class DotProductMetric: public similarityMetricClass { public: DotProductMetric(): similarityMetricClass( DotProduct ){}; - double distance( FeatureValue *, FeatureValue *, size_t, double ) const; - double get_max_similarity() const { + double distance( FeatureValue *, + FeatureValue *, + size_t, + double ) const override; + double get_max_similarity() const override { return std::numeric_limits::max(); }; }; diff --git a/include/timbl/MsgClass.h b/include/timbl/MsgClass.h index b98976f..df5243a 100644 --- a/include/timbl/MsgClass.h +++ b/include/timbl/MsgClass.h @@ -29,14 +29,17 @@ #define TIMBL_MSGCLASS_H namespace Timbl { - class MsgClass{ + class MsgClass { public: - MsgClass() {}; + MsgClass(): + err_cnt(0) + {}; virtual ~MsgClass() {}; virtual void Info( const std::string& ) const; virtual void Warning( const std::string& ) const ; virtual void Error( const std::string& ) const ; virtual void FatalError( const std::string& ) const ; + mutable int err_cnt; }; } diff --git a/include/timbl/Options.h b/include/timbl/Options.h index 09c75b6..fd62a20 100644 --- a/include/timbl/Options.h +++ b/include/timbl/Options.h @@ -29,12 +29,12 @@ #define TIMBL_OPTIONS_H #include +#include #include #include #include "ticcutils/StringOps.h" namespace Timbl { - const int MAX_TABLE_SIZE = 50; class OptionClass { friend class OptionTableClass; @@ -52,23 +52,25 @@ namespace Timbl { }; template - class OptionClassT: public OptionClass { - public: + class OptionClassT: public OptionClass { + public: OptionClassT( const std::string& n, Type *tp, Type t ):OptionClass(n), - Content(tp) { *Content = t; }; - virtual bool set_option( const std::string& line ){ + Content(tp) { *Content = t; }; + virtual bool set_option( const std::string& line ) override { Type T; bool result = TiCC::stringTo( line, T ); - if ( result ) *Content = T; + if ( result ) { + *Content = T; + } return result; }; - virtual std::ostream& show_opt( std::ostream &os ) const { + virtual std::ostream& show_opt( std::ostream &os ) const override { os.width(20); os.setf( std::ios::left, std::ios::adjustfield ); os << Name << " : " << TiCC::toString(*Content); return os; }; - virtual std::ostream& show_full( std::ostream &os ) const { + virtual std::ostream& show_full( std::ostream &os ) const override { return show_opt( os ); }; private: @@ -89,7 +91,7 @@ namespace Timbl { } template <> - inline std::ostream& OptionClassT::show_full( std::ostream &os ) const { + inline std::ostream& 
OptionClassT::show_full( std::ostream &os ) const{ os.width(20); os.setf( std::ios::left, std::ios::adjustfield ); os.setf( std::ios::boolalpha ); @@ -115,8 +117,9 @@ namespace Timbl { os.setf( std::ios::left, std::ios::adjustfield ); os << Name << " : {"; InputFormatType i = UnknownInputFormat; - for ( ++i; i < MaxInputFormat-1; ++i ) + for ( ++i; i < MaxInputFormat-1; ++i ){ os << TiCC::toString(i) << ", "; + } os << TiCC::toString(i) << "}, [ " << TiCC::toString(*Content) << "]"; return os; @@ -131,8 +134,9 @@ namespace Timbl { os.setf( std::ios::left, std::ios::adjustfield ); os << Name << " : {"; MetricType i = UnknownMetric; - for ( ++i; i < MaxMetric-1; ++i ) + for ( ++i; i < MaxMetric-1; ++i ){ os << TiCC::toString(i) << ", "; + } os << TiCC::toString(i) << "}, [ " << TiCC::toString(*Content) << "]"; return os; @@ -146,8 +150,9 @@ namespace Timbl { os.setf( std::ios::left, std::ios::adjustfield ); os << Name << " : {"; AlgorithmType i = Unknown_a; - for ( ++i; i < Max_a-1; ++i ) + for ( ++i; i < Max_a-1; ++i ){ os << TiCC::toString(i) << ", "; + } os << TiCC::toString(i) << "}, [ " << TiCC::toString(*Content) << "]"; return os; @@ -161,8 +166,9 @@ namespace Timbl { os.setf( std::ios::left, std::ios::adjustfield ); os << Name << " : {"; DecayType i = UnknownDecay; - for ( ++i; i < MaxDecay-1; ++i ) + for ( ++i; i < MaxDecay-1; ++i ){ os << TiCC::toString(i) << ", "; + } os << TiCC::toString(i) << "}, [ " << TiCC::toString(*Content) << "]"; return os; @@ -176,8 +182,9 @@ namespace Timbl { os.setf( std::ios::left, std::ios::adjustfield ); os << Name << " : {"; SmoothingType i = UnknownSmoothing; - for ( ++i; i < MaxSmoothing-1; ++i ) + for ( ++i; i < MaxSmoothing-1; ++i ){ os << TiCC::toString(i) << ", "; + } os << TiCC::toString(i) << "}, [ " << TiCC::toString(*Content) << "]"; return os; @@ -191,8 +198,9 @@ namespace Timbl { os.setf( std::ios::left, std::ios::adjustfield ); os << Name << " : {"; WeightType i = Unknown_w; - for ( ++i; i < Max_w-1; ++i ) + for ( ++i; i < Max_w-1; ++i ){ os << TiCC::toString(i) << ", "; + } os << TiCC::toString(i) << "}, [ " << TiCC::toString(*Content) << "]"; return os; @@ -206,8 +214,9 @@ namespace Timbl { os.setf( std::ios::left, std::ios::adjustfield ); os << Name << " : {"; OrdeningType i = UnknownOrdening; - for ( ++i; i < MaxOrdening-1; ++i ) + for ( ++i; i < MaxOrdening-1; ++i ){ os << TiCC::toString(i) << ", "; + } os << TiCC::toString(i) << "}, [ " << TiCC::toString(*Content) << "]"; return os; @@ -221,8 +230,9 @@ namespace Timbl { os.setf( std::ios::left, std::ios::adjustfield ); os << Name << " : {"; normType i = unknownNorm; - for ( ++i; i < maxNorm-1; ++i ) + for ( ++i; i < maxNorm-1; ++i ){ os << TiCC::toString(i) << ", "; + } os << TiCC::toString(i) << "}, [ " << TiCC::toString(*Content) << "]"; return os; @@ -232,16 +242,16 @@ namespace Timbl { // Array of options types // template - class OptionArrayClass: public OptionClass { - public: + class OptionArrayClass: public OptionClass { + public: OptionArrayClass( const std::string& n, std::vector& ta, const size_t size ): OptionClass( n ), TA(ta), Size(size ){}; - protected: + protected: std::vector& TA; size_t Size; - private: + private: OptionArrayClass(const OptionArrayClass&); OptionArrayClass& operator = (const OptionArrayClass&); }; @@ -253,13 +263,12 @@ namespace Timbl { std::vector& mp, MetricType& m, size_t s ): - OptionArrayClass( n, mp, s ), def(m){ - for ( size_t i=0; i < s; i++ ) - TA[i] = m; + OptionArrayClass( n, mp, s ), def(m){ + TA.resize(s,m); }; - bool set_option( 
const std::string& line ); - std::ostream& show_opt( std::ostream &os ) const; - std::ostream& show_full( std::ostream &os ) const; + bool set_option( const std::string& line ) override; + std::ostream& show_opt( std::ostream &os ) const override; + std::ostream& show_full( std::ostream &os ) const override; private: const MetricType& def; }; @@ -271,8 +280,9 @@ namespace Timbl { bool result = TiCC::split_at( line, res, "=" ) == 2 && TiCC::stringTo( res[1], m ) && TiCC::stringTo( res[0], i, 0, Size ); - if ( result ) + if ( result ){ TA[i] = m; + } return result; } @@ -280,9 +290,11 @@ namespace Timbl { os.width(20); os.setf( std::ios::left, std::ios::adjustfield ); os << Name << " : "; - for ( size_t i=0; i < Size; i++ ) - if ( TA[i] != def ) + for ( size_t i=0; i < Size; i++ ){ + if ( TA[i] != def ){ os << i << ":" << TiCC::toString(TA[i]) << ", "; + } + } return os; } @@ -293,10 +305,12 @@ namespace Timbl { bool first = true; for ( size_t i=0; i < Size; i++ ){ if ( TA[i] != def ){ - if ( !first ) + if ( !first ){ os << ","; - else + } + else { first = false; + } os << i << ":" << TiCC::toString(TA[i]); } } @@ -315,20 +329,22 @@ namespace Timbl { Content( tp), minVal( Min ), maxVal( Max ) { *Content = t; }; - virtual bool set_option( const std::string& line ){ + virtual bool set_option( const std::string& line ) override { Type T; bool result = TiCC::stringTo( line, T, minVal, maxVal ); - if ( result ) *Content = T; + if ( result ) { + *Content = T; + } return result; }; - virtual std::ostream& show_opt( std::ostream &os ) const { + virtual std::ostream& show_opt( std::ostream &os ) const override { os.width(20); os.setf( std::ios::showpoint ); os.setf( std::ios::left, std::ios::adjustfield ); os << Name << " : " << *Content; return os; }; - virtual std::ostream& show_full( std::ostream &os ) const { + virtual std::ostream& show_full( std::ostream &os ) const override { os.width(20); os.setf( std::ios::showpoint ); os.setf( std::ios::left, std::ios::adjustfield ); @@ -351,44 +367,76 @@ namespace Timbl { enum SetOptRes { Opt_OK, Opt_Frozen, Opt_Unknown, Opt_Ill_Val}; + struct ci_less + { + // case-independent (ci) compare_less binary function + struct nocase_compare + { + bool operator() (const unsigned char& c1, const unsigned char& c2) const { + return tolower (c1) < tolower (c2); + } + }; + bool operator() (const std::string & s1, const std::string & s2) const { + return std::lexicographical_compare + (s1.begin(), s1.end(), // source range + s2.begin(), s2.end(), // dest range + nocase_compare()); // comparison + } + }; + class OptionTableClass { public: + OptionTableClass(): + table_frozen(false){}; + OptionTableClass( const OptionTableClass& ) = delete; // forbid copies + OptionTableClass& operator=( const OptionTableClass& ) = delete; // forbid copies + ~OptionTableClass(){ + for ( const auto& it : global_table ){ + delete it.second; + } + for ( const auto& it : runtime_table ){ + delete it.second; + } + }; bool Add( OptionClass *opt ){ - Table[table_size++] = opt; - return table_size < MAX_TABLE_SIZE; + // std::cerr << "Table add: " << opt->Name << std::endl; + runtime_table[opt->Name] = opt; + return true; }; - void SetFreezeMark(void){ table_start = table_size; }; - void FreezeTable(void){ table_frozen = true; }; + void FreezeTable(void); bool TableFrozen(void){ return table_frozen; }; SetOptRes SetOption( const std::string& ); void Show_Settings( std::ostream& ) const; void Show_Options( std::ostream& ) const; - OptionTableClass(): - table_start(0), table_size(0), 
table_frozen(false),Table(0){ - Table = new OptionClass *[MAX_TABLE_SIZE]; }; - ~OptionTableClass(){ - for ( int i=0; i < table_size; i++ ) - delete Table[i]; - delete [] Table; - }; private: - int table_start; - int table_size; bool table_frozen; - OptionClass **Table; + std::map runtime_table; + std::map global_table; inline OptionClass *look_up( const std::string&, bool & ); - OptionTableClass( const OptionTableClass& ); - OptionTableClass& operator=( const OptionTableClass& ); }; + inline void OptionTableClass::FreezeTable(void){ + global_table = runtime_table; + runtime_table.clear(); + table_frozen = true; + } + inline void OptionTableClass::Show_Settings( std::ostream& os ) const{ - for ( int i=0; i show_opt( os ) << std::endl; + for ( const auto& it: global_table ){ + it.second->show_opt( os ) << std::endl; + } + for ( const auto& it: runtime_table ){ + it.second->show_opt( os ) << std::endl; + } } inline void OptionTableClass::Show_Options( std::ostream& os ) const { - for ( int i=0; i show_full( os ) << std::endl; + for ( const auto& it: global_table ){ + it.second->show_full( os ) << std::endl; + } + for ( const auto& it: runtime_table ){ + it.second->show_full( os ) << std::endl; + } } inline void split_line( const std::string& line, @@ -409,11 +457,21 @@ namespace Timbl { inline OptionClass *OptionTableClass::look_up( const std::string& option_name, bool &runtime ){ - for ( int i=0; i < table_size; i++ ) - if ( compare_nocase( option_name, Table[i]->Name ) ){ - runtime = (i >= table_start || !table_frozen ); - return Table[i]; + // std::cerr << "lookup: " << option_name << std::endl; + const auto itr = runtime_table.find( option_name ); + if ( itr != runtime_table.end() ){ + runtime = true; + // std::cerr << "FOUND: runtime= " << option_name << std::endl; + return itr->second; + } + else { + const auto itg = global_table.find( option_name ); + if ( itg != global_table.end() ){ + runtime = table_frozen; + // std::cerr << "FOUND global= " << option_name << std::endl; + return itg->second; } + } return NULL; } @@ -425,14 +483,16 @@ namespace Timbl { split_line( line, option_name, value ); OptionClass *option = look_up( option_name, runtime ); if ( option ){ - if ( !runtime ) + if ( !runtime ){ result = Opt_Frozen; // may not be changed at this stage - else - if ( !option->set_option( value ) ) - result = Opt_Ill_Val; // illegal value + } + else if ( !option->set_option( value ) ){ + result = Opt_Ill_Val; // illegal value + } } - else + else { result = Opt_Unknown; // What the hell ??? 
+ } return result; } diff --git a/include/timbl/Statistics.h b/include/timbl/Statistics.h index 0f51ac2..e5fe93d 100644 --- a/include/timbl/Statistics.h +++ b/include/timbl/Statistics.h @@ -31,7 +31,7 @@ #include "timbl/MsgClass.h" namespace Timbl { - class Target; + class Targets; class TargetValue; class ConfusionMatrix: public MsgClass { @@ -41,8 +41,8 @@ namespace Timbl { explicit ConfusionMatrix( size_t ); virtual ~ConfusionMatrix(); void Increment( const TargetValue*, const TargetValue* ); - void Print( std::ostream&, const Target * ) const; - void FScore( std::ostream&, const Target *, bool ) const; + void Print( std::ostream&, const Targets& ) const; + void FScore( std::ostream&, const Targets&, bool ) const; void merge( const ConfusionMatrix * ); }; diff --git a/include/timbl/Targets.h b/include/timbl/Targets.h new file mode 100644 index 0000000..fe19b3b --- /dev/null +++ b/include/timbl/Targets.h @@ -0,0 +1,204 @@ +/* + Copyright (c) 1998 - 2023 + ILK - Tilburg University + CLST - Radboud University + CLiPS - University of Antwerp + + This file is part of timbl + + timbl is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + timbl is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + For questions and suggestions, see: + https://github.com/LanguageMachines/timbl/issues + or send mail to: + lamasoftware (at ) science.ru.nl + +*/ +#ifndef TIMBL_TARGETS_H +#define TIMBL_TARGETS_H + +#include +#include +#include +#include "unicode/unistr.h" +#include "timbl/MsgClass.h" +#include "ticcutils/Unicode.h" + +namespace Hash { + class UnicodeHash; +} + +namespace Timbl { + + class ValueClass { + public: + ValueClass( const icu::UnicodeString& n, size_t i ): + _name( n ), _index( i ), _frequency( 1 ) {}; + ValueClass( const ValueClass& ) = delete; // forbid copies + ValueClass& operator=( const ValueClass& ) = delete; // forbid copies + virtual ~ValueClass() {}; + void ValFreq( size_t f ){ _frequency = f; }; + void IncValFreq( int f ){ _frequency += f; }; + size_t ValFreq( ) const { return _frequency; }; + void incr_val_freq(){ ++_frequency; }; + void decr_val_freq(){ --_frequency; }; + size_t Index() const { return _index; }; + const icu::UnicodeString& name() const { return _name; }; + const std::string name_string() const { return TiCC::UnicodeToUTF8(_name);}; + // temporary for backward compatability + const icu::UnicodeString& name_u() const { return _name; }; // HACK + const std::string Name() const { return TiCC::UnicodeToUTF8(_name); }; // HACK + // REMOVE ^^^^ + friend std::ostream& operator<<( std::ostream& os, ValueClass const *vc ); + protected: + const icu::UnicodeString& _name; + size_t _index; + size_t _frequency; + }; + + class TargetValue: public ValueClass { + public: + TargetValue( const icu::UnicodeString&, size_t ); + }; + + class Targets: public MsgClass { + friend class MBLClass; + friend class WClassDistribution; + friend class ConfusionMatrix; + public: + explicit Targets( Hash::UnicodeHash *T ): + target_hash( T ), + is_reference(false) + {}; + ~Targets(); + Targets& operator=( const Targets& ); + void 
init(); + TargetValue *add_value( const icu::UnicodeString&, int freq = 1 ); + TargetValue *add_value( size_t, int freq = 1 ); + TargetValue *Lookup( const icu::UnicodeString& ) const; + TargetValue *ReverseLookup( size_t ) const; + bool decrement_value( TargetValue * ); + bool increment_value( TargetValue * ); + TargetValue *MajorityClass() const; + size_t EffectiveValues() const; + size_t TotalValues() const; + size_t num_of_values() const { return values_array.size(); }; + Hash::UnicodeHash *hash() const { return target_hash; }; + private: + Hash::UnicodeHash *target_hash; + std::vector values_array; + std::unordered_map< size_t, TargetValue *> reverse_values; + bool is_reference; + }; + + class Vfield{ + friend class ClassDistribution; + friend class WClassDistribution; + friend std::ostream& operator<<( std::ostream&, const Vfield& ); + friend std::ostream& operator<<( std::ostream&, const Vfield * ); + public: + Vfield( const TargetValue *val, int freq, double w ): + value(val), frequency(freq), weight(w) {}; + Vfield( const Vfield& in ): + value(in.value), frequency(in.frequency), weight(in.weight) {}; + Vfield& operator=( const Vfield& ) = delete; // forbid copies + ~Vfield(){}; + std::ostream& put( std::ostream& ) const; + const TargetValue *Value() const { return value; }; + void Value( const TargetValue *t ){ value = t; }; + size_t Freq() const { return frequency; }; + void IncFreq( int inc=1 ) { frequency += inc; }; + void AddFreq( int f ) { frequency += f; weight += f; }; + void DecFreq() { frequency -= 1; }; + double Weight() const { return weight; }; + void SetWeight( double w ){ weight = w; }; + size_t Index(); + protected: + const TargetValue *value; + size_t frequency; + double weight; + private: + }; + + class WClassDistribution; + + class ClassDistribution{ + friend std::ostream& operator<<( std::ostream&, const ClassDistribution& ); + friend std::ostream& operator<<( std::ostream&, const ClassDistribution * ); + friend class WClassDistribution; + public: + typedef std::map VDlist; + typedef VDlist::const_iterator dist_iterator; + ClassDistribution( ): total_items(0) {}; + ClassDistribution( const ClassDistribution& ); + virtual ~ClassDistribution(){ clear(); }; + size_t totalSize() const{ return total_items; }; + size_t size() const{ return distribution.size(); }; + bool empty() const{ return distribution.empty(); }; + void clear(); + dist_iterator begin() const { return distribution.begin(); }; + dist_iterator end() const { return distribution.end(); }; + virtual const TargetValue* BestTarget( bool&, bool = false ) const; + void Merge( const ClassDistribution& ); + virtual void SetFreq( const TargetValue *, int, double=1.0 ); + virtual bool IncFreq( const TargetValue *, size_t, double=1.0 ); + void DecFreq( const TargetValue * ); + static ClassDistribution *read_distribution( std::istream&, + Targets&, + bool ); + static ClassDistribution *read_distribution_hashed( std::istream&, + Targets&, + bool ); + const std::string DistToString() const; + const std::string DistToStringW( int ) const; + double Confidence( const TargetValue * ) const; + virtual const std::string SaveHashed() const; + virtual const std::string Save() const; + bool ZeroDist() const { return total_items == 0; }; + double Entropy() const; + ClassDistribution *to_VD_Copy( ) const; + virtual WClassDistribution *to_WVD_Copy() const; + protected: + virtual void DistToString( std::string&, double=0 ) const; + virtual void DistToStringWW( std::string&, int ) const; + const TargetValue* BestTargetN( bool &, 
bool = false ) const; + const TargetValue* BestTargetW( bool &, bool = false ) const; + virtual ClassDistribution *clone( ) const { + return new ClassDistribution(); }; + size_t total_items; + VDlist distribution; + }; + + class WClassDistribution: public ClassDistribution { + public: + WClassDistribution(): ClassDistribution() {}; + const TargetValue* BestTarget( bool &, bool = false ) const override; + void SetFreq( const TargetValue *, int, double ) override; + bool IncFreq( const TargetValue *, size_t, double ) override; + WClassDistribution *to_WVD_Copy( ) const override; + const std::string SaveHashed() const override; + const std::string Save() const override; + void Normalize(); + void Normalize_1( double, const Targets& ); + void Normalize_2(); + void MergeW( const ClassDistribution&, double ); + private: + void DistToString( std::string&, double=0 ) const override; + void DistToStringWW( std::string&, int ) const override; + WClassDistribution *clone() const override { + return new WClassDistribution; }; + }; + +} +#endif // TINBL_TARGETS_H diff --git a/include/timbl/Testers.h b/include/timbl/Testers.h index f266026..18dbbfe 100644 --- a/include/timbl/Testers.h +++ b/include/timbl/Testers.h @@ -42,7 +42,7 @@ namespace Timbl{ public: double test( FeatureValue *FV, FeatureValue *G, - Feature *Feat ) const; + Feature *Feat ) const override; }; class valueDiffTestFunction: public metricTestFunction { @@ -53,15 +53,16 @@ namespace Timbl{ {}; double test( FeatureValue *, FeatureValue *, - Feature * ) const; + Feature * ) const override; protected: int threshold; }; class TesterClass { public: - TesterClass( const std::vector&, - const std::vector & ); + TesterClass( const Feature_List& ); + TesterClass( const TesterClass& ) = delete; // inhibit copies + TesterClass& operator=( const TesterClass& ) = delete; // inhibit copies virtual ~TesterClass(){}; void init( const Instance&, size_t, size_t ); virtual size_t test( std::vector&, @@ -74,78 +75,61 @@ namespace Timbl{ size_t offSet; const std::vector *FV; const std::vector &features; - std::vector permFeatures; const std::vector &permutation; + std::vector permFeatures; std::vector distances; private: - TesterClass( const TesterClass& ); // inhibit copies - TesterClass& operator=( const TesterClass& ); // inhibit copies }; class DistanceTester: public TesterClass { public: - DistanceTester( const std::vector&, - const std::vector&, + DistanceTester( const Feature_List&, int ); ~DistanceTester(); - double getDistance( size_t ) const; + double getDistance( size_t ) const override; size_t test( std::vector&, size_t, - double ); + double ) override; private: - DistanceTester( const DistanceTester& ); // inhibit copies - DistanceTester& operator=( const DistanceTester& ); // inhibit copies - metricTestFunction **metricTest; - + std::vector metricTest; }; class SimilarityTester: public TesterClass { public: - SimilarityTester( const std::vector& pf, - const std::vector& p ): - TesterClass( pf, p ){}; + SimilarityTester( const Feature_List& pf ): + TesterClass( pf ){}; ~SimilarityTester() {}; virtual size_t test( std::vector&, size_t, - double ) = 0; + double ) override = 0; protected: private: - SimilarityTester( const SimilarityTester & ); // inhibit copies - SimilarityTester& operator=( const SimilarityTester & ); // inhibit copies }; class CosineTester: public SimilarityTester { public: - CosineTester( const std::vector& pf, - const std::vector& p ): - SimilarityTester( pf, p ){}; - double getDistance( size_t ) const; + CosineTester( const 
Feature_List& pf ): + SimilarityTester( pf ){}; + double getDistance( size_t ) const override; size_t test( std::vector&, size_t, - double ); + double ) override; private: - CosineTester( const CosineTester & ); // inhibit copies - CosineTester& operator=( const CosineTester & ); // inhibit copies }; class DotProductTester: public SimilarityTester { public: - DotProductTester( const std::vector& pf, - const std::vector& p ): - SimilarityTester( pf, p ){}; - double getDistance( size_t ) const; + DotProductTester( const Feature_List& pf ): + SimilarityTester( pf ){}; + double getDistance( size_t ) const override; size_t test( std::vector&, size_t, - double ); + double ) override; private: - DotProductTester( const DotProductTester & ); // inhibit copies - DotProductTester& operator=( const DotProductTester & ); // inhibit copies - }; TesterClass* getTester( MetricType, - const std::vector&, - const std::vector&, + const Feature_List&, int ); } diff --git a/include/timbl/TimblAPI.h b/include/timbl/TimblAPI.h index 3ed3979..27195a9 100644 --- a/include/timbl/TimblAPI.h +++ b/include/timbl/TimblAPI.h @@ -33,7 +33,6 @@ #include #include "ticcutils/CommandLine.h" #include "timbl/Common.h" -#include "timbl/MsgClass.h" #include "timbl/Types.h" #include "timbl/Instance.h" #include "timbl/neighborSet.h" @@ -83,19 +82,19 @@ namespace Timbl{ const std::string& = "" ); const TargetValue *Classify( const std::string& ); const TargetValue *Classify( const std::string&, - const ValueDistribution *& ); + const ClassDistribution *& ); const TargetValue *Classify( const std::string&, double& ); const TargetValue *Classify( const std::string&, - const ValueDistribution *&, + const ClassDistribution *&, double& ); const TargetValue *Classify( const icu::UnicodeString& ); const TargetValue *Classify( const icu::UnicodeString&, - const ValueDistribution *& ); + const ClassDistribution *& ); const TargetValue *Classify( const icu::UnicodeString&, double& ); const TargetValue *Classify( const icu::UnicodeString&, - const ValueDistribution *&, + const ClassDistribution *&, double& ); const neighborSet *classifyNS( const icu::UnicodeString& ); bool classifyNS( const icu::UnicodeString&, @@ -105,7 +104,7 @@ namespace Timbl{ return classifyNS( TiCC::UnicodeFromUTF8(in), st ); } const Instance *lastHandledInstance() const; - const Target *myTargets() const; + const Targets& myTargets() const; bool Classify( const std::string&, std::string& ); bool Classify( const std::string&, @@ -153,7 +152,7 @@ namespace Timbl{ bool initExperiment(); private: TimblAPI(); - TimblAPI& operator=( const TimblAPI& ); // so nobody may use them + TimblAPI& operator=( const TimblAPI& ); // forbid copies TimblExperiment *pimpl; bool i_am_fine; }; @@ -163,5 +162,7 @@ namespace Timbl{ bool string_to( const std::string&, Algorithm& ); bool string_to( const std::string&, Weighting& ); + typedef ClassDistribution ValueDistribution; // for backward compatability + typedef WClassDistribution WValueDistribution; // for backward compatability } #endif // TIMBL_API_H diff --git a/include/timbl/TimblExperiment.h b/include/timbl/TimblExperiment.h index 0990558..06619d8 100644 --- a/include/timbl/TimblExperiment.h +++ b/include/timbl/TimblExperiment.h @@ -30,6 +30,7 @@ #define TIMBL_EXPERIMENT_H #include +#include #include #include #include "ticcutils/XMLtools.h" @@ -56,7 +57,8 @@ namespace Timbl { class resultStore: public MsgClass { public: - resultStore(): rawDist(0), + resultStore(): + rawDist(0), dist(0), disposable(false), isTop(false), @@ -66,13 
+68,15 @@ namespace Timbl { best_target(0), targets(0) {}; + resultStore( const resultStore& ) = delete; // inhibit copies + resultStore& operator=( const resultStore& ) = delete; // inhibit copies ~resultStore(); - bool reset( int, normType, double, const Target * ); + bool reset( int, normType, double, const Targets& ); void clear(); - void addConstant( const ValueDistribution *, const TargetValue * ); - void addTop( const ValueDistribution *, const TargetValue * ); - void addDisposable( ValueDistribution *, const TargetValue * ); - const WValueDistribution *getResultDist(); + void addConstant( const ClassDistribution *, const TargetValue * ); + void addTop( const ClassDistribution *, const TargetValue * ); + void addDisposable( ClassDistribution *, const TargetValue * ); + const WClassDistribution *getResultDist(); std::string getResult(); void prepare(); void normalize(); @@ -93,17 +97,15 @@ namespace Timbl { } }; private: - resultStore( const resultStore& ); // inhibit copies - resultStore& operator=( const resultStore& ); // inhibit copies - const ValueDistribution *rawDist; - WValueDistribution *dist; + const ClassDistribution *rawDist; + WClassDistribution *dist; bool disposable; bool isTop; int beam; normType norm; double factor; const TargetValue *best_target; - const Target *targets; + const Targets *targets; std::string topCache; std::string resultCache; }; @@ -134,13 +136,13 @@ namespace Timbl { virtual bool CVprepare( const std::string& = "", WeightType = GR_w, const std::string& = "" ); - virtual bool Increment( const icu::UnicodeString& ) - { FatalError( "Increment" ); return false; }; - virtual bool Decrement( const icu::UnicodeString& ) - { FatalError( "Decrement" ); return false; }; + virtual bool Increment( const icu::UnicodeString& ){ + FatalError( "Increment" ); return false; }; + virtual bool Decrement( const icu::UnicodeString& ){ + FatalError( "Decrement" ); return false; }; virtual bool Expand( const std::string& ); virtual bool Remove( const std::string& ){ - FatalError( "Remove" ); return false;}; + FatalError( "Remove" ); return false;}; virtual bool Test( const std::string&, const std::string& ); virtual bool NS_Test( const std::string&, @@ -202,7 +204,7 @@ namespace Timbl { virtual AlgorithmType Algorithm() const = 0; const TargetValue *Classify( const icu::UnicodeString& Line, - const ValueDistribution *& db, + const ClassDistribution *& db, double& di ){ const TargetValue *res = classifyString( Line, di ); if ( res ){ @@ -211,16 +213,15 @@ namespace Timbl { } return res; } - const TargetValue *Classify( const icu::UnicodeString& Line ){ double dum_d; return classifyString( Line, dum_d ); } const TargetValue *Classify( const icu::UnicodeString& Line, - const ValueDistribution *& db ){ + const ClassDistribution *& db ){ double dum_d; - const TargetValue *res = classifyString( Line, dum_d ); + const TargetValue *res = classifyString( Line, dum_d ); if ( res ){ normalizeResult(); db = bestResult.getResultDist(); @@ -241,7 +242,7 @@ namespace Timbl { TimblExperiment( const AlgorithmType, const std::string& = "" ); virtual bool checkLine( const icu::UnicodeString& ); virtual bool ClassicLearn( const std::string& = "", bool = true ); - virtual const TargetValue *LocalClassify( const Instance& , + virtual const TargetValue *LocalClassify( const Instance&, double&, bool& ); virtual bool GetInstanceBase( std::istream& ) = 0; @@ -258,7 +259,7 @@ namespace Timbl { InstanceBase_base *, size_t = 0 ); void normalizeResult(); - const neighborSet *LocalClassify( const 
Instance& ); + const neighborSet *LocalClassify( const Instance& ); bool nextLine( std::istream &, icu::UnicodeString&, int& ); bool nextLine( std::istream &, icu::UnicodeString& ); bool skipARFFHeader( std::istream & ); @@ -308,23 +309,23 @@ namespace Timbl { IB1_Experiment( const size_t N = DEFAULT_MAX_FEATS, const std::string& s= "", const bool init = true ); - bool Increment( const icu::UnicodeString& ); - bool Decrement( const icu::UnicodeString& ); - bool Remove( const std::string& ); - AlgorithmType Algorithm() const { return IB1_a; }; - void InitInstanceBase(); + bool Increment( const icu::UnicodeString& ) override; + bool Decrement( const icu::UnicodeString& ) override; + bool Remove( const std::string& ) override; + AlgorithmType Algorithm() const override { return IB1_a; }; + void InitInstanceBase() override; bool NS_Test( const std::string&, - const std::string& ); + const std::string& ) override; protected: - TimblExperiment *clone() const { + TimblExperiment *clone() const override { return new IB1_Experiment( MaxFeats(), "", false ); }; - bool checkTestFile(); - bool checkLine( const icu::UnicodeString& ); + bool checkTestFile() override; + bool checkLine( const icu::UnicodeString& ) override; bool Increment( const Instance& I ) { return UnHideInstance( I ); }; bool Decrement( const Instance& I ) { return HideInstance( I ); }; private: - bool GetInstanceBase( std::istream& ); + bool GetInstanceBase( std::istream& ) override; }; class IB2_Experiment: public IB1_Experiment { @@ -333,51 +334,56 @@ namespace Timbl { IB1_Experiment( N, s ) { IB2_offset( 0 ); }; - bool Prepare( const std::string& = "", bool = false, bool = false ); - bool Expand( const std::string& ); - bool Remove( const std::string& ); - bool Learn( const std::string& = "", bool = false ); - AlgorithmType Algorithm() const { return IB2_a; }; + bool Prepare( const std::string& = "", + bool=false, + bool=false ) override; + bool Expand( const std::string& ) override; + bool Remove( const std::string& ) override; + bool Learn( const std::string& = "", bool = false ) override; + AlgorithmType Algorithm() const override { return IB2_a; }; protected: - bool checkTestFile( ); - TimblExperiment *clone() const { return new IB2_Experiment( MaxFeats() ); }; + bool checkTestFile() override; + TimblExperiment *clone() const override { + return new IB2_Experiment( MaxFeats() ); }; bool Expand_N( const std::string& ); bool show_learn_progress( std::ostream& os, time_t, size_t ); }; class LOO_Experiment: public IB1_Experiment { public: - LOO_Experiment( int N, const std::string& s = "" ): - IB1_Experiment( N, s ) { + LOO_Experiment( int N, const std::string& s = "" ): + IB1_Experiment( N, s ) { }; bool Test( const std::string&, - const std::string& ); - AlgorithmType Algorithm() const { return LOO_a; }; - bool ReadInstanceBase( const std::string& ); - void initExperiment( bool = false ); + const std::string& ) override; + AlgorithmType Algorithm() const override { return LOO_a; }; + bool ReadInstanceBase( const std::string& ) override; + void initExperiment( bool = false ) override; protected: - bool checkTestFile( ); - void showTestingInfo( std::ostream& ); + bool checkTestFile() override; + void showTestingInfo( std::ostream& ) override; }; class CV_Experiment: public IB1_Experiment { public: - CV_Experiment( int N = DEFAULT_MAX_FEATS, const std::string& s = "" ): - IB1_Experiment( N, s ), CV_fileW(Unknown_w) { }; - bool Learn( const std::string& = "", bool = true ); - bool Prepare( const std::string& = "", bool = true, bool 
= false ); + CV_Experiment( int N = DEFAULT_MAX_FEATS, const std::string& s = "" ): + IB1_Experiment( N, s ), CV_fileW(Unknown_w) { }; + CV_Experiment( const CV_Experiment& ) = delete; // forbid copies + CV_Experiment& operator=( const CV_Experiment& ) = delete; // forbid copies + bool Learn( const std::string& = "", bool = true ) override; + bool Prepare( const std::string& = "", + bool=true, + bool=false ) override; bool Test( const std::string&, - const std::string& ); + const std::string& ) override; bool CVprepare( const std::string& = "", WeightType = GR_w, - const std::string& = "" ); - AlgorithmType Algorithm() const { return CV_a; }; + const std::string& = "" ) override; + AlgorithmType Algorithm() const override { return CV_a; }; protected: - bool checkTestFile(); + bool checkTestFile() override; bool get_file_names( const std::string& ); private: - CV_Experiment( const CV_Experiment& ); - CV_Experiment& operator=( const CV_Experiment& ); std::vector FileNames; std::string CV_WfileName; std::string CV_PfileName; @@ -390,21 +396,21 @@ namespace Timbl { const std::string& s = "", const bool init = true ): TimblExperiment( TRIBL_a, s ) { - if ( init ) InitClass( N ); + if ( init ) init_options_table( N ); }; - void InitInstanceBase(); + void InitInstanceBase() override; protected: - TimblExperiment *clone() const { + TimblExperiment *clone() const override { return new TRIBL_Experiment( MaxFeats(), "", false ); }; - void showTestingInfo( std::ostream& ); - bool checkTestFile(); - AlgorithmType Algorithm() const { return TRIBL_a; }; - bool checkLine( const icu::UnicodeString& ); - const TargetValue *LocalClassify( const Instance& , + void showTestingInfo( std::ostream& ) override; + bool checkTestFile() override; + AlgorithmType Algorithm() const override { return TRIBL_a; }; + bool checkLine( const icu::UnicodeString& ) override; + const TargetValue *LocalClassify( const Instance&, double&, - bool& ); + bool& ) override; private: - bool GetInstanceBase( std::istream& ); + bool GetInstanceBase( std::istream& ) override; }; class TRIBL2_Experiment: public TimblExperiment { @@ -413,20 +419,20 @@ namespace Timbl { const std::string& s = "", const bool init = true ): TimblExperiment( TRIBL2_a, s ) { - if ( init ) InitClass( N ); + if ( init ) init_options_table( N ); }; - void InitInstanceBase(); + void InitInstanceBase() override; protected: - TimblExperiment *clone() const { + TimblExperiment *clone() const override { return new TRIBL2_Experiment( MaxFeats(), "", false ); }; - bool checkTestFile(); - AlgorithmType Algorithm() const { return TRIBL2_a; }; - bool checkLine( const icu::UnicodeString& ); + bool checkTestFile() override; + AlgorithmType Algorithm() const override { return TRIBL2_a; }; + bool checkLine( const icu::UnicodeString& ) override; const TargetValue *LocalClassify( const Instance& , double&, - bool& ); + bool& ) override; private: - bool GetInstanceBase( std::istream& ); + bool GetInstanceBase( std::istream& ) override; }; class IG_Experiment: public TimblExperiment { @@ -435,32 +441,32 @@ namespace Timbl { const std::string& s = "", const bool init = true ): TimblExperiment( IGTREE_a, s ) { - if ( init ) InitClass( N ); + if ( init ) init_options_table( N ); }; - AlgorithmType Algorithm() const { return IGTREE_a; }; - void InitInstanceBase(); - bool WriteInstanceBase( const std::string& ); - bool ReadInstanceBase( const std::string& ); - void initExperiment( bool = false ); - bool Expand( const std::string& ){ + AlgorithmType Algorithm() const override { return 
IGTREE_a; }; + void InitInstanceBase() override; + bool WriteInstanceBase( const std::string& ) override; + bool ReadInstanceBase( const std::string& ) override; + void initExperiment( bool = false ) override; + bool Expand( const std::string& ) override { FatalError( "Expand not supported for IGTree" ); return false; }; protected: - TimblExperiment *clone() const { + TimblExperiment *clone() const override{ return new IG_Experiment( MaxFeats(), "", false ); }; - bool ClassicLearn( const std::string& = "", bool = true ); - bool checkTestFile(); - void showTestingInfo( std::ostream& ); - bool checkLine( const icu::UnicodeString& ); + bool ClassicLearn( const std::string& = "", bool = true ) override; + bool checkTestFile() override; + void showTestingInfo( std::ostream& ) override; + bool checkLine( const icu::UnicodeString& ) override; bool sanityCheck() const; const TargetValue *LocalClassify( const Instance&, double&, - bool& ); + bool& ) override; private: - bool GetInstanceBase( std::istream& ); + bool GetInstanceBase( std::istream& ) override; }; } diff --git a/include/timbl/Types.h b/include/timbl/Types.h index fc9d34a..c88b200 100644 --- a/include/timbl/Types.h +++ b/include/timbl/Types.h @@ -186,15 +186,16 @@ namespace TiCC { } throw( std::runtime_error( "conversion from string '" + str + "' to decayType failed" ) ); - return UnknownDecay; } template <> inline std::string toString( const DecayType& W, bool b ){ - if ( b ) + if ( b ){ return DecayName[W][1]; - else + } + else { return DecayName[W][0]; + } } template <> @@ -208,7 +209,6 @@ namespace TiCC { } throw( std::runtime_error( "conversion from string '" + str + "' to ordeningType failed" ) ); - return UnknownOrdening; } template <> @@ -228,7 +228,6 @@ namespace TiCC { } throw( std::runtime_error( "conversion from string '" + str + "' to metricType failed" ) ); - return UnknownMetric; } template <> @@ -255,15 +254,16 @@ namespace TiCC { } throw( std::runtime_error( "conversion from string '" + str + "' to weightType failed" ) ); - return Unknown_w; } template <> inline std::string toString( const WeightType& W, bool b ){ - if ( b ) + if ( b ){ return WeightName[W][1]; - else + } + else { return WeightName[W][0]; + } } template <> @@ -282,7 +282,6 @@ namespace TiCC { } throw( std::runtime_error( "conversion from string '" + str + "' to algorithmType failed" ) ); - return Unknown_a; } template <> @@ -305,16 +304,17 @@ namespace TiCC { } throw( std::runtime_error( "conversion from string '" + str + "' to weightType failed" ) ); - return UnknownInputFormat; } template <> inline std::string toString( const InputFormatType& i, bool b ){ - if ( b ) + if ( b ){ return InputFormatName[i][1]; - else + } + else { return InputFormatName[i][0]; + } } template <> @@ -328,16 +328,17 @@ namespace TiCC { } throw( std::runtime_error( "conversion from string '" + str + "' to smoothingType failed" ) ); - return UnknownSmoothing; } template <> inline std::string toString( const SmoothingType& s, bool b ){ - if ( b ) + if ( b ){ return SmoothingName[s][1]; - else + } + else { return SmoothingName[s][0]; + } } template <> @@ -356,16 +357,17 @@ namespace TiCC { } throw( std::runtime_error( "conversion from string '" + str + "' to normalisationType failed" ) ); - return unknownNorm; } template <> inline std::string toString( const normType& s, bool b ){ - if ( b ) + if ( b ){ return NormalisationName[s][1]; - else + } + else { return NormalisationName[s][0]; + } } inline bool string_to_verbflag( const std::string& line, diff --git 
a/include/timbl/neighborSet.h b/include/timbl/neighborSet.h index aa1cf38..5222cda 100644 --- a/include/timbl/neighborSet.h +++ b/include/timbl/neighborSet.h @@ -46,30 +46,30 @@ namespace Timbl{ class zeroDecay: public decayStruct { public: zeroDecay():decayStruct(){}; - std::ostream& put( std::ostream& ) const; - DecayType type() const { return Zero;}; + std::ostream& put( std::ostream& ) const override; + DecayType type() const override { return Zero;}; }; class invLinDecay: public decayStruct { public: invLinDecay():decayStruct(){}; - std::ostream& put( std::ostream& ) const; - DecayType type() const { return InvLinear;}; + std::ostream& put( std::ostream& ) const override; + DecayType type() const override { return InvLinear;}; }; class invDistDecay: public decayStruct { public: invDistDecay():decayStruct(){}; - std::ostream& put( std::ostream& ) const; - DecayType type() const { return InvDist;}; + std::ostream& put( std::ostream& ) const override; + DecayType type() const override { return InvDist;}; }; class expDecay: public decayStruct { public: explicit expDecay( double alp ): decayStruct(alp,1.0){}; expDecay( double alp, double bet ): decayStruct(alp,bet){}; - std::ostream& put( std::ostream& ) const; - DecayType type() const { return ExpDecay;}; + std::ostream& put( std::ostream& ) const override; + DecayType type() const override { return ExpDecay;}; }; class neighborSet { @@ -88,8 +88,8 @@ namespace Timbl{ void merge( const neighborSet& ); double getDistance( size_t ) const; double bestDistance() const { return getDistance(0); }; - const ValueDistribution *getDistribution( size_t ) const; - WValueDistribution *bestDistribution( const decayStruct * =0, + const ClassDistribution *getDistribution( size_t ) const; + WClassDistribution *bestDistribution( const decayStruct * =0, size_t =0 ) const ; double relativeWeight( const decayStruct *, size_t ) const; bool setShowDistance( bool b ) const { @@ -105,9 +105,9 @@ namespace Timbl{ private: mutable bool showDistance; mutable bool showDistribution; - void push_back( double, const ValueDistribution & ); + void push_back( double, const ClassDistribution & ); std::vector distances; - std::vector distributions; + std::vector distributions; }; } diff --git a/src/BestArray.cxx b/src/BestArray.cxx index 4353272..26e2b1d 100644 --- a/src/BestArray.cxx +++ b/src/BestArray.cxx @@ -24,13 +24,9 @@ or send mail to: lamasoftware (at ) science.ru.nl */ -#include -#include #include -#include #include "timbl/Common.h" -#include "timbl/MsgClass.h" #include "timbl/Types.h" #include "timbl/Instance.h" #include "timbl/neighborSet.h" @@ -115,7 +111,7 @@ namespace Timbl { } double BestArray::addResult( double Distance, - const ValueDistribution *Distr, + const ClassDistribution *Distr, const UnicodeString& neighbor ){ // We have the similarity in Distance, and a num_of_neighbors // dimensional array with best similarities. 
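The two hunks that follow replace index-based cleanup loops over best->bestDistributions and keep->bestDistributions with range-based for plus clear(). A minimal, self-contained sketch of that same idiom, using placeholder names (Dist, owned) rather than the actual BestRec members:

#include <vector>

struct Dist {};   // stand-in for an owned distribution object

int main(){
  std::vector<Dist*> owned { new Dist(), new Dist() };
  // delete every owned pointer, then clear() so the vector can be refilled;
  // this mirrors the loop-plus-clear() pattern in addResult below
  for ( const auto& it : owned ){
    delete it;
  }
  owned.clear();
  return 0;
}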
@@ -152,8 +148,8 @@ namespace Timbl { // best->bestDistance = Distance; if ( _storeInstances ){ - for ( unsigned int j = 0; j < best->bestInstances.size(); ++j ){ - delete best->bestDistributions[j]; + for ( const auto& it : best->bestDistributions ){ + delete it; } best->bestInstances.clear(); best->bestDistributions.clear(); @@ -176,8 +172,8 @@ namespace Timbl { // keep->bestDistance = Distance; if ( _storeInstances ){ - for ( unsigned int j = 0; j < keep->bestInstances.size(); ++j ){ - delete keep->bestDistributions[j]; + for ( const auto& it :keep->bestDistributions ){ + delete it; } keep->bestInstances.clear(); keep->bestDistributions.clear(); diff --git a/src/CVExperiment.cxx b/src/CVExperiment.cxx index d417f8b..4c99233 100644 --- a/src/CVExperiment.cxx +++ b/src/CVExperiment.cxx @@ -35,9 +35,9 @@ #include -#include "timbl/MsgClass.h" #include "timbl/Common.h" #include "timbl/Types.h" +#include "timbl/Instance.h" #include "timbl/TimblExperiment.h" namespace Timbl { diff --git a/src/Choppers.cxx b/src/Choppers.cxx index 1bc7785..89f23fd 100644 --- a/src/Choppers.cxx +++ b/src/Choppers.cxx @@ -262,7 +262,6 @@ namespace Timbl{ } void ExChopper::init( const UnicodeString& s, size_t len, bool stripDot ) { - exW = -1.0; UnicodeString split = s; vSize = len+1; choppedInput.resize(vSize); @@ -448,7 +447,7 @@ namespace Timbl{ for ( size_t i=0; i < res ; ++i ){ choppedInput[i] = StrToCode( splits[i] ); } - return ( res == vSize ); // Enough? + return true; } UnicodeString Columns_Chopper::getString() const { @@ -470,7 +469,7 @@ namespace Timbl{ for ( size_t i=0; i < res ; ++i ){ choppedInput[i] = StrToCode( splits[i], false ); } - return ( res == vSize ); // Enough? + return true; } UnicodeString Tabbed_Chopper::getString() const { diff --git a/src/Common.cxx b/src/Common.cxx index e73cf0f..3f41b18 100644 --- a/src/Common.cxx +++ b/src/Common.cxx @@ -24,9 +24,9 @@ or send mail to: lamasoftware (at ) science.ru.nl */ -#include -#include + #include "timbl/Common.h" + #include "config.h" using namespace std; diff --git a/src/Features.cxx b/src/Features.cxx new file mode 100644 index 0000000..0cd60e9 --- /dev/null +++ b/src/Features.cxx @@ -0,0 +1,1005 @@ +/* + Copyright (c) 1998 - 2023 + ILK - Tilburg University + CLST - Radboud University + CLiPS - University of Antwerp + + This file is part of timbl + + timbl is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + timbl is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . 
+ + For questions and suggestions, see: + https://github.com/LanguageMachines/timbl/issues + or send mail to: + lamasoftware (at ) science.ru.nl +*/ + +#include +#include +#include +#include // for sort() +#include // for accumulate() +#include // for fabs() +#include "timbl/Common.h" +#include "timbl/Types.h" +#include "timbl/Metrics.h" +#include "timbl/Matrices.h" +#include "timbl/Instance.h" +#include "ticcutils/Unicode.h" +#include "ticcutils/UniHash.h" + +namespace Timbl { + + using namespace std; + using namespace Common; + using icu::UnicodeString; + + FeatureValue::FeatureValue( const UnicodeString& value, + size_t hash_val ): + ValueClass( value, hash_val ), + ValueClassProb( 0 ) + { + } + + FeatureValue::FeatureValue( const UnicodeString& s ): + ValueClass( s, 0 ), + ValueClassProb(0){ + _frequency = 0; + } + + FeatureValue::~FeatureValue( ){ + delete ValueClassProb; + } + + Feature::Feature( Hash::UnicodeHash *T ): + metric_matrix( 0 ), + TokenTree(T), + metric( 0 ), + ignore( false ), + numeric( false ), + vcpb_read( false ), + PrestoreStatus(ps_undef), + Prestored_metric( UnknownMetric ), + entropy( 0.0 ), + info_gain (0.0), + split_info(0.0), + gain_ratio(0.0), + chi_square(0.0), + shared_variance(0.0), + standard_deviation(0.0), + matrix_clip_freq(10), + n_min (0.0), + n_max (0.0), + weight(0.0), + is_reference(false) + {} + + Feature::Feature( const Feature& in ): MsgClass( in ){ + *this = in; + is_reference = true; + } + + Feature& Feature::operator=( const Feature& in ){ + if ( this != &in ){ + metric_matrix = in.metric_matrix; + metric = in.metric; + PrestoreStatus = in.PrestoreStatus; + Prestored_metric = in.Prestored_metric; + ignore = in.ignore; + numeric = in.numeric; + vcpb_read = in.vcpb_read; + entropy = in.entropy; + info_gain = in.info_gain; + split_info = in.split_info; + gain_ratio = in.gain_ratio; + chi_square = in.chi_square; + shared_variance = in.shared_variance; + standard_deviation = in.standard_deviation; + matrix_clip_freq = in.matrix_clip_freq; + n_dot_j = in.n_dot_j; + n_i_dot = in.n_i_dot; + n_min = in.n_min; + n_max = in.n_max; + weight = in.weight; + values_array = in.values_array; + reverse_values = in.reverse_values; + TokenTree = in.TokenTree; + } + return *this; + } + + void Feature::InitSparseArrays(){ + if ( !is_reference ){ + // Loop over all values. + // + for ( const auto& FV : values_array ){ + size_t freq = FV->ValFreq(); + FV->ValueClassProb->Clear(); + if ( freq > 0 ){ + // Loop over all present classes. 
+ // + for ( const auto& tit : FV->TargetDist ){ + FV->ValueClassProb->Assign( tit.second->Index(), + tit.second->Freq()/(double)freq ); + } + } + } + } + } + + size_t Feature::EffectiveValues() const { + return count_if( values_array.begin(), values_array.end(), + [&]( const FeatureValue* v ){ + return (v->ValFreq() > 0); } ); + } + + size_t Feature::TotalValues() const { + return accumulate( values_array.begin(), values_array.end(), + 0, + [&]( size_t r, const FeatureValue *v ){ + return r + v->ValFreq(); } ); + } + + FeatureValue *Feature::Lookup( const UnicodeString& str ) const { + FeatureValue *result = NULL; + unsigned int hash_val = TokenTree->lookup( str ); + if ( hash_val > 0 ) { + auto const& it = reverse_values.find( hash_val ); + if ( it != reverse_values.end() ){ + result = it->second; + } + } + return result; + } + + FeatureValue *Feature::add_value( const UnicodeString& valstr, + TargetValue *tv, + int freq ){ + unsigned int hash_val = TokenTree->hash( valstr ); + // cerr << "hash(" << valstr << ") geeft: " << hash_val << endl; + return add_value( hash_val, tv, freq ); + } + + FeatureValue *Feature::add_value( size_t hash_val, + TargetValue *tv, + int freq ){ + auto const& it = reverse_values.find( hash_val ); + if ( it == reverse_values.end() ){ + const UnicodeString& value = TokenTree->reverse_lookup( hash_val ); + // cerr << "lookup(" << index << ") geeft: " << value << endl; + // we want to store the singleton value for this index + // so we MUST reverse lookup the index + FeatureValue *fv = new FeatureValue( value, hash_val ); + fv->ValFreq( freq ); + reverse_values[hash_val] = fv; + values_array.push_back( fv ); + } + else { + it->second->IncValFreq( freq ); + } + FeatureValue *result = reverse_values[hash_val]; + if ( tv ){ + result->TargetDist.IncFreq(tv, freq ); + } + return result; + } + + bool Feature::increment_value( FeatureValue *FV, + TargetValue *tv ){ + bool result = false; + if ( FV ){ + FV->incr_val_freq(); + if ( tv ){ + FV->TargetDist.IncFreq(tv,1); + } + result = true; + } + return result; + } + + bool Feature::decrement_value( FeatureValue *FV, TargetValue *tv ){ + bool result = false; + if ( FV ){ + FV->decr_val_freq(); + if ( tv ){ + FV->TargetDist.DecFreq(tv); + } + result = true; + } + return result; + } + + bool Feature::AllocSparseArrays( size_t Dim ){ + // Loop over all values. + // + for ( const auto& FV : values_array ){ + // Loop over all classes. 
+ if ( FV->ValueClassProb == NULL ){ + if ( !(FV->ValueClassProb = new SparseValueProbClass( Dim )) ){ + return false; + } + } + } + return true; + } + + bool Feature::isNumerical() const { + if ( metric && metric->isNumerical() ){ + return true; + } + else { + return false; + } + } + + bool Feature::isStorableMetric() const { + if ( metric && metric->isStorable() ){ + return true; + } + else { + return false; + } + } + + struct D_D { + D_D(): dist(0), value(0.0) {}; + explicit D_D( FeatureValue *fv ): value(0.0) { + if ( !TiCC::stringTo( fv->name(), value ) ){ + throw( logic_error("called DD with an non-numeric value" ) ); + } + dist = &fv->TargetDist; + } + ClassDistribution *dist; + double value; + }; + + bool dd_less( const D_D* dd1, const D_D* dd2 ){ + return dd1->value < dd2->value; + } + + void Feature::NumStatistics( vector& FVBin, + double DBentropy ){ + size_t BinSize = FVBin.size(); + double Prob, FVEntropy; + size_t TotalVals = TotalValues(); + entropy = 0.0; + vector ddv; + size_t dd_len = values_array.size(); + ddv.reserve( dd_len ); + for ( const auto& FV : values_array ){ + if ( FV->ValFreq() > 0 ){ + ddv.push_back( new D_D( FV ) ); + } + } + sort( ddv.begin(), ddv.end(), dd_less ); + int num_per_bin = (int)floor( (double)dd_len / BinSize); + size_t rest = dd_len - num_per_bin * BinSize; + if ( rest ){ + num_per_bin++; + } + int jj = 0; + int cnt = 0; + for ( const auto& it: ddv ){ + FVBin[jj]->TargetDist.Merge( *it->dist ); + if ( ++cnt >= num_per_bin ){ + ++jj; + if ( --rest == 0 ){ + --num_per_bin; + } + cnt = 0; + } + } + for ( auto const& it: ddv ){ + delete it; + } + for ( size_t k=0; k < BinSize; k++ ){ + FeatureValue *pnt = FVBin[k]; + size_t Freq = pnt->TargetDist.totalSize(); + pnt->ValFreq( Freq ); + if ( Freq > 0 ){ + // Entropy for this FV pair. + // + FVEntropy = 0.0; + for ( const auto& it : pnt->TargetDist ){ + Prob = it.second->Freq()/(double)Freq; + FVEntropy += Prob * Log2(Prob); + } + entropy += -FVEntropy * Freq / (double)TotalVals; + } + } + entropy = fabs( entropy ); + // Info gain. + // + info_gain = DBentropy - entropy; + + // And the split info. + // + split_info = 0.0; + for ( size_t l=0; l < BinSize; ++l ){ + size_t Freq = FVBin[l]->ValFreq(); + if ( Freq > 0 ){ + Prob = Freq / (double)TotalVals; + split_info += Prob * Log2(Prob); + } + } + split_info = -split_info; + // Gain ratio. + // + if ( fabs(split_info) FVBin(BinSize); + for ( int i=0; i < BinSize; ++i ){ + sprintf( dumname, "dum%d", i ); + FVBin[i] = new FeatureValue( dumname ); + } + NumStatistics( FVBin, DBentropy ); + if ( full ){ + ChiSquareStatistics( FVBin, Targs ); + int cnt = 0; // count effective values in Bin + for ( int i=0; i < BinSize; ++i ){ + if ( FVBin[i]->ValFreq() > 0 ){ + ++cnt; + } + } + SharedVarianceStatistics( Targs, cnt ); + } + for ( const auto& it : FVBin ){ + delete it; + } + } + + void Feature::Statistics( double DBentropy ){ + size_t TotalVals = TotalValues(); + entropy = 0.0; + // Loop over the values. + for ( const auto& fv : values_array ){ + // Entropy for this FV pair. + size_t Freq = fv->ValFreq(); + if ( Freq > 0 ){ + double FVEntropy = 0.0; + for ( const auto& tit : fv->TargetDist ){ + double Prob = tit.second->Freq() / (double)Freq; + FVEntropy += Prob * Log2(Prob); + } + entropy += -FVEntropy * Freq / (double)TotalVals; + } + } + + entropy = fabs( entropy ); + // Info. gain. + // + info_gain = DBentropy - entropy; + if ( info_gain < 0.0 ){ + info_gain = 0.0; + } + // And the split. info. 
+ // + split_info = 0.0; + for ( const auto& fv : values_array ){ + double Prob = fv->ValFreq() / (double)TotalVals; + if ( Prob > 0 ) { + split_info += Prob * Log2(Prob); + } + } + split_info = -split_info; + // Gain ratio. + // + if ( fabs(split_info) < Epsilon ){ + gain_ratio = 0.0; + } + else { + gain_ratio = info_gain / split_info; + } + } + + void Feature::ChiSquareStatistics( vector& FVA, + const Targets& Targs ){ + size_t Num_Vals = FVA.size(); + chi_square = 0.0; + long int n_dot_dot = 0; + size_t Size = Targs.num_of_values(); + n_dot_j.resize(Size,0); + n_i_dot.resize(Num_Vals,0); + for ( size_t j = 0; j < Size; ++j ){ + // ALL values should be zeroed + n_dot_j[j] = 0; + } + for ( size_t i = 0; i < Num_Vals; ++i ){ + n_i_dot[i] = 0; // ALL values should be zeroed + FeatureValue *fv = FVA[i]; + for ( const auto& tit : fv->TargetDist ){ + n_dot_j[tit.second->Index()-1] += tit.second->Freq(); + n_i_dot[i] += tit.second->Freq(); + } + n_dot_dot += n_i_dot[i]; + } + if ( n_dot_dot != 0 ){ + for ( size_t m = 0; m < Num_Vals; ++m ){ + FeatureValue *fv = FVA[m]; + size_t n = 0; + for ( const auto& it : fv->TargetDist ){ + if ( n >= Size ){ + break; + } + while ( n < it.second->Index()-1 ){ + double tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) / + (double)n_dot_dot; + chi_square += tmp; + } + if ( n == it.second->Index()-1 ){ + double tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) / + (double)n_dot_dot; + if ( fabs(tmp) > Epsilon){ + chi_square += ( (tmp - it.second->Freq()) * + (tmp - it.second->Freq()) ) / tmp; + } + } + else { + break; + } + } + while ( n < Size ){ + double tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) / + (double)n_dot_dot; + chi_square += tmp; + } + } + } + } + + void Feature::ChiSquareStatistics( const Targets& Targs ){ + chi_square = 0.0; + long int n_dot_dot = 0; + size_t Size = Targs.num_of_values(); + size_t Num_Vals = values_array.size(); + n_dot_j.resize(Size,0); + n_i_dot.resize(Num_Vals,0); + for ( size_t j = 0; j < Size; ++j ){ + // ALL values should be zeroed + n_dot_j[j] = 0; + } + int i = 0; + for ( const auto& fv : values_array ){ + n_i_dot[i] = 0; // ALL values should be zeroed + for ( const auto& t_it : fv->TargetDist ){ + long int fr = t_it.second->Freq(); + n_dot_j[t_it.second->Index()-1] += fr; + n_i_dot[i] += fr; + } + n_dot_dot += n_i_dot[i]; + ++i; + } + if ( n_dot_dot != 0 ){ + int m = 0; + for ( const auto& fv : values_array ){ + size_t n = 0; + for ( const auto& t_it : fv->TargetDist ){ + if ( n >= Size ){ + break; + } + size_t id = t_it.second->Index()-1; + long int fr = t_it.second->Freq(); + while ( n < id ){ + double tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) / + (double)n_dot_dot; + chi_square += tmp; + } + if ( n == id ){ + double tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) / + (double)n_dot_dot; + if ( fabs(tmp) > Epsilon ){ + chi_square += ( (tmp - fr ) * (tmp - fr ) ) / tmp; + } + } + else { + break; + } + } + while ( n < Size ){ + double tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) / + (double)n_dot_dot; + chi_square += tmp; + } + ++m; + } + } + } + + double Feature::fvDistance( FeatureValue *F, + FeatureValue *G, + size_t limit ) const { + double result = 0.0; + if ( F != G ){ + bool dummy; + if ( metric->isStorable() + && matrixPresent( dummy ) + && F->ValFreq() >= matrix_clip_freq + && G->ValFreq() >= matrix_clip_freq ){ + result = metric_matrix->Extract( F, G ); + } + else if ( metric->isNumerical() ) { + result = metric->distance( F, G, limit, Max() - Min() ); + } + else { + result = 
metric->distance( F, G, limit ); + } + } + return result; + } + + Feature_List &Feature_List::operator=( const Feature_List& l ){ + if ( this != &l ){ + _num_of_feats = l._num_of_feats; + feats.resize(_num_of_feats); + perm_feats.resize(_num_of_feats); + permutation = l.permutation; + _feature_hash = l._feature_hash; // shared ?? + for ( unsigned int i=0; i < _num_of_feats; ++i ){ + feats[i] = new Feature( *l.feats[i] ); + } + for ( unsigned int i=0; i < _num_of_feats; ++i ){ + if ( l.perm_feats[i] ) { + perm_feats[i] = feats[permutation[i]]; + } + else { + perm_feats[i] = 0; + } + } + _is_reference = true; + _eff_feats = l._eff_feats; + _num_of_num_feats = l._num_of_num_feats; + } + return *this; + } + + Feature_List::~Feature_List(){ + if ( !_is_reference ){ + delete _feature_hash; + } + } + + void Feature_List::init( size_t size, + const vector& UserOptions ) + { + _num_of_feats = size; + _feature_hash = new Hash::UnicodeHash(); // all features share the same hash + feats.resize(_num_of_feats,NULL); + perm_feats.resize(_num_of_feats,NULL); + for ( size_t i=0; i< _num_of_feats; ++i ){ + feats[i] = new Feature( _feature_hash ); + } + _eff_feats = _num_of_feats; + _num_of_num_feats = 0; + // the user thinks about features running from 1 to _num_of_feats+1 + // we know better, so shift the UserOptions one down. + for ( size_t j = 0; j < _num_of_feats; ++j ){ + MetricType m = UserOptions[j+1]; + if ( m == Ignore ){ + feats[j]->Ignore( true ); + --_eff_feats; + } + else { + feats[j]->setMetricType( m ); + if ( feats[j]->isNumerical() ){ + ++_num_of_num_feats; + } + } + } + } + + void Feature_List::write_permutation( ostream &os ) const { + os << "< "; + for ( const auto& it : permutation ){ + os << it + 1; + if ( &it != &permutation.back()) + os << ", "; + } + os << " >"; + } + + void Feature_List::calculate_permutation( const vector& W ){ + vector WR = W; + size_t IgnoredFeatures = 0; + permutation.resize(_num_of_feats); + for ( size_t j=0; j < _num_of_feats; ++j ){ + permutation[j] = j; + if ( feats[j]->Ignore() ){ + WR[j] = -0.1; // To be shure that they are placed AFTER + // those which are realy Zero + IgnoredFeatures++; + } + } + if ( IgnoredFeatures == _num_of_feats ){ + Error( "All features seem to be ignored! Nothing to do" ); + exit(1); + } + else { + for ( size_t k=0; k < _num_of_feats; ++k ){ + size_t Max = 0; + for ( size_t m=1; m < _num_of_feats; ++m ){ + if ( WR[m] > WR[Max] ){ + Max = m; + } + } + WR[Max] = -1; + permutation[k] = Max; + } + } + for ( size_t j=0; j < _num_of_feats; ++j ){ + if ( j < _eff_feats ){ + perm_feats[j] = feats[permutation[j]]; + } + else { + perm_feats[j] = NULL; + } + } + } + Feature::~Feature(){ + if ( !is_reference ){ + delete_matrix(); + delete metric; + for ( const auto& it : values_array ){ + delete it; + } + } + reverse_values.clear(); + } + + bool Feature::matrixPresent( bool& isRead ) const { + isRead = false; + if ( metric_matrix != 0 ){ + if ( PrestoreStatus == ps_ok ){ + return true; + } + else if ( PrestoreStatus == ps_read ){ + isRead = true; + return true; + } + } + return false; + } + + size_t Feature::matrix_byte_size() const { + if ( metric_matrix ){ + return metric_matrix->NumBytes(); + } + else { + return 0; + } + } + + FeatVal_Stat Feature::prepare_numeric_stats(){ + bool first = true; + for ( const auto& fv : values_array ){ + size_t freq = fv->ValFreq(); + if ( freq > 0 ){ + double tmp = -1; + if ( !TiCC::stringTo( fv->name(), tmp ) ){ + Warning( "a Non Numeric value '" + fv->name_string() + + "' in Numeric Feature!" 
); + return NotNumeric; + } + if ( first ){ + first = false; + n_min = tmp; + n_max = tmp; + } + else if ( tmp < n_min ){ + n_min = tmp; + } + else if ( tmp > n_max ){ + n_max = tmp; + } + } + } + if ( fabs(n_max - n_min) < Epsilon ){ + return SingletonNumeric; + } + else { + return NumericValue; + } + } + + inline int min( int i1, int i2 ) { return (i1>i2?i2:i1); } + inline size_t min( size_t i1, size_t i2 ) { return (i1>i2?i2:i1); } + + void Feature::SharedVarianceStatistics( const Targets& Targ, + int eff_cnt ){ + size_t NumInst = Targ.TotalValues(); + int NumCats = Targ.EffectiveValues(); + int k = min( NumCats, eff_cnt ) - 1; + if ( k == 0 || NumInst == 0 ){ + shared_variance = 0; + } + else { + shared_variance = chi_square / (double)( NumInst * k ); + } + } + + void Feature::StandardDeviationStatistics( ){ + double sum = 0.0; + vector store( values_array.size() ); + for ( unsigned int i=0; i < values_array.size(); ++i ){ + FeatureValue *FV = values_array[i]; + double val = TiCC::stringTo( FV->name() ); + store[i] = val; + sum += val; + } + double total = 0.0; + for ( unsigned int i=0; i < values_array.size(); ++i ){ + double diff = sum - store[i]; + total += diff*diff; + } + standard_deviation = sqrt( total / values_array.size() ); + } + + void Feature::clear_matrix(){ + if ( PrestoreStatus == ps_read ){ + return; + } + else { + delete_matrix(); + } + } + + void Feature::delete_matrix(){ + if ( metric_matrix ){ + metric_matrix->Clear(); + delete metric_matrix; + } + metric_matrix = 0; + PrestoreStatus = ps_undef; + } + + bool Feature::setMetricType( const MetricType M ){ + if ( !metric || M != metric->type() ){ + delete metric; + metric = getMetricClass(M); + return true; + } + else { + return false; + } + } + + MetricType Feature::getMetricType() const { return metric->type(); } + + bool Feature::store_matrix( int limit){ + // + // Store a complete distance matrix. + // + if ( PrestoreStatus == ps_read ){ + return true; + } + if ( !metric_matrix ){ + metric_matrix = new SparseSymetricMatrix(); + } + if ( PrestoreStatus != ps_failed && metric->isStorable( ) ) { + try { + for ( const auto& FV_i : values_array ){ + for ( const auto& FV_j : values_array ){ + if ( FV_i->ValFreq() >= matrix_clip_freq && + FV_j->ValFreq() >= matrix_clip_freq && + ( Prestored_metric != metric->type() || + fabs(metric_matrix->Extract(FV_i,FV_j)) < Epsilon ) ){ + double dist = metric->distance( FV_i, FV_j, limit ); + metric_matrix->Assign( FV_i, FV_j, dist ); + } + } + } + } + catch( ... ){ + cout << "hit the ground!" 
<< endl; + PrestoreStatus = ps_failed; + return false; + }; + PrestoreStatus = ps_ok; + } + if ( PrestoreStatus == ps_ok ){ + Prestored_metric = metric->type(); + } + return true; + } + + ostream& operator<< (std::ostream& os, SparseValueProbClass *VPC ){ + if ( VPC ) { + int old_prec = os.precision(); + os.precision(3); + os.setf( std::ios::fixed ); + auto it = VPC->vc_map.begin(); + for ( size_t k = 1; k <= VPC->dimension; ++k ){ + os.setf(std::ios::right, std::ios::adjustfield); + if ( it != VPC->vc_map.end() && + it->first == k ){ + os << "\t" << it->second; + ++it; + } + else { + os << "\t" << 0.0; + } + } + os << setprecision( old_prec ); + } + else { + os << "(Null SA)"; + } + return os; + } + + void Feature::print_vc_pb_array( ostream &os ) const { + for ( const auto& FV : values_array ){ + if ( FV->ValueClassProb ){ + os << FV << FV->ValueClassProb << endl; + } + } + } + + bool Feature::read_vc_pb_array( istream &is ){ + unsigned int Num = 0; + bool first = true; + // clear all existing arrays + for ( const auto& FV : values_array ){ + if ( FV->ValueClassProb ){ + delete FV->ValueClassProb; + FV->ValueClassProb = NULL; + } + } + UnicodeString buf; + while ( TiCC::getline( is, buf ) ){ + if ( buf.length() < 8 ){ // "empty" line separates matrices + break; + } + vector parts = TiCC::split( buf ); + if ( first ){ + Num = parts.size() - 1; + first = false; + } + UnicodeString name = parts[0]; + FeatureValue *FV = Lookup( name ); + if ( !FV ){ + Warning( "Unknown FeatureValue '" + TiCC::UnicodeToUTF8(name) + + "' in file, (skipped) " ); + continue; + } + else { + FV->ValueClassProb = new SparseValueProbClass( Num ); + for ( size_t i=0; i < Num; ++i ){ + UnicodeString tname = parts[i+1]; + double value; + if ( !TiCC::stringTo( tname, value ) ){ + Error( "Found illegal value '" + TiCC::UnicodeToUTF8(tname) + "'" ); + return false; + } + else if ( value > Epsilon ) { + FV->ValueClassProb->Assign( i, value ); + } + } + } + } + // check if we've got all the values, assign a default if not so + for ( const auto& FV : values_array ){ + if ( FV->ValueClassProb == NULL ){ + FV->ValueClassProb = new SparseValueProbClass( Num ); + } + } + vcpb_read = true; + return true; + } + + bool Feature::fill_matrix( istream &is ) { + if ( !metric_matrix ){ + metric_matrix = new SparseSymetricMatrix(); + } + else { + metric_matrix->Clear(); + } + UnicodeString line; + while ( TiCC::getline(is,line) ){ + if ( line.isEmpty() ){ + break; + } + vector arr = TiCC::split_at( line, " " ); + size_t num = arr.size(); + double d; + if ( num != 2 ){ + Error( "wrong line in inputfile" ); + return false; + } + else if ( arr[0].length() < 2 ){ + Error( "wrong line in inputfile" ); + return false; + } + else if ( !TiCC::stringTo( arr[1], d ) ) { + Error( "wrong line in inputfile" ); + return false; + } + else { + UnicodeString stripped = UnicodeString( arr[0], 1,arr[0].length()-2) ; + vector parts = TiCC::split_at( stripped, ",\t" ); + if ( parts.size() != 2 ){ + Error( "wrong line in inputfile" ); + return false; + } + else { + FeatureValue *F1 = Lookup(parts[0]); + FeatureValue *F2 = Lookup(parts[1]); + metric_matrix->Assign( F1, F2, d ); + } + } + } + PrestoreStatus = ps_read; + return true; + } + + void Feature::print_matrix( ostream &os, bool full ) const { + // + // Print the matrix. 
+ // + int old_prec = os.precision(); + ios::fmtflags old_flags = os.flags(); + os.unsetf(std::ios_base::floatfield); + if ( full ){ + for ( const auto& FV_i : values_array ){ + os.width(6); + os.setf(ios::left, ios::adjustfield); + os << FV_i << ":"; + os.width(12); + os.precision(3); + os.setf(ios::right, ios::adjustfield); + for ( const auto& FV_j : values_array ){ + os.width(12); + os.precision(3); + os.setf(ios::right,ios::adjustfield ); + if ( FV_i->ValFreq() < matrix_clip_freq || + FV_j->ValFreq() < matrix_clip_freq ){ + os << "*"; + } + else { + os << metric_matrix->Extract(FV_i,FV_j); + } + } + os << endl; + } + } + else { + os << *metric_matrix << endl; + } + os << setprecision( old_prec ); + os.flags( old_flags ); + } + +} // namespace Timbl diff --git a/src/GetOptClass.cxx b/src/GetOptClass.cxx index 26471b9..94dea77 100644 --- a/src/GetOptClass.cxx +++ b/src/GetOptClass.cxx @@ -25,20 +25,17 @@ lamasoftware (at ) science.ru.nl */ #include +#include #include -#include -#include -#include -#include -#include -#include -#include + +#include "ticcutils/CommandLine.h" + #include "timbl/Common.h" #include "timbl/Types.h" #include "timbl/Options.h" #include "timbl/MsgClass.h" #include "timbl/Metrics.h" -#include "ticcutils/CommandLine.h" +#include "timbl/Instance.h" #include "timbl/GetOptClass.h" #include "timbl/TimblExperiment.h" @@ -127,6 +124,7 @@ namespace Timbl { local_normalisation( in.local_normalisation ), local_norm_factor( in.local_norm_factor ), MaxFeats( in.MaxFeats ), + target_pos( in.target_pos ), no_neigh( in.no_neigh ), mvd_limit( in.mvd_limit ), estimate( in.estimate ), @@ -293,7 +291,7 @@ namespace Timbl { if ( !outPath.empty() ){ Exp->setOutPath( outPath ); } - } + } //first if ( clones > 0 ){ Exp->Clones( clones ); } @@ -485,7 +483,7 @@ namespace Timbl { return false; } else { - for ( size_t j=k+1; j <= m && j <= metricsArray.size(); ++j ){ + for ( size_t j=k+1; j <= m && j < metricsArray.size(); ++j ){ if ( metricsArray[j] != UnknownMetric && metricsArray[j] != Value ){ Error( "metric of feature " + TiCC::toString(j) + diff --git a/src/IBprocs.cxx b/src/IBprocs.cxx index 179586d..54c5157 100644 --- a/src/IBprocs.cxx +++ b/src/IBprocs.cxx @@ -24,17 +24,18 @@ or send mail to: lamasoftware (at ) science.ru.nl */ -#include -#include + +#include #include #include -#include #include #include "ticcutils/StringOps.h" + #include "timbl/IBtree.h" #include "timbl/Common.h" #include "timbl/Types.h" +#include "timbl/Instance.h" #include "timbl/MBLClass.h" using namespace std; @@ -45,16 +46,16 @@ namespace Timbl { bool result = true; InstanceBase->RemoveInstance( Inst ); MBL_init = do_sloppy_loo; // must be only true if you are REALY sure - for ( size_t i=0; i < effective_feats && result; ++i ){ - PermFeatures[i]->clear_matrix(); - if ( !PermFeatures[i]->decrement_value( Inst.FV[i], - Inst.TV ) ){ + for ( size_t i=0; i < EffectiveFeatures() && result; ++i ){ + features.perm_feats[i]->clear_matrix(); + if ( !features.perm_feats[i]->decrement_value( Inst.FV[i], + Inst.TV ) ){ FatalError( "Unable to Hide an Instance!" 
); result = false; } } if ( result ){ - Targets->decrement_value( Inst.TV ); + targets.decrement_value( Inst.TV ); } return result; } @@ -63,16 +64,16 @@ namespace Timbl { bool result = true; InstanceBase->AddInstance( Inst ); MBL_init = do_sloppy_loo; // must be only true if you are REALY sure - for ( size_t i=0; i < effective_feats && result; ++i ){ - PermFeatures[i]->clear_matrix(); - if ( !PermFeatures[i]->increment_value( Inst.FV[i], - Inst.TV ) ){ + for ( size_t i=0; i < EffectiveFeatures() && result; ++i ){ + features.perm_feats[i]->clear_matrix(); + if ( !features.perm_feats[i]->increment_value( Inst.FV[i], + Inst.TV ) ){ FatalError( "Unable to UnHide this Instance!" ); result = false; } } if ( result ){ - Targets->increment_value( Inst.TV ); + targets.increment_value( Inst.TV ); } return result; } @@ -125,7 +126,7 @@ namespace Timbl { if ( nodes == 0 ){ break; } - os << setw(8) << i << " |"<< setw(8) << permutation[i-1] + 1 << " |" + os << setw(8) << i << " |"<< setw(8) << features.permutation[i-1] + 1 << " |" << setw(10) << nodes << " |" << setw(10) << *(nIt-1) << " |" << setw(10) << *(tIt-1) << " |" << setw(10) << (*nIt + *tIt)/double(nodes) << " |" @@ -163,20 +164,21 @@ namespace Timbl { return ""; } - bool MBLClass::get_IB_Info( istream& is, - bool& Pruned, - int& Version, - bool& Hashed, - string& range_buf ){ + size_t MBLClass::get_IB_Info( istream& is, + bool& Pruned, + int& Version, + bool& Hashed, + string& range_buf ){ + size_t result = 0; if ( ExpInvalid() ){ Error( "Can't retrieve Instance-Base\n" ); - return false; + return result; } if ( Options.TableFrozen() || - num_of_features != 0 ){ + NumOfFeatures() != 0 ){ Warning( "unable to read an Instance Base while another" " experiment is already loaded" ); - return false; + return result; } bool info_ok = true; @@ -232,18 +234,18 @@ namespace Timbl { perms = perms + splits[i]; // Maybe we could use splits directly? } bool excl = false; - effective_feats = 0; + features._eff_feats = 0; size_t i = 0; string::size_type pos = 0; // skip < while ( info_ok && pos != string::npos && i < MaxFeatures ){ i++; if ( !excl ){ - effective_feats++; + ++features._eff_feats; } string tmp = string_tok( perms, pos, ", !" ); size_t index = TiCC::stringTo( tmp ); - permutation.push_back( --index ); + features.permutation.push_back( --index ); if ( index >= MaxFeatures ){ Error ( "illegal value " + TiCC::toString(index) + " in permutation, not between 1 and " + @@ -323,13 +325,12 @@ namespace Timbl { } Version = version; if ( info_ok ){ - num_of_features = depth; - return true; + result = depth; + return result; } else { - num_of_features = 0; Error( "Can't retrieve Instance-Base\n" ); - return false; + return 0; } } @@ -371,8 +372,8 @@ namespace Timbl { // But we didn't int scancount = sscanf( buf.c_str(), "[%lf-%lf]", &min, &max ); if ( scancount == 2 ){ - Features[k-1]->Min( min ); - Features[k-1]->Max( max ); + features[k-1]->Min( min ); + features[k-1]->Max( max ); if ( is ){ is >> ws >> buf; if ( !buf.empty() && (buf[0] == '.' || buf[0] == ',' ) ){ @@ -403,16 +404,16 @@ namespace Timbl { // < 5, 2, 3! 1, 4 > bool excl = false; os << "< "; - for ( size_t j=0; j < num_of_features-1; ++j ){ - if ( !excl && Features[permutation[j+1]]->Ignore() ){ + for ( size_t j=0; j < NumOfFeatures()-1; ++j ){ + if ( !excl && features[features.permutation[j+1]]->Ignore() ){ excl = true; - os << permutation[j]+1 << "! "; + os << features.permutation[j]+1 << "! 
"; } else { - os << permutation[j]+1 << ", "; + os << features.permutation[j]+1 << ", "; } } - os << permutation[num_of_features-1]+1 << " >" << endl; + os << features.permutation[NumOfFeatures()-1]+1 << " >" << endl; } bool MBLClass::PutInstanceBase( ostream& os ) const { @@ -430,9 +431,9 @@ namespace Timbl { writePermSpecial( os ); os << "# Numeric: "; bool first = true; - for ( size_t i=0; i < num_of_features; ++i ){ - if ( !Features[i]->Ignore() && - Features[i]->isNumerical() ){ + for ( size_t i=0; i < NumOfFeatures(); ++i ){ + if ( !features[i]->Ignore() && + features[i]->isNumerical() ){ if ( !first ){ os << ", "; } @@ -446,17 +447,17 @@ namespace Timbl { if ( NumNumFeatures() > 0 ){ os << "# Ranges: "; first = true; - for ( size_t j=0; j < num_of_features; ++j ){ - if ( !Features[j]->Ignore() && - Features[j]->isNumerical() ){ + for ( size_t j=0; j < NumOfFeatures(); ++j ){ + if ( !features[j]->Ignore() && + features[j]->isNumerical() ){ if ( !first ){ os << " , "; } else { first = false; } - os << j+1 << " [" << Features[j]->Min() - << "-" << Features[j]->Max() << "]"; + os << j+1 << " [" << features[j]->Min() + << "-" << features[j]->Max() << "]"; } } os << " ." << endl; @@ -464,8 +465,8 @@ namespace Timbl { os << "# Bin_Size: " << Bin_Size << endl; if ( hashed_trees ){ InstanceBase->Save( os, - *Targets->hash(), - *Features[0]->hash(), + *targets.hash(), + *features.hash(), keep_distributions ); } else { diff --git a/src/IBtree.cxx b/src/IBtree.cxx index 368808e..a31f53d 100644 --- a/src/IBtree.cxx +++ b/src/IBtree.cxx @@ -28,10 +28,6 @@ #include #include #include -#include -#include -#include -#include #include "ticcutils/StringOps.h" #include "ticcutils/UniHash.h" @@ -287,17 +283,17 @@ namespace Timbl { xmlNode *to_node( const FeatureValue *fv ){ xmlNode *result = TiCC::XmlNewNode( "feature" ); - TiCC::XmlAddContent( result, fv->Name() ); + TiCC::XmlAddContent( result, fv->name_string() ); return result; } xmlNode *to_node( const TargetValue *tv ){ xmlNode *result = TiCC::XmlNewNode( "target" ); - TiCC::XmlAddContent( result, tv->Name() ); + TiCC::XmlAddContent( result, tv->name_string() ); return result; } - xmlNode *to_node( const ValueDistribution *d ){ + xmlNode *to_node( const ClassDistribution *d ){ xmlNode *result = TiCC::XmlNewNode( "distribution" ); TiCC::XmlAddContent( result, d->DistToString() ); return result; @@ -361,7 +357,7 @@ namespace Timbl { UnicodeString VectoString( const vector& vec ){ UnicodeString result; for ( auto const& fv : vec ){ - result += " " + fv->name_u(); + result += " " + fv->name(); } return result; } @@ -450,8 +446,8 @@ namespace Timbl { } IBtree* InstanceBase_base::read_list( istream &is, - std::vector& Feats, - Target& Targ, + Feature_List& feats, + Targets& Targ, int level ){ IBtree *result = NULL; IBtree **pnt = &result; @@ -459,7 +455,7 @@ namespace Timbl { char delim; while ( is && goon ) { is >> delim; // skip the opening `[` or separating ',' - *pnt = read_local( is, Feats, Targ, level ); + *pnt = read_local( is, feats, Targ, level ); if ( !(*pnt) ){ delete result; return NULL; @@ -472,8 +468,8 @@ namespace Timbl { } IBtree* InstanceBase_base::read_list_hashed( istream &is, - std::vector& Feats, - Target& Targ, + Feature_List& feats, + Targets& Targ, int level ){ IBtree *result = NULL; IBtree **pnt = &result; @@ -481,7 +477,7 @@ namespace Timbl { char delim; while ( is && goon ) { is >> delim; // skip the opening `[` or separating ',' - *pnt = read_local_hashed( is, Feats, Targ, level ); + *pnt = read_local_hashed( is, feats, Targ, 
level ); if ( !(*pnt) ){ delete result; return NULL; @@ -494,8 +490,8 @@ namespace Timbl { } IBtree *InstanceBase_base::read_local( istream &is, - vector& Feats, - Target& Targ, + Feature_List& feats, + Targets& Targ, int level ){ if ( !is ){ return NULL; @@ -505,7 +501,7 @@ namespace Timbl { UnicodeString buf; char delim; is >> ws >> buf; - result->FValue = Feats[level]->add_value( buf, NULL, 1 ); + result->FValue = feats.perm_feats[level]->add_value( buf, NULL, 1 ); is >> delim; if ( !is || delim != '(' ){ Error( "missing `(` in Instance Base file" ); @@ -518,7 +514,7 @@ namespace Timbl { if ( nxt == '{' ){ try { result->TDistribution - = ValueDistribution::read_distribution( is, Targ, false ); + = ClassDistribution::read_distribution( is, Targ, false ); } catch ( const exception& e ){ Warning( e.what() ); @@ -533,7 +529,7 @@ namespace Timbl { } } if ( look_ahead(is) == '[' ){ - result->link = read_list( is, Feats, Targ, level+1 ); + result->link = read_list( is, feats, Targ, level+1 ); if ( !(result->link) ){ delete result; return 0; @@ -562,8 +558,8 @@ namespace Timbl { } IBtree *InstanceBase_base::read_local_hashed( istream &is, - vector& Feats, - Target& Targ, + Feature_List& feats, + Targets& Targ, int level ){ if ( !is ){ return NULL; @@ -573,7 +569,7 @@ namespace Timbl { char delim; int index; is >> index; - result->FValue = Feats[level]->add_value( index, NULL, 1 ); + result->FValue = feats.perm_feats[level]->add_value( index, NULL, 1 ); is >> delim; if ( !is || delim != '(' ){ Error( "missing `(` in Instance Base file" ); @@ -590,7 +586,7 @@ namespace Timbl { // OR we have Persistent Distributions try { result->TDistribution - = ValueDistribution::read_distribution_hashed( is, Targ, false ); + = ClassDistribution::read_distribution_hashed( is, Targ, false ); } catch ( const exception& e ){ Warning( e.what() ); @@ -600,7 +596,7 @@ namespace Timbl { } } if ( look_ahead(is) == '[' ){ - result->link = read_list_hashed( is, Feats, Targ, level+1 ); + result->link = read_list_hashed( is, feats, Targ, level+1 ); if ( !(result->link) ){ delete result; return NULL; @@ -632,12 +628,12 @@ namespace Timbl { } bool InstanceBase_base::ReadIB( istream &is, - vector& Feats, - Target& Targ, + Feature_List& feats, + Targets& Targ, int expected_version ){ - if ( read_IB( is, Feats, Targ, expected_version ) ){ + if ( read_IB( is, feats, Targ, expected_version ) ){ InstBase->redo_distributions(); - ValueDistribution *Top + ClassDistribution *Top = InstBase->sum_distributions( PersistentDistributions ); delete Top; // still a bit silly but the Top Distribution is known // but we need to cleanup behind us also @@ -657,12 +653,12 @@ namespace Timbl { } bool IG_InstanceBase::ReadIB( istream &is, - vector& Feats, - Target& Targ, + Feature_List& feats, + Targets& Targ, int expected_version ){ - if ( read_IB( is, Feats, Targ, expected_version ) ){ + if ( read_IB( is, feats, Targ, expected_version ) ){ if ( PersistentDistributions ){ - ValueDistribution *Top + ClassDistribution *Top = InstBase->sum_distributions( PersistentDistributions ); delete Top; // still a bit silly but the Top Distribution is known // but we need to cleanup behind us also @@ -675,8 +671,8 @@ namespace Timbl { } bool InstanceBase_base::read_IB( istream &is, - vector& Feats, - Target& Targs, + Feature_List& feats, + Targets& Targs, int expected_version ){ NumOfTails = 0; DefAss = true; // always for a restored tree @@ -699,7 +695,7 @@ namespace Timbl { // in the right order in Targ try { TopDistribution - = 
ValueDistribution::read_distribution( is, Targs, true ); + = ClassDistribution::read_distribution( is, Targs, true ); } catch ( const exception& e ){ Warning( e.what() ); @@ -710,7 +706,7 @@ namespace Timbl { } else { if ( look_ahead( is ) == '[' ){ - InstBase = read_list( is, Feats, Targs, 0 ); + InstBase = read_list( is, feats, Targs, 0 ); } if ( InstBase ){ is >> ws >> buf; @@ -762,15 +758,13 @@ namespace Timbl { return true; } - bool InstanceBase_base::ReadIB( istream& is, - vector& Feats, - Target& Targs, - Hash::UnicodeHash& cats, - Hash::UnicodeHash& feats, - int expected_version ){ - if ( read_IB( is, Feats, Targs, cats, feats, expected_version ) ){ + bool InstanceBase_base::ReadIB_hashed( istream& is, + Feature_List& feats, + Targets& Targs, + int expected_version ){ + if ( read_IB_hashed( is, feats, Targs, expected_version ) ){ InstBase->redo_distributions(); - ValueDistribution *Top + ClassDistribution *Top = InstBase->sum_distributions( PersistentDistributions ); delete Top; // still a bit silly but the Top Distribution is known // but we need to cleanup behind us also @@ -781,15 +775,13 @@ namespace Timbl { } } - bool IG_InstanceBase::ReadIB( istream& is, - vector& Feats, - Target& Targs, - Hash::UnicodeHash& cats, - Hash::UnicodeHash& feats, - int expected_version ){ - if ( read_IB( is, Feats, Targs, cats, feats, expected_version ) ){ + bool IG_InstanceBase::ReadIB_hashed( istream& is, + Feature_List& feats, + Targets& Targs, + int expected_version ){ + if ( read_IB_hashed( is, feats, Targs, expected_version ) ){ if ( PersistentDistributions ){ - ValueDistribution *Top + ClassDistribution *Top = InstBase->sum_distributions( PersistentDistributions ); delete Top; // still a bit silly but the Top Distribution is known // but we need to cleanup behind us also @@ -801,18 +793,16 @@ namespace Timbl { } } - bool InstanceBase_base::read_IB( istream& is, - vector& Feats, - Target& Targs, - Hash::UnicodeHash& cats, - Hash::UnicodeHash& feats, - int expected_version ){ + bool InstanceBase_base::read_IB_hashed( istream& is, + Feature_List& feats, + Targets& Targs, + int expected_version ){ char delim; NumOfTails = 0; DefAss = true; // always for a restored tree DefaultsValid = true; // always for a restored tree Version = expected_version; - read_hash( is, cats, feats ); + read_hash( is, *Targs.hash(), *feats.hash() ); is >> delim; if ( !is || delim != '(' ){ Error( "missing first `(` in Instance Base file" ); @@ -829,7 +819,7 @@ namespace Timbl { // in the right order in Targ try { TopDistribution - = ValueDistribution::read_distribution_hashed( is, Targs, true ); + = ClassDistribution::read_distribution_hashed( is, Targs, true ); } catch ( const string& what ){ Warning( what ); @@ -842,7 +832,7 @@ namespace Timbl { Error( "problems reading Top Distribution from Instance Base file" ); } if ( look_ahead( is ) == '[' ){ - InstBase = read_list_hashed( is, Feats, Targs, 0 ); + InstBase = read_list_hashed( is, feats, Targs, 0 ); } if ( InstBase ){ is >> delim; @@ -864,10 +854,10 @@ namespace Timbl { } } - inline ValueDistribution *IBtree::sum_distributions( bool keep ){ + inline ClassDistribution *IBtree::sum_distributions( bool keep ){ // create a new distribution at this level by summing up the // distibutions of all branches. 
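As an aside (a toy illustration, not Timbl code): the summation described in the comment above is a straightforward fold over the tree, collecting this node's own class counts and then merging in whatever each branch below it sums up. The node and count types here are stand-ins for IBtree and ClassDistribution.

    #include <map>
    #include <vector>

    struct ToyNode {                       // stand-in for an IBtree node
      std::map<int,int> counts;            // stand-in for its TDistribution
      std::vector<ToyNode*> branches;      // stand-in for the link/next chain
    };

    std::map<int,int> sum_counts( const ToyNode& node ){
      std::map<int,int> result = node.counts;        // this node's own distribution
      for ( const ToyNode* child : node.branches ){  // fold in every branch below it
        for ( const auto& it : sum_counts( *child ) ){
          result[it.first] += it.second;
        }
      }
      return result;
    }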
- ValueDistribution *result; + ClassDistribution *result; if ( !keep ){ if ( TDistribution ){ if ( FValue ){ @@ -879,7 +869,7 @@ namespace Timbl { } } else { - result = new ValueDistribution(); + result = new ClassDistribution(); } IBtree *pnt = this->next; while ( pnt ){ @@ -898,7 +888,7 @@ namespace Timbl { result = TDistribution->to_VD_Copy(); } else { - result = new ValueDistribution(); + result = new ClassDistribution(); } IBtree *pnt = this->next; while ( pnt ){ @@ -1008,7 +998,7 @@ namespace Timbl { } } - const ValueDistribution *IBtree::exact_match( const Instance& Inst ) const { + const ClassDistribution *IBtree::exact_match( const Instance& Inst ) const { // Is there an exact match between the Instance and the IB // If so, return the best Distribution. const IBtree *pnt = this; @@ -1050,30 +1040,22 @@ namespace Timbl { Random( Rand ), PersistentDistributions( persist ), Version( 4 ), - TopDistribution( new ValueDistribution ), + TopDistribution( new ClassDistribution ), WTop( 0 ), TopT( 0 ), tiedTop(false), InstBase( 0 ), LastInstBasePos( 0 ), - RestartSearch( new const IBtree *[depth] ), - SkipSearch( new const IBtree *[depth] ), - InstPath( new const IBtree *[depth] ), ibCount( cnt ), Depth( depth ), NumOfTails( 0 ) - {} + { + InstPath.resize(depth,0); + RestartSearch.resize(depth,0); + SkipSearch.resize(depth,0); + } InstanceBase_base::~InstanceBase_base(){ - if ( InstPath ){ - delete [] InstPath; - } - if ( SkipSearch ){ - delete [] SkipSearch; - } - if ( RestartSearch ){ - delete [] RestartSearch; - } // the Instance can become very large, with even millions of 'next' pointers // so recursive deletion will use a lot of stack // therefore we choose to iterate the first level(s). @@ -1259,7 +1241,7 @@ namespace Timbl { else { InstBase->re_assign_defaults( Random, PersistentDistributions ); } - ValueDistribution *Top + ClassDistribution *Top = InstBase->sum_distributions( PersistentDistributions ); delete Top; // still a bit silly but the Top Distribution is known } @@ -1295,7 +1277,7 @@ namespace Timbl { IBtree *pnt = InstBase->link; // we have to fix the toptarget here, because the node // is build incremental - ValueDistribution dist; + ClassDistribution dist; while ( pnt ){ if ( pnt->TDistribution ){ dist.Merge( *pnt->TDistribution ); @@ -1342,10 +1324,10 @@ namespace Timbl { *pnt = new IBtree(); ++ibCount; if ( abs( Inst.ExemplarWeight() ) > Epsilon ){ - (*pnt)->TDistribution = new WValueDistribution(); + (*pnt)->TDistribution = new WClassDistribution(); } else { - (*pnt)->TDistribution = new ValueDistribution; + (*pnt)->TDistribution = new ClassDistribution; } NumOfTails++; } @@ -1518,10 +1500,10 @@ namespace Timbl { DefaultsValid = false; } - const ValueDistribution *InstanceBase_base::InitGraphTest( vector&, + const ClassDistribution *InstanceBase_base::InitGraphTest( vector&, const vector *, - size_t, - size_t ){ + const size_t, + const size_t ){ FatalError( "InitGraphTest" ); return 0; } @@ -1562,12 +1544,12 @@ namespace Timbl { //#define DEBUGTESTS - const ValueDistribution *IB_InstanceBase::InitGraphTest( vector& Path, + const ClassDistribution *IB_InstanceBase::InitGraphTest( vector& Path, const vector *inst, - size_t off, - size_t eff ){ + const size_t off, + const size_t eff ){ const IBtree *pnt; - const ValueDistribution *result = NULL; + const ClassDistribution *result = NULL; testInst = inst; offSet = off; effFeat = eff; @@ -1621,16 +1603,16 @@ namespace Timbl { return result; } - const ValueDistribution *InstanceBase_base::NextGraphTest( vector&, + const 
ClassDistribution *InstanceBase_base::NextGraphTest( vector&, size_t& ){ FatalError( "NextGraphTest" ); return 0; } - const ValueDistribution *IB_InstanceBase::NextGraphTest( vector& Path, + const ClassDistribution *IB_InstanceBase::NextGraphTest( vector& Path, size_t& pos ){ const IBtree *pnt = NULL; - const ValueDistribution *result = NULL; + const ClassDistribution *result = NULL; bool goon = true; while ( !pnt && goon ){ if ( RestartSearch[pos] == NULL ) { @@ -1709,7 +1691,7 @@ namespace Timbl { return result; } - const ValueDistribution *InstanceBase_base::IG_test( const Instance& , + const ClassDistribution *InstanceBase_base::IG_test( const Instance& , size_t &, bool &, const TargetValue *& ){ @@ -1717,7 +1699,7 @@ namespace Timbl { return NULL; } - const ValueDistribution *IG_InstanceBase::IG_test( const Instance& Inst, + const ClassDistribution *IG_InstanceBase::IG_test( const Instance& Inst, size_t &end_level, bool &leaf, const TargetValue *&result ) { @@ -1725,7 +1707,7 @@ namespace Timbl { // distribution of the last matching position in the Tree, it's position // in the Instance Base and the default TargetValue result = NULL; - ValueDistribution *Dist = NULL; + ClassDistribution *Dist = NULL; int pos = 0; leaf = false; const IBtree *pnt = fast_search_node( Inst.FV[pos] ); @@ -1757,14 +1739,14 @@ namespace Timbl { IB_InstanceBase *InstanceBase_base::TRIBL_test( const Instance& , size_t, const TargetValue *&, - const ValueDistribution *&, + const ClassDistribution *&, size_t & ){ FatalError( "TRIBL_test " ); return NULL; } IB_InstanceBase *InstanceBase_base::TRIBL2_test( const Instance& , - const ValueDistribution *&, + const ClassDistribution *&, size_t & ){ FatalError( "TRIBL2_test " ); return NULL; @@ -1773,7 +1755,7 @@ namespace Timbl { IB_InstanceBase *TRIBL_InstanceBase::TRIBL_test( const Instance& Inst, size_t threshold, const TargetValue *&TV, - const ValueDistribution *&dist, + const ClassDistribution *&dist, size_t &level ) { // The Test function for the TRIBL algorithm, returns a pointer to the // Target at the last matching position in the Tree, @@ -1826,7 +1808,7 @@ namespace Timbl { } IB_InstanceBase *TRIBL2_InstanceBase::TRIBL2_test( const Instance& Inst, - const ValueDistribution *& dist, + const ClassDistribution *& dist, size_t &level ){ // The Test function for the TRIBL2 algorithm, returns a pointer to the // the subtree Instance Base necessary for IB1 diff --git a/src/IGExperiment.cxx b/src/IGExperiment.cxx index 0b28572..8b3a30a 100644 --- a/src/IGExperiment.cxx +++ b/src/IGExperiment.cxx @@ -26,21 +26,22 @@ */ #include -#include -#include +#include #include #include -#include "timbl/MsgClass.h" #include "timbl/Common.h" #include "timbl/Types.h" #include "timbl/IBtree.h" +#include "timbl/Instance.h" #include "timbl/TimblExperiment.h" #include "ticcutils/Timer.h" +#include "ticcutils/PrettyPrint.h" namespace Timbl { using namespace std; using namespace icu; + using TiCC::operator<<; void IG_Experiment::InitInstanceBase(){ srand( RandomSeed() ); @@ -60,7 +61,7 @@ namespace Timbl { delete confusionInfo; confusionInfo = 0; if ( Verbosity(ADVANCED_STATS) ){ - confusionInfo = new ConfusionMatrix( Targets->num_of_values() ); + confusionInfo = new ConfusionMatrix( targets.num_of_values() ); } if ( !is_copy ){ InitWeights(); @@ -135,7 +136,7 @@ namespace Timbl { } UnicodeString Buffer; IG_InstanceBase *outInstanceBase = 0; - TargetValue *TopTarget = Targets->MajorityClass(); + TargetValue *TopTarget = targets.MajorityClass(); // cerr << "MAJORITY CLASS = " << 
TopTarget << endl; // Open the file. // @@ -193,7 +194,7 @@ namespace Timbl { UnicodeString Buffer; IG_InstanceBase *PartInstanceBase = 0; IG_InstanceBase *outInstanceBase = 0; - TargetValue *TopTarget = Targets->MajorityClass(); + TargetValue *TopTarget = targets.MajorityClass(); // cerr << "MAJORITY CLASS = " << TopTarget << endl; // Open the file. // @@ -368,12 +369,12 @@ namespace Timbl { exact = false; bool Tie = false; initExperiment(); - if ( !bestResult.reset( beamSize, normalisation, norm_factor, Targets ) ){ + if ( !bestResult.reset( beamSize, normalisation, norm_factor, targets ) ){ Warning( "no normalisation possible because a BeamSize is specified\n" "output is NOT normalized!" ); } const TargetValue *TV = NULL; - const ValueDistribution *ResultDist; + const ClassDistribution *ResultDist; ResultDist = InstanceBase->IG_test( Inst, match_depth, last_leaf, TV ); if ( match_depth == 0 ){ // when level 0, ResultDist == TopDistribution @@ -456,7 +457,8 @@ namespace Timbl { bool Hashed; int Version; string range_buf; - if ( !get_IB_Info( is, Pruned, Version, Hashed, range_buf ) ){ + size_t numF = get_IB_Info( is, Pruned, Version, Hashed, range_buf ); + if ( numF == 0 ){ return false; } else if ( !Pruned ){ @@ -465,7 +467,7 @@ namespace Timbl { } else { TreeOrder = DataFile; - Initialize(); + Initialize( numF ); if ( !get_ranges( range_buf ) ){ Warning( "couldn't retrieve ranges..." ); } @@ -478,24 +480,24 @@ namespace Timbl { KeepDistributions() ); int pos=0; for ( size_t i=0; i < NumOfFeatures(); ++i ){ - Features[i]->SetWeight( 1.0 ); - if ( Features[permutation[i]]->Ignore() ){ - PermFeatures[i] = NULL; + features[i]->SetWeight( 1.0 ); + if ( features[features.permutation[i]]->Ignore() ){ + features.perm_feats[i] = NULL; } else { - PermFeatures[pos++] = Features[permutation[i]]; + features.perm_feats[pos++] = features[features.permutation[i]]; } } if ( Hashed ){ - result = InstanceBase->ReadIB( is, PermFeatures, - *Targets, - *Targets->hash(), - *Features[0]->hash(), - Version ); + result = InstanceBase->ReadIB_hashed( is, + features, + targets, + Version ); } else { - result = InstanceBase->ReadIB( is, PermFeatures, - *Targets, + result = InstanceBase->ReadIB( is, + features, + targets, Version ); } if ( result ){ diff --git a/src/Instance.cxx b/src/Instance.cxx index 05c4de5..2aad439 100644 --- a/src/Instance.cxx +++ b/src/Instance.cxx @@ -24,1740 +24,21 @@ or send mail to: lamasoftware (at ) science.ru.nl */ -#include -#include -#include -#include -#include -#include // for sort() -#include // for accumulate() -#include -#include -#include "ticcutils/StringOps.h" -#include "ticcutils/UniHash.h" +#include -#include "timbl/Common.h" -#include "timbl/MsgClass.h" #include "timbl/Types.h" #include "timbl/Instance.h" -#include "timbl/Matrices.h" -#include "timbl/Metrics.h" using namespace std; -using namespace icu; namespace Timbl { - using namespace Common; - - size_t Vfield::Index() { return value->Index(); } - - ostream& operator<<(ostream& os, const Vfield *vd ) { - return vd->put( os ); - } - - ostream& operator<<(ostream& os, const Vfield& vd ) { - return vd.put( os ); - } - - ostream& Vfield::put( ostream& os ) const { - os << value << " " << weight; - return os; - } - - inline int random_number( int Min, int Max ){ - // calculate a random integer within the interval [min,max] - if ( Min == Max ){ - return Min; - } - double randnum = (double)rand()/(double)RAND_MAX; - randnum *= (Max-Min); - randnum += Min; - return (int)floor(randnum+0.5); - } - - void 
ValueDistribution::clear(){ - for ( const auto& d : distribution ){ - delete d.second; - } - distribution.clear(); - total_items = 0; - } - - double ValueDistribution::Confidence( const TargetValue *tv ) const { - auto it = find_if( distribution.begin(), distribution.end(), - [tv]( const std::pair& v ){ - return v.second->Value() == tv ; } ); - if ( it != distribution.end() ){ - return it->second->Weight(); - } - return 0.0; - } - - void ValueDistribution::DistToString( string& DistStr, double minf ) const { - ostringstream oss; - oss.setf(ios::showpoint); - bool first = true; - oss << "{ "; - for ( const auto& it : distribution ){ - Vfield *f = it.second; - if ( f->frequency >= minf ){ - if ( !first ){ - oss << ", "; - } - oss << f->value << " " << double(f->frequency); - first = false; - } - } - oss << " }"; - DistStr = oss.str(); - } - - void WValueDistribution::DistToString( string& DistStr, double minw ) const { - ostringstream oss; - oss.setf(ios::showpoint); - bool first = true; - oss << "{ "; - for( const auto& it : distribution ){ - Vfield *f = it.second; - if ( abs(f->weight) < minw ){ - continue; - } - if ( abs(f->weight) < Epsilon ){ - continue; - } - if ( !first ){ - oss << ", "; - } - oss << f->value << " " << f->weight; - first = false; - } - oss << " }"; - DistStr = oss.str(); - } - - class dblCmp { - public: - bool operator() ( const double d1, const double d2 ) const { - return d1 - d2 > Epsilon; - } - }; - - void ValueDistribution::DistToStringWW( string& DistStr, int beam ) const { - double minw = 0.0; - if ( beam > 0 ){ - std::set freqs; - for ( const auto& it : distribution ){ - Vfield *f = it.second; - freqs.insert( f->frequency ); - } - int cnt=0; - for ( const auto& rit : freqs ){ - if ( ++cnt == beam ) { - minw = rit; - break; - } - } - } - DistToString( DistStr, minw ); - } - - void WValueDistribution::DistToStringWW( string& DistStr, - int beam ) const { - double minw = 0.0; - if ( beam > 0 ){ - std::set wgths; - for ( const auto& it : distribution ){ - Vfield *f = it.second; - wgths.insert( f->weight ); - } - int cnt=0; - for ( const auto& rit : wgths ){ - if ( ++cnt == beam ) { - minw = rit; - break; - } - } - } - DistToString( DistStr, minw ); - } - - const string ValueDistribution::DistToString() const { - string result; - DistToString( result ); - return result; - } - - const string ValueDistribution::DistToStringW( int beam ) const { - string result; - DistToStringWW( result, beam ); - return result; - } - - double ValueDistribution::Entropy() const { - double entropy = 0.0; - size_t TotalVals = total_items; - if ( TotalVals > 0 ){ - // Loop over the classes in the distibution - for ( const auto& it : distribution ){ - size_t Freq = it.second->Freq(); - if ( Freq > 0 ){ - double Prob = Freq / (double)TotalVals; - entropy += Prob * Log2(Prob); - } - } - } - return fabs(entropy); - } - - void WValueDistribution::Normalize() { - double sum = accumulate( distribution.begin(), distribution.end(), - 0.0, - []( double r, const std::pair& v ){ - return r + v.second->Weight(); } ); - for ( auto& it : distribution ){ - it.second->SetWeight( it.second->Weight() / sum ); - } - } - - void WValueDistribution::Normalize_1( double factor, const Target *targ ) { - for ( const auto& val : targ->values_array ){ - // search for val, if not there: add entry with frequency factor; - // otherwise increment the ExamplarWeight - size_t id = val->Index(); - auto const& it = distribution.find( id ); - if ( it != distribution.end() ){ - it->second->SetWeight( it->second->Weight() + 
factor ); - } - else { - distribution[id] = new Vfield( val, 1, factor ); - } - } - total_items += targ->num_of_values(); - Normalize(); - } - - void WValueDistribution::Normalize_2( ) { - for ( const auto& d : distribution ){ - d.second->SetWeight( log1p( d.second->Weight() ) ); - } - Normalize(); - } - - ValueDistribution *ValueDistribution::to_VD_Copy( ) const { - ValueDistribution *res = new ValueDistribution(); - for ( const auto& d : distribution ){ - size_t key = d.first; - Vfield *vdf = d.second; - res->distribution[key] = new Vfield( vdf->Value(), - vdf->Freq(), - vdf->Freq() ); - } - res->total_items = total_items; - return res; - } - - WValueDistribution *ValueDistribution::to_WVD_Copy() const { - WValueDistribution *res = new WValueDistribution(); - for ( const auto& d : distribution ){ - size_t key = d.first; - Vfield *vdf = d.second; - res->distribution[key] = new Vfield( vdf->Value(), - vdf->Freq(), - vdf->Freq() ); - } - res->total_items = total_items; - return res; - } - - WValueDistribution *WValueDistribution::to_WVD_Copy( ) const { - WValueDistribution *result = new WValueDistribution(); - for ( const auto& d : distribution ){ - size_t key = d.first; - Vfield *vdf = d.second; - result->distribution[key] = new Vfield( vdf->Value(), - vdf->Freq(), - vdf->Weight() ); - } - result->total_items = total_items; - return result; - } - - - // - // special functions to serialize distibutions including both frequency - // AND weight information. Needed for store/retrieve InstanceBases - // - // First hashed variant: - // - - const string ValueDistribution::SaveHashed() const{ - ostringstream oss; - oss << "{ "; - bool first = true; - for ( const auto& it : distribution ){ - Vfield *f = it.second; - if ( f->frequency > 0 ){ - if ( !first ){ - oss << ", "; - } - oss << f->value->Index() << " " << f->frequency; - first = false; - } - } - oss << " }"; - return oss.str(); - } - - const string WValueDistribution::SaveHashed() const{ - ostringstream oss; - bool first = true; - oss << "{ "; - for ( const auto& it : distribution ){ - Vfield *f = it.second; - if ( f->frequency > 0 ){ - if ( !first ){ - oss << ", "; - } - oss << f->Value()->Index() << " " - << f->frequency << " " << f->weight; - first = false; - } - } - oss << " }"; - return oss.str(); - } - - // - // non-hashed variant: - // - - const string ValueDistribution::Save() const{ - ostringstream oss; - oss << "{ "; - bool first = true; - for ( const auto& it : distribution ){ - Vfield *f = it.second; - if ( f->frequency > 0 ){ - if ( !first ){ - oss << ", "; - } - oss << f->value << " " << f->frequency; - first = false; - } - } - oss << " }"; - return oss.str(); - } - - const string WValueDistribution::Save() const{ - ostringstream oss; - oss << "{ "; - bool first = true; - for ( const auto& it : distribution ){ - Vfield *f = it.second; - if ( f->frequency > 0 ){ - if ( !first ){ - oss << ", "; - } - oss.setf(ios::showpoint); - oss << f->value << " " << f->frequency << " " << f->weight; - first = false; - } - } - oss << " }"; - return oss.str(); - } - - void ValueDistribution::SetFreq( const TargetValue *val, const int freq, - double ){ - // add entry with frequency freq; - Vfield *temp = new Vfield( val, freq, freq ); - distribution[val->Index()] = temp; - total_items += freq; - } - - void WValueDistribution::SetFreq( const TargetValue *val, const int freq, - double sw ){ - // add entry with frequency freq; - // also sets the sample_weight - Vfield *temp = new Vfield( val, freq, sw ); - distribution[val->Index()] = temp; - 
total_items += freq; - } - - bool ValueDistribution::IncFreq( const TargetValue *val, - size_t occ, - double ){ - // search for val, if not there: add entry with frequency 'occ'; - // otherwise increment the freqency - size_t id = val->Index(); - auto const& it = distribution.find( id ); - if ( it != distribution.end() ){ - it->second->IncFreq( occ ); - } - else { - distribution[id] = new Vfield( val, occ, 1.0 ); - } - total_items += occ; - return true; - } - - bool WValueDistribution::IncFreq( const TargetValue *val, - size_t occ, - double sw ){ - // search for val, if not there: add entry with frequency 'occ'; - // otherwise increment the freqency - // also set sample weight - size_t id = val->Index(); - auto const& it = distribution.find( id ); - if ( it != distribution.end() ){ - it->second->IncFreq( occ ); - } - else { - distribution[id] = new Vfield( val, occ, sw ); - } - total_items += occ; - return fabs( distribution[id]->Weight() - sw ) > Epsilon; - } - - void ValueDistribution::DecFreq( const TargetValue *val ){ - // search for val, if not there, just forget - // otherwise decrement the freqency - auto const& it = distribution.find( val->Index() ); - if ( it != distribution.end() ){ - it->second->DecFreq(); - total_items -= 1; - } - } - - void ValueDistribution::Merge( const ValueDistribution& VD ){ - for ( const auto& it : VD.distribution ){ - size_t key = it.first; - Vfield *vd = it.second; - if ( distribution.find(key) != distribution.end() ){ - distribution[key]->AddFreq( vd->Freq() ); - } - else { - // VD might be weighted. But we don't need/want that info here - // Weight == Freq is more convenient - distribution[key] = new Vfield( vd->Value(), vd->Freq(), - vd->Freq() ); - } - } - total_items += VD.total_items; - } - - void WValueDistribution::MergeW( const ValueDistribution& VD, - double Weight ){ - for ( const auto& it : VD.distribution ){ - Vfield *vd = it.second; - size_t key = it.first; - if ( distribution.find(key) != distribution.end() ){ - distribution[key]->SetWeight( distribution[key]->Weight() + vd->Weight() *Weight ); - } - else { - distribution[key] = new Vfield( vd->Value(), 1, - vd->Weight() * Weight); - } - } - total_items += VD.total_items; - } - - const TargetValue *ValueDistribution::BestTarget( bool& tie, - bool do_rand ) const { - // get the most frequent target from the distribution. - // In case of a tie take the one which is GLOBALLY the most frequent, - // OR (if do_rand) take random one of the most frequents - // and signal if this ties also! 
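A toy sketch (not part of the patch) of the deterministic branch of the selection rule described above: take the locally most frequent class, and on a tie prefer the class that is globally more frequent while reporting that a tie occurred. The random branch is omitted, and 'global' is assumed to contain an entry for every class that occurs in 'local'.

    #include <cstddef>
    #include <map>
    #include <string>

    std::string best_target( const std::map<std::string,std::size_t>& local,
                             const std::map<std::string,std::size_t>& global,
                             bool& tie ){
      std::string best;
      std::size_t max_freq = 0;
      tie = false;
      for ( const auto& it : local ){
        if ( it.second > max_freq ){
          best = it.first;
          max_freq = it.second;
          tie = false;                          // a strict winner clears the tie
        }
        else if ( !best.empty() && it.second == max_freq ){
          tie = true;                           // equal local frequency: a tie
          if ( global.at( it.first ) > global.at( best ) ){
            best = it.first;                    // global frequency breaks it
          }
        }
      }
      return best;
    }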
- const TargetValue *best = NULL; - tie = false; - auto It = distribution.begin(); - if ( It != distribution.end() ){ - Vfield *pnt = It->second; - size_t Max = pnt->Freq(); - if ( do_rand ){ - int nof_best=1, pick=1; - ++It; - while ( It != distribution.end() ){ - pnt = It->second; - if ( pnt->Freq() > Max ){ - Max = pnt->Freq(); - nof_best = 1; - } - else { - if ( pnt->Freq() == Max ){ - nof_best++; - } - } - ++It; - } - tie = ( nof_best > 1 ); - pick = random_number( 1, nof_best ); - It = distribution.begin(); - nof_best = 0; - while ( It != distribution.end() ){ - pnt = It->second; - if ( pnt->Freq() == Max ){ - if ( ++nof_best == pick ){ - return pnt->Value(); - } - } - ++It; - } - return NULL; - } - else { - best = pnt->Value(); - ++It; - while ( It != distribution.end() ){ - pnt = It->second; - if ( pnt->Freq() > Max ){ - tie = false; - best = pnt->Value(); - Max = pnt->Freq(); - } - else { - if ( pnt->Freq() == Max ) { - tie = true; - if ( pnt->Value()->ValFreq() > best->ValFreq() ){ - best = pnt->Value(); - } - } - } - ++It; - } - return best; - } - } - return best; - } - - const TargetValue *WValueDistribution::BestTarget( bool& tie, - bool do_rand ) const { - // get the most frequent target from the distribution. - // In case of a tie take the one which is GLOBALLY the most frequent, - // OR (if do_rand) take random one of the most frequents - // and signal if this ties also! - const TargetValue *best = NULL; - auto It = distribution.begin(); - tie = false; - if ( It != distribution.end() ){ - double Max = It->second->Weight(); - if ( do_rand ){ - int nof_best=1, pick=1; - ++It; - while ( It != distribution.end() ){ - if ( It->second->Weight() > Max ){ - Max = It->second->Weight(); - nof_best = 1; - } - else { - if ( abs(It->second->Weight()- Max) < Epsilon ){ - nof_best++; - } - } - ++It; - } - tie = ( nof_best > 1 ); - pick = random_number( 1, nof_best ); - It = distribution.begin(); - nof_best = 0; - while ( It != distribution.end() ){ - if ( abs(It->second->Weight() - Max) < Epsilon ){ - if ( ++nof_best == pick ){ - return It->second->Value(); - } - } - ++It; - } - return NULL; - } - else { - best = It->second->Value(); - ++It; - while ( It != distribution.end() ){ - if ( It->second->Weight() > Max ){ - tie = false; - best = It->second->Value(); - Max = It->second->Weight(); - } - else { - if ( abs(It->second->Weight() - Max) < Epsilon ) { - tie = true; - if ( It->second->Value()->ValFreq() > best->ValFreq() ){ - best = It->second->Value(); - } - } - } - ++It; - } - return best; - } - } - return best; - } - - Feature::Feature( Hash::UnicodeHash *T ): - BaseFeatTargClass(T), - metric_matrix( 0 ), - metric( 0 ), - ignore( false ), - numeric( false ), - vcpb_read( false ), - PrestoreStatus(ps_undef), - Prestored_metric( UnknownMetric ), - entropy( 0.0 ), - info_gain (0.0), - split_info(0.0), - gain_ratio(0.0), - chi_square(0.0), - shared_variance(0.0), - standard_deviation(0.0), - matrix_clip_freq(10), - n_dot_j( 0 ), - n_i_dot( 0 ), - n_min (0.0), - n_max (0.0), - SaveSize(0), - SaveNum(0), - weight(0.0), - is_reference(false) - {} - - Feature::Feature( const Feature& in ): BaseFeatTargClass( in ){ - *this = in; - is_reference = true; - } - - Feature& Feature::operator=( const Feature& in ){ - if ( this != &in ){ - metric_matrix = in.metric_matrix; - metric = in.metric; - PrestoreStatus = in.PrestoreStatus; - Prestored_metric = in.Prestored_metric; - ignore = in.ignore; - numeric = in.numeric; - vcpb_read = in.vcpb_read; - entropy = in.entropy; - info_gain = in.info_gain; - 
split_info = in.split_info; - gain_ratio = in.gain_ratio; - chi_square = in.chi_square; - shared_variance = in.shared_variance; - standard_deviation = in.standard_deviation; - matrix_clip_freq = in.matrix_clip_freq; - n_dot_j = in.n_dot_j; - n_i_dot = in.n_i_dot; - n_min = in.n_min; - n_max = in.n_max; - SaveSize = in.SaveSize; - SaveNum = in.SaveNum; - weight = in.weight; - values_array = in.values_array; - reverse_values = in.reverse_values; - } - return *this; - } - - void Feature::InitSparseArrays(){ - if ( !is_reference ){ - // Loop over all values. - // - for ( const auto& FV : values_array ){ - size_t freq = FV->ValFreq(); - FV->ValueClassProb->Clear(); - if ( freq > 0 ){ - // Loop over all present classes. - // - for ( const auto& tit : FV->TargetDist ){ - FV->ValueClassProb->Assign( tit.second->Index(), - tit.second->Freq()/(double)freq ); - } - } - } - } - } - - struct D_D { - D_D(): dist(0), value(0.0) {}; - explicit D_D( FeatureValue *fv ): value(0.0) { - if ( !TiCC::stringTo( fv->Name(), value ) ){ - throw( logic_error("called DD with an non-numeric value" ) ); - } - dist = &fv->TargetDist; - } - ValueDistribution *dist; - double value; - }; - - bool dd_less( const D_D* dd1, const D_D* dd2 ){ - return dd1->value < dd2->value; - } - - void Feature::NumStatistics( vector& FVBin, - double DBentropy ){ - size_t BinSize = FVBin.size(); - double Prob, FVEntropy; - size_t TotalVals = TotalValues(); - entropy = 0.0; - vector ddv; - ddv.reserve( values_array.size() ); - for ( const auto& FV : values_array ){ - if ( FV->ValFreq() > 0 ){ - ddv.push_back( new D_D( FV ) ); - } - } - sort( ddv.begin(), ddv.end(), dd_less ); - size_t dd_len = ddv.size(); - int num_per_bin = (int)floor( (double)dd_len / BinSize); - size_t rest = dd_len - num_per_bin * BinSize; - if ( rest ){ - num_per_bin++; - } - int jj = 0; - int cnt = 0; - for ( size_t m = 0; m < dd_len; ++m ){ - FVBin[jj]->TargetDist.Merge( *ddv[m]->dist ); - if ( ++cnt >= num_per_bin ){ - ++jj; - if ( --rest == 0 ){ - --num_per_bin; - } - cnt = 0; - } - } - for ( size_t j=0; j < dd_len; ++j ){ - delete ddv[j]; - } - for ( size_t k=0; k < BinSize; k++ ){ - FeatureValue *pnt = FVBin[k]; - size_t Freq = pnt->TargetDist.totalSize(); - pnt->ValFreq( Freq ); - if ( Freq > 0 ){ - // Entropy for this FV pair. - // - FVEntropy = 0.0; - for ( const auto& it : pnt->TargetDist ){ - Prob = it.second->Freq()/(double)Freq; - FVEntropy += Prob * Log2(Prob); - } - entropy += -FVEntropy * Freq / (double)TotalVals; - } - } - entropy = fabs( entropy ); - // Info gain. - // - info_gain = DBentropy - entropy; - - // And the split info. - // - split_info = 0.0; - for ( size_t l=0; l < BinSize; ++l ){ - size_t Freq = FVBin[l]->ValFreq(); - if ( Freq > 0 ){ - Prob = Freq / (double)TotalVals; - split_info += Prob * Log2(Prob); - } - } - split_info = -split_info; - // Gain ratio. - // - if ( fabs(split_info) FVBin(BinSize); - for ( int i=0; i < BinSize; ++i ){ - sprintf( dumname, "dum%d", i ); - FVBin[i] = new FeatureValue( dumname ); - } - NumStatistics( FVBin, DBentropy ); - if ( full ){ - ChiSquareStatistics( FVBin, Targets ); - int cnt = 0; // count effective values in Bin - for ( int i=0; i < BinSize; ++i ){ - if ( FVBin[i]->ValFreq() > 0 ){ - ++cnt; - } - } - SharedVarianceStatistics( Targets, cnt ); - } - for ( int i=0; i < BinSize; ++i ){ - delete FVBin[i]; - } - } - - void Feature::Statistics( double DBentropy ){ - size_t TotalVals = TotalValues(); - entropy = 0.0; - // Loop over the values. 
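A worked, self-contained version of the quantities Statistics() computes, written against toy per-value class counts rather than Timbl's FeatureValue/TargetDist types (all names here are assumptions): conditional entropy H(C|f), info_gain = H(C) - H(C|f) clipped at zero, split_info = -sum_v P(v) log2 P(v), and gain_ratio = info_gain / split_info.

    #include <algorithm>
    #include <cmath>
    #include <map>
    #include <string>
    #include <vector>

    double log2p( double p ){ return std::log(p) / std::log(2.0); }

    void feature_stats( const std::vector< std::map<std::string,double> >& value_dists,
                        double db_entropy,           // H(C) of the whole data base
                        double& info_gain,
                        double& gain_ratio ){
      double total = 0.0;
      for ( const auto& dist : value_dists ){
        for ( const auto& cf : dist ){ total += cf.second; }
      }
      double entropy = 0.0;
      double split_info = 0.0;
      for ( const auto& dist : value_dists ){
        double val_freq = 0.0;
        for ( const auto& cf : dist ){ val_freq += cf.second; }
        if ( val_freq <= 0 || total <= 0 ){ continue; }
        double fv_entropy = 0.0;
        for ( const auto& cf : dist ){
          if ( cf.second > 0 ){
            double p = cf.second / val_freq;
            fv_entropy += p * log2p( p );
          }
        }
        entropy += -fv_entropy * val_freq / total;   // weighted conditional entropy
        double pv = val_freq / total;
        split_info += -pv * log2p( pv );             // entropy of the value split
      }
      info_gain  = std::max( db_entropy - entropy, 0.0 );
      gain_ratio = ( split_info < 1e-9 ) ? 0.0 : info_gain / split_info;
    }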
- for ( const auto& fv : values_array ){ - // Entropy for this FV pair. - size_t Freq = fv->ValFreq(); - if ( Freq > 0 ){ - double FVEntropy = 0.0; - for ( const auto& tit : fv->TargetDist ){ - double Prob = tit.second->Freq() / (double)Freq; - FVEntropy += Prob * Log2(Prob); - } - entropy += -FVEntropy * Freq / (double)TotalVals; - } - } - - entropy = fabs( entropy ); - // Info. gain. - // - info_gain = DBentropy - entropy; - if ( info_gain < 0.0 ){ - info_gain = 0.0; - } - // And the split. info. - // - split_info = 0.0; - for ( const auto& fv : values_array ){ - double Prob = fv->ValFreq() / (double)TotalVals; - if ( Prob > 0 ) { - split_info += Prob * Log2(Prob); - } - } - split_info = -split_info; - // Gain ratio. - // - if ( fabs(split_info) < Epsilon ){ - gain_ratio = 0.0; - } - else { - gain_ratio = info_gain / split_info; - } - } - - void Feature::ChiSquareStatistics( vector& FVA, - Target *Targets ){ - size_t Num_Vals = FVA.size(); - chi_square = 0.0; - long int n_dot_dot = 0; - size_t Size = Targets->num_of_values(); - if ( !n_dot_j ) { - n_dot_j = new long int[Size]; - n_i_dot = new long int[Num_Vals]; - SaveSize = Size; - SaveNum = Num_Vals; - } - else { - if ( SaveSize < Size ){ - delete [] n_dot_j; - n_dot_j = new long int[Size]; - SaveSize = Size; - } - if ( SaveNum < Num_Vals ){ - delete [] n_i_dot; - n_i_dot = new long int[Num_Vals]; - SaveNum = Num_Vals; - } - } - for ( size_t j = 0; j < Size; ++j ){ - n_dot_j[j] = 0; - } - for ( size_t i = 0; i < Num_Vals; ++i ){ - n_i_dot[i] = 0; - FeatureValue *fv = FVA[i]; - for ( const auto& tit : fv->TargetDist ){ - n_dot_j[tit.second->Index()-1] += tit.second->Freq(); - n_i_dot[i] += tit.second->Freq(); - } - n_dot_dot += n_i_dot[i]; - } - if ( n_dot_dot != 0 ){ - for ( size_t m = 0; m < Num_Vals; ++m ){ - FeatureValue *fv = FVA[m]; - size_t n = 0; - for ( const auto& it : fv->TargetDist ){ - if ( n >= Size ){ - break; - } - while ( n < it.second->Index()-1 ){ - double tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) / - (double)n_dot_dot; - chi_square += tmp; - } - if ( n == it.second->Index()-1 ){ - double tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) / - (double)n_dot_dot; - if ( fabs(tmp) > Epsilon){ - chi_square += ( (tmp - it.second->Freq()) * - (tmp - it.second->Freq()) ) / tmp; - } - } - else { - break; - } - } - while ( n < Size ){ - double tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) / - (double)n_dot_dot; - chi_square += tmp; - } - } - } - } - - void Feature::ChiSquareStatistics( Target *Targets ){ - chi_square = 0.0; - long int n_dot_dot = 0; - size_t Size = Targets->num_of_values(); - size_t Num_Vals = values_array.size(); - if ( !n_dot_j ) { - n_dot_j = new long int[Size]; - n_i_dot = new long int[Num_Vals]; - SaveSize = Size; - SaveNum = Num_Vals; - } - else { - if ( SaveSize < Size ){ - delete [] n_dot_j; - n_dot_j = new long int[Size]; - SaveSize = Size; - } - if ( SaveNum < Num_Vals ){ - delete [] n_i_dot; - n_i_dot = new long int[Num_Vals]; - SaveNum = Num_Vals; - } - } - for ( size_t j = 0; j < Size; ++j ){ - n_dot_j[j] = 0; - } - int i = 0; - for ( const auto& fv : values_array ){ - n_i_dot[i] = 0; - for ( const auto& t_it : fv->TargetDist ){ - long int fr = t_it.second->Freq(); - n_dot_j[t_it.second->Index()-1] += fr; - n_i_dot[i] += fr; - } - n_dot_dot += n_i_dot[i]; - ++i; - } - if ( n_dot_dot != 0 ){ - int m = 0; - for ( const auto& fv : values_array ){ - size_t n = 0; - for ( const auto& t_it : fv->TargetDist ){ - if ( n >= Size ){ - break; - } - size_t id = t_it.second->Index()-1; - long int fr 
= t_it.second->Freq(); - while ( n < id ){ - double tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) / - (double)n_dot_dot; - chi_square += tmp; - } - if ( n == id ){ - double tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) / - (double)n_dot_dot; - if ( fabs(tmp) > Epsilon ){ - chi_square += ( (tmp - fr ) * (tmp - fr ) ) / tmp; - } - } - else { - break; - } - } - while ( n < Size ){ - double tmp = ((double)n_dot_j[n++] * (double)n_i_dot[m]) / - (double)n_dot_dot; - chi_square += tmp; - } - ++m; - } - } - } - - double Feature::fvDistance( FeatureValue *F, FeatureValue *G, - size_t limit ) const { - double result = 0.0; - if ( F != G ){ - bool dummy; - if ( metric->isStorable() && matrixPresent( dummy ) && - F->ValFreq() >= matrix_clip_freq && - G->ValFreq() >= matrix_clip_freq ){ - result = metric_matrix->Extract( F, G ); - } - else if ( metric->isNumerical() ) { - result = metric->distance( F, G, limit, Max() - Min() ); - } - else { - result = metric->distance( F, G, limit ); - } - } - return result; - } - - ostream& operator<<(ostream& os, const ValueDistribution& vd ) { - string tmp; - vd.DistToString( tmp ); - os << tmp; - return os; - } - - ostream& operator<<(ostream& os, const ValueDistribution *vd ) { - string tmp = "{null}"; - if ( vd ){ - vd->DistToString( tmp ); - } - os << tmp; - return os; - } - - ValueDistribution *ValueDistribution::read_distribution( istream &is, - Target& Targ, - bool do_fr ){ - // read a distribution from stream is into Target - // if do_f we also adjust the value of Frequency of the Target, which is - // otherwise 1. Special case when reading the TopDistribution. - // - ValueDistribution *result = 0; - char nextCh; - is >> nextCh; // skip { - if ( nextCh != '{' ){ - throw runtime_error( "missing '{' in distribution string." ); - } - else { - int next; - do { - size_t freq; - UnicodeString buf; - is >> ws >> buf; - is >> freq; - TargetValue *target; - if ( do_fr ){ - target = Targ.add_value( buf, freq ); - } - else { - target = Targ.Lookup( buf ); - } - if ( !target ){ - delete result; - result = 0; - break; - } - next = look_ahead(is); - if ( next == ',' ){ - if ( !result ) { - result = new ValueDistribution(); - } - result->SetFreq( target, freq ); - is >> nextCh; - next = look_ahead(is); - } - else if ( next == '}' ){ - if ( !result ){ - result = new ValueDistribution(); - } - result->SetFreq( target, freq ); - } - else if ( isdigit(next) ){ - if ( !result ){ - result = new WValueDistribution(); - } - double sw; - is >> sw; - result->SetFreq( target, freq, sw ); - next = look_ahead(is); - if ( next == ',' ){ - is >> nextCh; - next = look_ahead(is); - } - } - } while ( is && next != '}' ); - if ( is ){ - is >> nextCh; // skip } - } - else { - delete result; - throw runtime_error( "missing '}' in distribution string." ); - } - } - return result; - } - - - ValueDistribution *ValueDistribution::read_distribution_hashed( istream &is, - Target& Targ, - bool do_fr ){ - - ValueDistribution *result = 0; - // read a distribution from stream is into Target - // if do_f we also adjust the value of Frequency of the Target, which is - // otherwise 1. Special case when reading the TopDistribution. - // - char nextCh; - is >> nextCh; // skip { - if ( nextCh != '{' ){ - throw runtime_error( "missing '{' in distribution string." 
); - } - else { - int next; - do { - unsigned int index; - size_t freq; - is >> index; - is >> freq; - TargetValue *target; - if ( do_fr ){ - target = Targ.add_value( index, freq ); - } - else { - target = Targ.ReverseLookup( index ); - } - if ( !target ){ - delete result; - result = 0; - break; - } - next = look_ahead(is); - if ( next == ',' ){ - if ( !result ){ - result = new ValueDistribution(); - } - result->SetFreq( target, freq ); - is >> nextCh; - next = look_ahead(is); - } - else if ( next == '}' ){ - if ( !result ){ - result = new ValueDistribution(); - } - result->SetFreq( target, freq ); - } - else if ( isdigit(next) ){ - double sw; - is >> sw; - if ( !result ){ - result = new WValueDistribution(); - } - result->SetFreq( target, freq, sw ); - next = look_ahead(is); - if ( next == ',' ){ - is >> nextCh; - next = look_ahead(is); - } - } - } while ( is && next != '}' ); - if ( is ){ - is >> nextCh; // skip } - } - else { - delete result; - throw runtime_error( "missing '}' in distribution string" ); - } - } - return result; - } - - - ostream& operator<<( std::ostream& os, ValueClass const *vc ){ - if ( vc ){ - os << vc->Name(); - } - else { - os << "*FV-NF*"; - } - return os; - } - - FeatureValue::FeatureValue( const UnicodeString& value, - size_t hash_val ): - ValueClass( value, hash_val ), ValueClassProb( 0 ) { - } - - FeatureValue::FeatureValue( const UnicodeString& s ): - ValueClass( s, 0 ), - ValueClassProb(0){ - Frequency = 0; - } - - FeatureValue::~FeatureValue( ){ - delete ValueClassProb; - } - - TargetValue::TargetValue( const UnicodeString& value, - size_t value_hash ): - ValueClass( value, value_hash ){} - - size_t Target::EffectiveValues() const { - return count_if( values_array.begin(), values_array.end(), - [&]( const TargetValue* v ){ - return (v->ValFreq() > 0); } ); - } - - size_t Feature::EffectiveValues() const { - return count_if( values_array.begin(), values_array.end(), - [&]( const FeatureValue* v ){ - return (v->ValFreq() > 0); } ); - } - - size_t Target::TotalValues() const { - return accumulate( values_array.begin(), values_array.end(), - 0, - [&]( size_t r, const TargetValue *v ){ - return r + v->ValFreq(); } ); - } - - size_t Feature::TotalValues() const { - return accumulate( values_array.begin(), values_array.end(), - 0, - [&]( size_t r, const FeatureValue *v ){ - return r + v->ValFreq(); } ); - } - - FeatureValue *Feature::Lookup( const UnicodeString& str ) const { - FeatureValue *result = NULL; - unsigned int index = TokenTree->lookup( str ); - if ( index ) { - auto const& it = reverse_values.find( index ); - if ( it != reverse_values.end() ){ - result = it->second; - } - } - return result; - } - - FeatureValue *Feature::add_value( const UnicodeString& valstr, - TargetValue *tv, - int freq ){ - unsigned int hash_val = TokenTree->hash( valstr ); - // cerr << "hash(" << valstr << ") geeft: " << hash_val << endl; - return add_value( hash_val, tv, freq ); - } - - FeatureValue *Feature::add_value( size_t index, - TargetValue *tv, - int freq ){ - auto const& it = reverse_values.find( index ); - if ( it == reverse_values.end() ){ - const UnicodeString& value = TokenTree->reverse_lookup( index ); - // cerr << "lookup(" << index << ") geeft: " << value << endl; - // we want to store the singleton value for this index - // so we MUST reverse lookup the index - FeatureValue *fv = new FeatureValue( value, index ); - fv->ValFreq( freq ); - reverse_values[index] = fv; - values_array.push_back( fv ); - } - else { - it->second->IncValFreq( freq ); - } - FeatureValue 
*result = reverse_values[index]; - if ( tv ){ - result->TargetDist.IncFreq(tv, freq ); - } - return result; - } - - bool Feature::increment_value( FeatureValue *FV, - TargetValue *tv ){ - bool result = false; - if ( FV ){ - FV->incr_val_freq(); - if ( tv ){ - FV->TargetDist.IncFreq(tv,1); - } - result = true; - } - return result; - } - - bool Feature::decrement_value( FeatureValue *FV, TargetValue *tv ){ - bool result = false; - if ( FV ){ - FV->decr_val_freq(); - if ( tv ){ - FV->TargetDist.DecFreq(tv); - } - result = true; - } - return result; - } - - bool Feature::AllocSparseArrays( size_t Dim ){ - // Loop over all values. - // - for ( const auto& FV : values_array ){ - // Loop over all classes. - if ( FV->ValueClassProb == NULL ){ - if ( !(FV->ValueClassProb = new SparseValueProbClass( Dim )) ){ - return false; - } - } - } - return true; - } - - bool Feature::isNumerical() const { - if ( metric && metric->isNumerical() ){ - return true; - } - else { - return false; - } - } - - bool Feature::isStorableMetric() const { - if ( metric && metric->isStorable() ){ - return true; - } - else { - return false; - } - } - - BaseFeatTargClass::BaseFeatTargClass( Hash::UnicodeHash *T ): - TokenTree( T ) - {} - - BaseFeatTargClass::BaseFeatTargClass( const BaseFeatTargClass& in ): - MsgClass( in ), - TokenTree( in.TokenTree ) - {} - - BaseFeatTargClass::~BaseFeatTargClass(){ - } - - Target::~Target() { - for ( const auto& it : values_array ){ - delete it; - } - reverse_values.clear(); - } - - TargetValue *Target::Lookup( const UnicodeString& str ) const { - TargetValue *result = 0; - size_t index = TokenTree->lookup( str ); - if ( index ) { - auto const& it = reverse_values.find( index ); - result = it->second; - } - return result; - } - - TargetValue *Target::ReverseLookup( size_t index ) const { - auto const& it = reverse_values.find( index ); - return it->second; - } - - Feature::~Feature(){ - if ( !is_reference ){ - if ( n_dot_j ) { - delete [] n_dot_j; - delete [] n_i_dot; - } - delete_matrix(); - delete metric; - for ( const auto& it : values_array ){ - delete it; - } - } - reverse_values.clear(); - } - - bool Feature::matrixPresent( bool& isRead ) const { - isRead = false; - if ( metric_matrix != 0 ){ - if ( PrestoreStatus == ps_ok ){ - return true; - } - else if ( PrestoreStatus == ps_read ){ - isRead = true; - return true; - } - } - return false; - } - - size_t Feature::matrix_byte_size() const { - if ( metric_matrix ){ - return metric_matrix->NumBytes(); - } - else { - return 0; - } - } - - FeatVal_Stat Feature::prepare_numeric_stats(){ - bool first = true; - for ( const auto& fv : values_array ){ - size_t freq = fv->ValFreq(); - if ( freq > 0 ){ - double tmp = -1; - if ( !TiCC::stringTo( fv->Name(), tmp ) ){ - Warning( "a Non Numeric value '" + fv->Name() + - "' in Numeric Feature!" 
); - return NotNumeric; - } - if ( first ){ - first = false; - n_min = tmp; - n_max = tmp; - } - else if ( tmp < n_min ){ - n_min = tmp; - } - else if ( tmp > n_max ){ - n_max = tmp; - } - } - } - if ( fabs(n_max - n_min) < Epsilon ){ - return SingletonNumeric; - } - else { - return NumericValue; - } - } - - inline int min( int i1, int i2 ) { return (i1>i2?i2:i1); } - inline size_t min( size_t i1, size_t i2 ) { return (i1>i2?i2:i1); } - - void Feature::SharedVarianceStatistics( Target *Targ, int eff_cnt ){ - size_t NumInst = Targ->TotalValues(); - int NumCats = Targ->EffectiveValues(); - int k = min( NumCats, eff_cnt ) - 1; - if ( k == 0 || NumInst == 0 ){ - shared_variance = 0; - } - else { - shared_variance = chi_square / (double)( NumInst * k ); - } - } - - void Feature::StandardDeviationStatistics( ){ - double sum = 0.0; - vector store( values_array.size() ); - for ( unsigned int i=0; i < values_array.size(); ++i ){ - FeatureValue *FV = values_array[i]; - double val = TiCC::stringTo( FV->Name() ); - store[i] = val; - sum += val; - } - double total = 0.0; - for ( unsigned int i=0; i < values_array.size(); ++i ){ - double diff = sum - store[i]; - total += diff*diff; - } - standard_deviation = sqrt( total / values_array.size() ); - } - - void Feature::clear_matrix(){ - if ( PrestoreStatus == ps_read ){ - return; - } - else { - delete_matrix(); - } - } - - void Feature::delete_matrix(){ - if ( metric_matrix ){ - metric_matrix->Clear(); - delete metric_matrix; - } - metric_matrix = 0; - PrestoreStatus = ps_undef; - } - - bool Feature::setMetricType( const MetricType M ){ - if ( !metric || M != metric->type() ){ - delete metric; - metric = getMetricClass(M); - return true; - } - else { - return false; - } - } - - MetricType Feature::getMetricType() const { return metric->type(); } - - bool Feature::store_matrix( int limit){ - // - // Store a complete distance matrix. - // - if ( PrestoreStatus == ps_read ){ - return true; - } - if ( !metric_matrix ){ - metric_matrix = new SparseSymetricMatrix(); - } - if ( PrestoreStatus != ps_failed && metric->isStorable( ) ) { - try { - for ( const auto& FV_i : values_array ){ - for ( const auto& FV_j : values_array ){ - if ( FV_i->ValFreq() >= matrix_clip_freq && - FV_j->ValFreq() >= matrix_clip_freq && - ( Prestored_metric != metric->type() || - fabs(metric_matrix->Extract(FV_i,FV_j)) < Epsilon ) ){ - double dist = metric->distance( FV_i, FV_j, limit ); - metric_matrix->Assign( FV_i, FV_j, dist ); - } - } - } - } - catch( ... ){ - cout << "hit the ground!" 
<< endl; - PrestoreStatus = ps_failed; - return false; - }; - PrestoreStatus = ps_ok; - } - if ( PrestoreStatus == ps_ok ){ - Prestored_metric = metric->type(); - } - return true; - } - - ostream& operator<< (std::ostream& os, SparseValueProbClass *VPC ){ - if ( VPC ) { - int old_prec = os.precision(); - os.precision(3); - os.setf( std::ios::fixed ); - auto it = VPC->vc_map.begin(); - for ( size_t k = 1; k <= VPC->dimension; ++k ){ - os.setf(std::ios::right, std::ios::adjustfield); - if ( it != VPC->vc_map.end() && - it->first == k ){ - os << "\t" << it->second; - ++it; - } - else { - os << "\t" << 0.0; - } - } - os << setprecision( old_prec ); - } - else { - os << "(Null SA)"; - } - return os; - } - - void Feature::print_vc_pb_array( ostream &os ) const { - for ( const auto& FV : values_array ){ - if ( FV->ValueClassProb ){ - os << FV << FV->ValueClassProb << endl; - } - } - } - - bool Feature::read_vc_pb_array( istream &is ){ - unsigned int Num = 0; - bool first = true; - // clear all existing arrays - for ( const auto& FV : values_array ){ - if ( FV->ValueClassProb ){ - delete FV->ValueClassProb; - FV->ValueClassProb = NULL; - } - } - UnicodeString buf; - while ( TiCC::getline( is, buf ) ){ - if ( buf.length() < 8 ){ // "empty" line separates matrices - break; - } - vector parts = TiCC::split( buf ); - if ( first ){ - Num = parts.size() - 1; - first = false; - } - UnicodeString name = parts[0]; - FeatureValue *FV = Lookup( name ); - if ( !FV ){ - Warning( "Unknown FeatureValue '" + TiCC::UnicodeToUTF8(name) - + "' in file, (skipped) " ); - continue; - } - else { - FV->ValueClassProb = new SparseValueProbClass( Num ); - for ( size_t i=0; i < Num; ++i ){ - UnicodeString tname = parts[i+1]; - double value; - if ( !TiCC::stringTo( tname, value ) ){ - Error( "Found illegal value '" + TiCC::UnicodeToUTF8(tname) + "'" ); - return false; - } - else if ( value > Epsilon ) { - FV->ValueClassProb->Assign( i, value ); - } - } - } - } - // check if we've got all the values, assign a default if not so - for ( const auto& FV : values_array ){ - if ( FV->ValueClassProb == NULL ){ - FV->ValueClassProb = new SparseValueProbClass( Num ); - } - } - vcpb_read = true; - return true; - } - - bool Feature::fill_matrix( istream &is ) { - if ( !metric_matrix ){ - metric_matrix = new SparseSymetricMatrix(); - } - else { - metric_matrix->Clear(); - } - UnicodeString line; - while ( TiCC::getline(is,line) ){ - if ( line.isEmpty() ){ - break; - } - vector arr = TiCC::split_at( line, " " ); - size_t num = arr.size(); - double d; - if ( num != 2 ){ - Error( "wrong line in inputfile" ); - return false; - } - else if ( arr[0].length() < 2 ){ - Error( "wrong line in inputfile" ); - return false; - } - else if ( !TiCC::stringTo( arr[1], d ) ) { - Error( "wrong line in inputfile" ); - return false; - } - else { - UnicodeString stripped = UnicodeString( arr[0], 1,arr[0].length()-2) ; - vector parts = TiCC::split_at( stripped, ",\t" ); - if ( parts.size() != 2 ){ - Error( "wrong line in inputfile" ); - return false; - } - else { - FeatureValue *F1 = Lookup(parts[0]); - FeatureValue *F2 = Lookup(parts[1]); - metric_matrix->Assign( F1, F2, d ); - } - } - } - PrestoreStatus = ps_read; - return true; - } - - void Feature::print_matrix( ostream &os, bool full ) const { - // - // Print the matrix. 
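// Illustration: judging from the splitting logic in fill_matrix() above, the reader expects one
// pre-computed distance per line as two space-separated fields, where the first field is a
// bracketed "value1,value2" pair (the outer characters are stripped, whatever they are) and the
// second field is a numeric distance; an empty line ends the matrix for that feature. The file
// name and the concrete values below are hypothetical, only meant to show the shape of such a
// section (a sketch, not part of the patch's own sources):
#include <fstream>
void write_example_matrix(){
  std::ofstream os( "feature3.matrix" );   // hypothetical file name
  os << "[low,mid] 0.25\n"                 // distance between values 'low' and 'mid'
     << "[low,high] 0.75\n"
     << "[mid,high] 0.5\n"
     << "\n";                              // empty line terminates this feature's matrix
}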
- // - int old_prec = os.precision(); - ios::fmtflags old_flags = os.flags(); - os.unsetf(std::ios_base::floatfield); - if ( full ){ - for ( const auto& FV_i : values_array ){ - os.width(6); - os.setf(ios::left, ios::adjustfield); - os << FV_i << ":"; - os.width(12); - os.precision(3); - os.setf(ios::right, ios::adjustfield); - for ( const auto& FV_j : values_array ){ - os.width(12); - os.precision(3); - os.setf(ios::right,ios::adjustfield ); - if ( FV_i->ValFreq() < matrix_clip_freq || - FV_j->ValFreq() < matrix_clip_freq ){ - os << "*"; - } - else { - os << metric_matrix->Extract(FV_i,FV_j); - } - } - os << endl; - } - } - else { - os << *metric_matrix << endl; - } - os << setprecision( old_prec ); - os.flags( old_flags ); - } - - TargetValue *Target::add_value( const UnicodeString& valstr, int freq ){ - unsigned int hash_val = TokenTree->hash( valstr ); - // cerr << "target hash(" << valstr << ") geeft: " << hash_val << endl; - return add_value( hash_val, freq ); - } - - TargetValue *Target::add_value( size_t index, int freq ){ - auto const& it = reverse_values.find( index ); - if ( it == reverse_values.end() ){ - const UnicodeString& name = TokenTree->reverse_lookup( index ); - // cerr << "target lookup(" << index << ") geeft: " << name << endl; - // we want to store the singleton value for this index - // so we MUST reverse lookup the index - TargetValue *tv = new TargetValue( name, index ); - tv->ValFreq( freq ); - reverse_values[index] = tv; - values_array.push_back( tv ); - } - else { - it->second->IncValFreq( freq ); - } - return reverse_values[index]; - } - - TargetValue *Target::MajorityClass() const { - TargetValue *result = 0; - size_t freq = 0; - for ( const auto& it : values_array ){ - if ( it->ValFreq() > freq ){ - result = it; - freq = result->ValFreq(); - } - } - return result; - } - - bool Target::increment_value( TargetValue *TV ){ - bool result = false; - if ( TV ){ - TV->incr_val_freq(); - result = true; - } - return result; - } - - bool Target::decrement_value( TargetValue *TV ){ - bool result = false; - if ( TV ){ - TV->decr_val_freq(); - result = true; - } - return result; - } Instance::Instance(): - TV(NULL), sample_weight(0.0), occ(1) { + TV(NULL), + sample_weight(0.0), + occ(1) + { } Instance::~Instance(){ @@ -1765,13 +46,13 @@ namespace Timbl { } void Instance::clear(){ - for ( unsigned int i=0; i < FV.size(); ++i ){ - if ( FV[i] ){ - if ( FV[i]->isUnknown() ){ - delete FV[i]; + for ( auto& it : FV ){ + if ( it ){ + if ( it->isUnknown() ){ + delete it; } } - FV[i] = 0; + it = 0; } TV = 0; sample_weight = 0.0; @@ -1782,7 +63,7 @@ namespace Timbl { FV.resize( len, 0 ); } - ostream& operator<<(ostream& os, const Instance *I ){ + ostream& operator<<( ostream& os, const Instance *I ){ if ( I ){ os << *I; } @@ -1792,9 +73,9 @@ namespace Timbl { return os; } - ostream& operator<<(ostream& os, const Instance& I ){ - for ( unsigned int i=0; i < I.FV.size(); ++i ){ - os << I.FV[i] << ", "; + ostream& operator<<( ostream& os, const Instance& I ){ + for ( const auto& it : I.FV ){ + os << it << ", "; } os << I.TV << " " << I.sample_weight; return os; diff --git a/src/LOOExperiment.cxx b/src/LOOExperiment.cxx index 6ece36b..21f74a4 100644 --- a/src/LOOExperiment.cxx +++ b/src/LOOExperiment.cxx @@ -26,7 +26,7 @@ */ #include -#include +#include #include #include @@ -36,6 +36,7 @@ #include "timbl/Common.h" #include "timbl/Types.h" #include "timbl/IBtree.h" +#include "timbl/Instance.h" #include "timbl/MBLClass.h" #include "timbl/TimblExperiment.h" @@ -113,7 +114,7 @@ namespace 
Timbl { delete confusionInfo; confusionInfo = 0; if ( Verbosity(ADVANCED_STATS) ){ - confusionInfo = new ConfusionMatrix( Targets->num_of_values() ); + confusionInfo = new ConfusionMatrix( targets.num_of_values() ); } showTestingInfo( *mylog ); // Start time. diff --git a/src/MBLClass.cxx b/src/MBLClass.cxx index 743189a..996d75c 100644 --- a/src/MBLClass.cxx +++ b/src/MBLClass.cxx @@ -24,6 +24,7 @@ or send mail to: lamasoftware (at ) science.ru.nl */ + #include #include #include @@ -33,7 +34,6 @@ #include #include -#include #include #include "ticcutils/StringOps.h" @@ -59,14 +59,17 @@ using namespace icu; using namespace nlohmann; namespace Timbl { + using TiCC::operator<<; - void MBLClass::fill_table(){ + void MBLClass::init_options_table( size_t Size ){ if ( tableFilled ){ return; } else { tableFilled = true; } + MaxFeatures = Size; + UserOptions.resize(MaxFeatures+1); //cerr << "fill table() for " << (void*)this << endl; bool stat = Options.Add( new IntegerOption( "FLENGTH", @@ -100,8 +103,8 @@ namespace Timbl { &target_pos, std::numeric_limits::max(), 0, MaxFeatures ) ); + // cerr << "STAT 1=" << (stat?"true":"false") << endl; if ( stat ){ - Options.SetFreezeMark(); stat = Options.Add( new BoolOption( "DO_SILLY", &do_silly_testing, false ) ) @@ -149,13 +152,14 @@ namespace Timbl { && Options.Add( new IntegerOption( "CLIP_FACTOR", &clip_factor, 10, 0, 1000000 ) ); } + // cerr << "STAT 2=" << (stat?"true":"false") << endl; if ( !stat ){ FatalError( "Too many options for OptionTable" ); } } void MBLClass::InvalidMessage(void) const{ - if ( err_count++ == 1 ){ + if ( err_cnt++ == 1 ){ Warning( "A preceding error prevents any operation on this " "Timbl Object\n" "other experiments might not be influenced" ); @@ -167,7 +171,7 @@ namespace Timbl { bool MBLClass::SetOption( const string& line ){ bool result = false; - if ( !ExpInvalid() ){ + if ( !ExpInvalid(true) ){ // Info( "set Option:" + line ); enum SetOptRes opt_res = Options.SetOption( line ); switch ( opt_res ){ @@ -191,55 +195,69 @@ namespace Timbl { return result; } - void MBLClass::InitClass( const size_t Size ){ - GlobalMetric = 0; - is_copy = false; - is_synced = false; - sock_os = 0; - sock_is_json = false; - Targets = NULL; - err_count = 0; - MBL_init = false; - tableFilled = false; - need_all_weights = false; - InstanceBase = NULL; - TargetStrings = NULL; - FeatureStrings = NULL; - num_of_features = 0; - target_pos = std::numeric_limits::max(); - mvd_threshold = 1; - effective_feats = 0; - num_of_num_features = 0; - DBEntropy = -1.0; - ChopInput = 0; - MaxFeatures = Size; - runningPhase = LearnWords; - do_sloppy_loo = false; - do_silly_testing = false; - do_diversify = false; - keep_distributions = false; - UserOptions.resize(MaxFeatures+1); - tester = 0; - // cerr << "call fill table() in InitClass()" << endl; - fill_table(); - decay = 0; - myerr = &cerr; - mylog = &cout; - } - - MBLClass::MBLClass( const string& name ){ - tableFilled = false; - exp_name = name; + MBLClass::MBLClass( const string& name ): + MsgClass(), + sock_os(0), + sock_is_json(false), + targets(NULL), + InstanceBase(NULL), + mylog(&cout), + myerr(&cerr), + runningPhase(LearnWords), + Weighting(GR_w), + GlobalMetric(0), + TreeOrder(UnknownOrdening), + num_of_neighbors(1), + dynamic_neighbors(false), + decay_flag(Zero), + exp_name( name ), + MaxBests(500), + decay(0), + beamSize(0), + normalisation(noNorm), + norm_factor(1.0), + is_copy(false), + is_synced(false), + ib2_offset(0), + random_seed(-1), + decay_alfa(1.0), + decay_beta(1.0), + MBL_init(false), + 
tableFilled(false), + globalMetricOption(Overlap), + do_diversify(false), + ChopInput(0), + F_length(0), + MaxFeatures(0), + input_format(UnknownInputFormat), + verbosity(NO_VERB), + target_pos(std::numeric_limits::max()), + clip_factor(10), + Bin_Size(20), + progress(10000), + tribl_offset(0), + igThreshold(1000), + mvd_threshold(1), + do_sloppy_loo(false), + do_exact_match(false), + do_silly_testing(false), + hashed_trees(true), + need_all_weights(false), + do_sample_weighting(false), + do_ignore_samples(true), + no_samples_test(true), + keep_distributions(false), + DBEntropy(-1.0), + tester(0), + doOcc(0) + { } MBLClass &MBLClass::operator=( const MBLClass& m ){ if ( this != &m ){ is_copy = true; is_synced = false; - MaxFeatures = m.MaxFeatures; - UserOptions.resize(MaxFeatures+1); - // cerr << "call fill table() in assign" << endl; - fill_table(); + init_options_table( m.MaxFeatures ); F_length = m.F_length; MaxBests = m.MaxBests; TreeOrder = m.TreeOrder; @@ -267,7 +285,6 @@ namespace Timbl { mvd_threshold = m.mvd_threshold; num_of_neighbors = m.num_of_neighbors; dynamic_neighbors = m.dynamic_neighbors; - num_of_features = m.num_of_features; target_pos = m.target_pos; progress = m.progress; Bin_Size = m.Bin_Size; @@ -279,34 +296,17 @@ namespace Timbl { do_sloppy_loo = m.do_sloppy_loo; do_silly_testing = m.do_silly_testing; do_diversify = m.do_diversify; - permutation = m.permutation; tester = 0; decay = 0; - Features = m.Features; - PermFeatures = m.PermFeatures; - for ( unsigned int i=0; i < Features.size(); ++i ){ - Features[i] = new Feature( *m.Features[i] ); - if ( m.PermFeatures[i] ) { - PermFeatures[i] = Features[permutation[i]]; - } - else { - PermFeatures[i] = 0; - } - } - Targets = m.Targets; - err_count = 0; + targets = m.targets; + features = m.features; MBL_init = false; need_all_weights = false; InstanceBase = m.InstanceBase->Copy(); - TargetStrings = m.TargetStrings; - FeatureStrings = m.FeatureStrings; - effective_feats = m.effective_feats; - num_of_num_features = m.num_of_num_features; DBEntropy = -1.0; ChopInput = 0; setInputFormat( m.input_format ); - //one extra to store the target! 
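// The constructor rewrite above replaces the old InitClass() helper with a full
// member-initializer list, so every member of MBLClass receives a deterministic value the
// moment the object is constructed. A minimal stand-in sketch of that pattern follows; the
// class and members below are simplified placeholders, not the real MBLClass:
#include <cstddef>
#include <limits>
#include <string>
class MiniExperiment {
public:
  explicit MiniExperiment( const std::string& name ):
    exp_name( name ),                                    // every member initialised here,
    target_pos( std::numeric_limits<size_t>::max() ),    // so no separate init routine can
    is_copy( false ),                                    // be forgotten or run twice
    err_cnt( 0 )
  {}
private:
  std::string exp_name;
  size_t target_pos;
  bool is_copy;
  int err_cnt;
};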
- CurrInst.Init( num_of_features ); + CurrInst.Init( NumOfFeatures() ); myerr = m.myerr; mylog = m.mylog; } @@ -317,9 +317,6 @@ namespace Timbl { CurrInst.clear(); if ( !is_copy ){ delete InstanceBase; - delete Targets; - delete TargetStrings; - delete FeatureStrings; } else { if ( is_synced ){ @@ -329,9 +326,6 @@ namespace Timbl { InstanceBase->CleanPartition( false ); } } - for ( auto const& feat : Features ){ - delete feat; - } delete GlobalMetric; delete tester; delete decay; @@ -397,7 +391,7 @@ namespace Timbl { *myerr << "Error: " << out_line << endl; } } - err_count++; + ++err_cnt; } void MBLClass::FatalError( const string& out_line ) const { @@ -499,7 +493,7 @@ namespace Timbl { else { int OldPrec = os.precision(DBL_DIG); size_t pos = 0; - for ( auto const& feat : Features ){ + for ( auto const& feat : features.feats ){ os.precision(DBL_DIG); os << "Feature " << ++pos << "\t : " << feat->Weight() << endl; } @@ -508,35 +502,6 @@ namespace Timbl { return true; } - void MBLClass::calculatePermutation( const vector& W ){ - vector WR = W; - size_t IgnoredFeatures = 0; - permutation.resize(num_of_features); - for ( size_t j=0; j < num_of_features; ++j ){ - permutation[j] = j; - if ( Features[j]->Ignore() ){ - WR[j] = -0.1; // To be shure that they are placed AFTER - // those which are realy Zero - IgnoredFeatures++; - } - } - if ( IgnoredFeatures == num_of_features ){ - Error( "All features seem to be ignored! Nothing to do" ); - } - else { - for ( size_t k=0; k < num_of_features; ++k ){ - size_t Max = 0; - for ( size_t m=1; m < num_of_features; ++m ){ - if ( WR[m] > WR[Max] ){ - Max = m; - } - } - WR[Max] = -1; - permutation[k] = Max; - } - } - } - string MBLClass::extract_limited_m( size_t lim ){ default_order(); set_order(); @@ -545,44 +510,32 @@ namespace Timbl { result += TiCC::toString( gm ); set ignore; map> metrics; - // using TiCC::operator<<; - // cerr << "permutation: " << permutation << endl; - for ( size_t k=0; k < num_of_features; ++k ){ - if ( Features[permutation[k]]->Ignore() ){ + for ( size_t k=0; k < NumOfFeatures(); ++k ){ + if ( features[features.permutation[k]]->Ignore() ){ // cerr << "Add " << k+1 << " to ignore" << endl; ignore.insert(k+1); } else { - MetricType m = Features[permutation[k]]->getMetricType(); + MetricType m = features[features.permutation[k]]->getMetricType(); if ( m != gm ){ metrics[TiCC::toString( m )].insert(k+1); } } } - // cerr << "lim=" << lim << " so start ignoring at: " - // << lim + ignore.size()<< endl; - - for ( size_t i=lim+ignore.size(); i < num_of_features; ++i ){ - // cerr << "Add " << permutation[i]+1 << " to ignore" << endl; - ignore.insert( permutation[i]+1 ); + for ( size_t i=lim+ignore.size(); i < NumOfFeatures(); ++i ){ + ignore.insert( features.permutation[i]+1 ); } if ( !ignore.empty() ){ result += ":I"; - // using TiCC::operator<<; - // cerr << "IGNORE bevat:" << ignore << endl; for ( auto it = ignore.begin(); it != ignore.end(); ++it ){ size_t value = *it; - // cerr << "START it=" << *it << " value = " << value << endl; size_t steps = 0; for ( ; value <= *ignore.rbegin(); ++value ){ - // cerr << "value = " << value << endl; if ( ignore.find(value) == ignore.end() ){ break; } ++steps; } - // cerr << "END it=" << *it << " value = " << value << endl; - // cerr << "STEPS =" << steps << endl; if ( value == *it+1 ){ // so only one value, output it if ( *it != *ignore.begin() ){ @@ -599,7 +552,6 @@ namespace Timbl { else { // a range. 
output with a hyphen result += TiCC::toString(*it) + "-" + TiCC::toString( value-1) + ","; - // cerr << "advance it met " << steps-1 << endl; for ( size_t j=0; j < steps-1;++j){ ++it; if ( it == ignore.end() ){ @@ -638,11 +590,9 @@ namespace Timbl { void MBLClass::writePermutation( ostream& os ) const { os << "Feature Permutation based on " << ( Weighting==UserDefined_w?"weightfile":TiCC::toString(TreeOrder, true)) - << " :" << endl << "< "; - for ( size_t j=0; j < num_of_features-1; ++j ){ - os << permutation[j]+1 << ", "; - } - os << permutation[num_of_features-1]+1 << " >" << endl; + << " :" << endl; + features.write_permutation( os ); + os << endl; } void MBLClass::time_stamp( const char *line, int number ) const { @@ -663,7 +613,7 @@ namespace Timbl { } void MBLClass::InitWeights(void){ - for ( auto const& feat : Features ){ + for ( auto const& feat : features.feats ){ if ( feat->Ignore() ){ feat->SetWeight( 0.0 ); } @@ -701,7 +651,7 @@ namespace Timbl { void MBLClass::diverseWeights(void){ double minW = DBL_MAX; - for ( auto const& feat : Features ){ + for ( auto const& feat : features.feats ){ if ( feat->Ignore() ){ continue; } @@ -709,7 +659,7 @@ namespace Timbl { minW = feat->Weight(); } } - for ( auto const& feat : Features ){ + for ( auto const& feat : features.feats ){ if ( feat->Ignore() ){ continue; } @@ -751,15 +701,15 @@ namespace Timbl { void MBLClass::set_order(){ calculate_fv_entropy(false); - vector Order(num_of_features); + vector Order(NumOfFeatures()); size_t i = 0; - for ( auto const& feat : Features ){ + for ( auto const& feat : features.feats ){ switch( TreeOrder ){ case DataFile: Order[i] = feat->Weight(); break; case NoOrder: - Order[i] = (double)(num_of_features-i); + Order[i] = (double)(NumOfFeatures()-i); break; case IGOrder: Order[i] = feat->InfoGain(); @@ -811,25 +761,17 @@ namespace Timbl { } ++i; } - calculatePermutation( Order ); + features.calculate_permutation( Order ); if ( !Verbosity(SILENT) ){ writePermutation( *mylog ); } - for ( size_t j=0; j < num_of_features; ++j ){ - if ( j < effective_feats ){ - PermFeatures[j] = Features[permutation[j]]; - } - else { - PermFeatures[j] = NULL; - } - } } void MBLClass::MatrixInfo( ostream& os ) const { unsigned int TotalCount = 0; bool dummy; size_t m = 1; - for ( const auto& feat : Features ){ + for ( const auto& feat : features.feats ){ if ( !feat->Ignore() && feat->isStorableMetric() && feat->matrixPresent( dummy ) ){ @@ -880,13 +822,13 @@ namespace Timbl { line = TiCC::trim( line.substr( pos ) ); } if ( line.empty() ){ - if ( !Features[num-1]->isStorableMetric() ){ + if ( !features[num-1]->isStorableMetric() ){ Warning( "Ignoring entry for feature " + nums + " which is NOT set to a storable metric type." 
+ " use -m commandline option to set metrics" ); skip = true; } - else if ( !Features[num-1]->fill_matrix( is ) ){ + else if ( !features[num-1]->fill_matrix( is ) ){ return false; } else { @@ -906,7 +848,7 @@ namespace Timbl { bool MBLClass::writeMatrices( ostream& os ) const { size_t pos = 0; - for ( const auto& feat : Features ){ + for ( const auto& feat : features.feats ){ os << "Feature " << ++pos; bool dummy; if ( !feat->matrixPresent( dummy ) ){ @@ -942,14 +884,14 @@ namespace Timbl { " in file, " + TiCC::toString(index) + " expected" ); result = false; } - else if ( index > num_of_features ){ + else if ( index > NumOfFeatures() ){ Error( "Too many features matrices in this file " ); result = false; } else { is >> ws >> buf; if ( compare_nocase_n( "Ignored", buf ) ){ - if ( Features[index-1]->Ignore() ){ + if ( features[index-1]->Ignore() ){ ++index; continue; } @@ -960,7 +902,7 @@ namespace Timbl { } } else if ( compare_nocase_n( "Numeric", buf ) ){ - if ( Features[index-1]->isNumerical() ){ + if ( features[index-1]->isNumerical() ){ ++index; continue; } @@ -973,8 +915,8 @@ namespace Timbl { Error( "Problem in Probability file, missing matrix info" ); result = false; } - else if ( Features[index-1]->Ignore() || - Features[index-1]->isNumerical() ){ + else if ( features[index-1]->Ignore() || + features[index-1]->isNumerical() ){ Warning( "Matrix info found for feature #" + TiCC::toString(index) + " (skipped)" ); @@ -982,7 +924,7 @@ namespace Timbl { } else { is.ignore( std::numeric_limits::max(), '\n' ); - result = Features[index-1]->read_vc_pb_array( is ); + result = features[index-1]->read_vc_pb_array( is ); ++index; } } @@ -990,7 +932,7 @@ namespace Timbl { } } while ( result && !is.eof() && !is.bad() ); - if ( index < num_of_features+1 ){ + if ( index < NumOfFeatures()+1 ){ Error( "Not enough features matrices in this file " ); result = false; } @@ -1010,15 +952,15 @@ namespace Timbl { // Print the possible classes. // os << "Targets : "; - for ( const auto& it : Targets->values_array ){ + for ( const auto& it : targets.values_array ){ os << it; - if ( &it != &Targets->values_array.back() ){ + if ( &it != &targets.values_array.back() ){ os << ","; } } os << "." 
<< endl << endl; size_t pos = 0; - for ( const auto& feat : Features ){ + for ( const auto& feat : features.feats ){ os << "feature # " << ++pos ; if ( feat->Ignore() ){ os << " Ignored, (-s option)" << endl; @@ -1037,8 +979,8 @@ namespace Timbl { } bool MBLClass::allocate_arrays(){ - size_t Dim = Targets->values_array.size(); - for ( const auto& feat : Features ){ + size_t Dim = targets.values_array.size(); + for ( const auto& feat : features.feats ){ if ( !feat->Ignore() && !feat->isNumerical() ) { if ( !feat->AllocSparseArrays( Dim ) ){ @@ -1054,7 +996,7 @@ namespace Timbl { if ( !is_copy ){ result = allocate_arrays(); if ( result ){ - for ( const auto& feat : Features ){ + for ( const auto& feat : features.feats ){ if ( !feat->Ignore() && !feat->isNumerical() ){ feat->ClipFreq( (int)rint(clip_factor * @@ -1076,15 +1018,15 @@ namespace Timbl { */ void MBLClass::calculatePrestored(){ if ( !is_copy ){ - for ( size_t j = tribl_offset; j < effective_feats; ++j ) { - if ( !PermFeatures[j]->Ignore() && - PermFeatures[j]->isStorableMetric() ){ - PermFeatures[j]->store_matrix( mvd_threshold ); + for ( size_t j = tribl_offset; j < EffectiveFeatures(); ++j ) { + if ( !features.perm_feats[j]->Ignore() && + features.perm_feats[j]->isStorableMetric() ){ + features.perm_feats[j]->store_matrix( mvd_threshold ); } } if ( Verbosity(VD_MATRIX) ){ size_t pos = 0; - for ( auto const& feat : Features ){ + for ( auto const& feat : features.feats ){ ++pos; if ( !feat->Ignore() ){ bool dummy; @@ -1105,31 +1047,31 @@ namespace Timbl { const Instance *MBLClass::chopped_to_instance( PhaseValue phase ){ CurrInst.clear(); - if ( num_of_features != target_pos ) { + if ( NumOfFeatures() != target_pos ) { ChopInput->swapTarget( target_pos ); } int occ = ChopInput->getOcc(); if ( occ > 1 ){ CurrInst.Occurrences( occ ); } - // cerr << "to instance: Chopped input=" << ChopInput->getString() << endl; switch ( phase ){ case LearnWords: // Add the target. - CurrInst.TV = Targets->add_value( ChopInput->getField( num_of_features ), - occ ); + CurrInst.TV = targets.add_value( ChopInput->getField( NumOfFeatures() ), + occ ); // Now add the Feature values. - for ( size_t i = 0; i < num_of_features; ++i ){ + for ( size_t i = 0; i < NumOfFeatures(); ++i ){ // when learning, no need to bother about Permutation - if ( Features[i]->Ignore() ) { + if ( features[i]->Ignore() ) { // but this might happen, take care! CurrInst.FV[i] = NULL; } else { // Add it to the Instance. 
// cerr << "Feature add: " << ChopInput->getField(i) << endl; - CurrInst.FV[i] = Features[i]->add_value( ChopInput->getField(i), - CurrInst.TV, occ ); + CurrInst.FV[i] = features[i]->add_value( ChopInput->getField(i), + CurrInst.TV, occ ); + } } // i // cerr << "new instance: " << CurrInst << endl; @@ -1137,33 +1079,33 @@ namespace Timbl { case TrainWords: // Lookup for TreeBuilding // First the Features - for ( size_t k = 0; k < effective_feats; ++k ){ - size_t j = permutation[k]; - CurrInst.FV[k] = Features[j]->Lookup( ChopInput->getField(j) ); + for ( size_t k = 0; k < EffectiveFeatures(); ++k ){ + size_t j = features.permutation[k]; + CurrInst.FV[k] = features[j]->Lookup( ChopInput->getField(j) ); } // k // and the Target - CurrInst.TV = Targets->Lookup( ChopInput->getField( num_of_features ) ); + CurrInst.TV = targets.Lookup( ChopInput->getField( NumOfFeatures() ) ); break; case TrainLearnWords: // Lookup for Incremental TreeBuilding // Assumes that somehow Permutation and effective_feats are known // First the Target - CurrInst.TV = Targets->add_value( ChopInput->getField(num_of_features ), - occ ); + CurrInst.TV = targets.add_value( (*ChopInput)[NumOfFeatures()], occ ); // Then the Features - for ( size_t l = 0; l < effective_feats; ++l ){ - size_t j = permutation[l]; - CurrInst.FV[l] = Features[j]->add_value( ChopInput->getField(j), - CurrInst.TV, occ ); + for ( size_t l = 0; l < EffectiveFeatures(); ++l ){ + size_t j = features.permutation[l]; + CurrInst.FV[l] = features[j]->add_value((*ChopInput)[j], + CurrInst.TV, + occ ); } // for l break; case TestWords: // Lookup for Testing // This might fail for unknown values, then we create a dummy value - for ( size_t m = 0; m < effective_feats; ++m ){ - size_t j = permutation[m]; + for ( size_t m = 0; m < EffectiveFeatures(); ++m ){ + size_t j = features.permutation[m]; const UnicodeString& fld = ChopInput->getField(j); - CurrInst.FV[m] = Features[j]->Lookup( fld ); + CurrInst.FV[m] = features[j]->Lookup( fld ); if ( !CurrInst.FV[m] ){ // for "unknown" values have to add a dummy value CurrInst.FV[m] = new FeatureValue( fld ); @@ -1171,7 +1113,7 @@ namespace Timbl { } // i // the last string is the target - CurrInst.TV = Targets->Lookup( ChopInput->getField(num_of_features) ); + CurrInst.TV = targets.Lookup( ChopInput->getField(NumOfFeatures()) ); break; default: FatalError( "Wrong value in Switch: " @@ -1213,13 +1155,13 @@ namespace Timbl { os.setf(ios::showpoint ); int OldPrec = os.precision(8); os << "DB Entropy : " << DBEntropy << endl; - os << "Number of Classes : " << Targets->EffectiveValues() << endl; + os << "Number of Classes : " << targets.EffectiveValues() << endl; os << endl; if ( Verbosity(FEAT_W) ){ if ( CurrentWeighting() == SD_w ){ os << "Feats\tVals\tStandard Deviation" << endl; size_t pos = 0; - for ( const auto& feat : Features ){ + for ( const auto& feat : features.feats ){ os << setw(5) << ++pos; os.setf(ios::right, ios::adjustfield); if ( feat->Ignore() ){ @@ -1241,7 +1183,7 @@ namespace Timbl { else if ( need_all_weights ){ os << "Feats\tVals\tX-square\tVariance\tInfoGain\tGainRatio" << endl; size_t pos = 0; - for ( const auto& feat : Features ) { + for ( const auto& feat : features.feats ) { os << setw(5) << ++pos; os.setf(ios::right, ios::adjustfield); if ( feat->Ignore() ){ @@ -1266,7 +1208,7 @@ namespace Timbl { else { os << "Feats\tVals\tInfoGain\tGainRatio" << endl; size_t pos = 0; - for ( const auto& feat : Features ) { + for ( const auto& feat : features.feats ) { os << setw(5) << ++pos; 
os.setf(ios::right, ios::adjustfield); if ( feat->Ignore() ){ @@ -1293,20 +1235,20 @@ namespace Timbl { bool MBLClass::writeWeights( ostream& os ) const { bool result = false; if ( !ExpInvalid() ){ - if ( Features[0] == NULL ){ + if ( features[0] == NULL ){ Warning( "unable to save Weights, nothing learned yet" ); } else { os << "# DB Entropy: " << DBEntropy << endl; - os << "# Classes: " << Targets->values_array.size() << endl; - os << "# Lines of data: " << Targets->TotalValues() << endl; + os << "# Classes: " << targets.values_array.size() << endl; + os << "# Lines of data: " << targets.TotalValues() << endl; int OldPrec = os.precision(DBL_DIG); if ( CurrentWeighting() == SD_w ){ os << "#" << endl; os << "# " << TiCC::toString( SD_w ) << endl; os << "# Fea." << "\t" << "Weight" << endl; size_t pos = 0; - for ( const auto& feat : Features ){ + for ( const auto& feat : features.feats ){ os.precision(DBL_DIG); os << ++pos << "\t"; if ( feat->Ignore() ){ @@ -1322,7 +1264,7 @@ namespace Timbl { os << "# " << TiCC::toString( No_w ) << endl; os << "# Fea." << "\t" << "Weight" << endl; size_t pos = 0; - for ( const auto& feat : Features ){ + for ( const auto& feat : features.feats ){ os.precision(DBL_DIG); os << ++pos << "\t"; if ( feat->Ignore() ){ @@ -1336,7 +1278,7 @@ namespace Timbl { os << "# " << TiCC::toString( GR_w ) << endl; os << "# Fea." << "\t" << "Weight" << endl; pos = 0; - for ( const auto& feat : Features ){ + for ( const auto& feat : features.feats ){ os.precision(DBL_DIG); os << ++pos << "\t"; if ( feat->Ignore() ){ @@ -1350,7 +1292,7 @@ namespace Timbl { os << "# " << TiCC::toString( IG_w ) << endl; os << "# Fea." << "\t" << "Weight" << endl; pos = 0; - for ( const auto& feat : Features ){ + for ( const auto& feat : features.feats ){ os.precision(DBL_DIG); os << ++pos << "\t"; if ( feat->Ignore() ){ @@ -1365,7 +1307,7 @@ namespace Timbl { os << "# " << TiCC::toString( SV_w ) << endl; os << "# Fea." << "\t" << "Weight" << endl; pos = 0; - for ( const auto& feat : Features ){ + for ( const auto& feat : features.feats ){ os.precision(DBL_DIG); os << ++pos << "\t"; if ( feat->Ignore() ){ @@ -1379,7 +1321,7 @@ namespace Timbl { os << "# " << TiCC::toString( X2_w ) << endl; os << "# Fea." 
<< "\t" << "Weight" << endl; pos = 0; - for ( const auto& feat : Features ){ + for ( const auto& feat : features.feats ){ os.precision(DBL_DIG); os << ++pos << "\t"; if ( feat->Ignore() ){ @@ -1400,10 +1342,7 @@ namespace Timbl { } bool MBLClass::read_the_vals( istream& is ){ - bool *done = new bool[num_of_features]; - for ( size_t i=0; i < num_of_features; ++i ){ - done[i] = false; - } + vector done( NumOfFeatures(), false );; string Buffer; while ( getline( is, Buffer) ){ if ( !Buffer.empty() ){ @@ -1418,9 +1357,9 @@ namespace Timbl { vector vals = TiCC::split( Buffer ); if ( vals.size() == 2 ){ size_t i_f = TiCC::stringTo( vals[0] ); - if ( i_f > num_of_features ){ + if ( i_f > NumOfFeatures() ){ Error( "in weightsfile, Feature index > Maximum, (" + - TiCC::toString(num_of_features) + ")" ); + TiCC::toString(NumOfFeatures()) + ")" ); } else if ( done[i_f-1] ){ Error( "in weightsfile, Feature index " + vals[0] + @@ -1435,8 +1374,8 @@ namespace Timbl { " has illegal value: " + vals[1] ); } else { - Features[i_f-1]->SetWeight( w ); - if ( Features[i_f-1]->Ignore() ){ + features[i_f-1]->SetWeight( w ); + if ( features[i_f-1]->Ignore() ){ Warning( "in weightsfile, " "Feature " + vals[0] + " has value: " + TiCC::toString( w ) + @@ -1445,8 +1384,8 @@ namespace Timbl { } } else { - Features[i_f-1]->SetWeight( 0.0 ); - if ( !Features[i_f-1]->Ignore() ){ + features[i_f-1]->SetWeight( 0.0 ); + if ( !features[i_f-1]->Ignore() ){ Warning( "in weightsfile, Feature " + vals[0] + " has value: 'Ignore', we will use: 0.0 " ); } @@ -1456,14 +1395,13 @@ namespace Timbl { } } bool result = true; - for ( size_t j=0; j < num_of_features; ++j ){ + for ( size_t j=0; j < NumOfFeatures(); ++j ){ if ( !done[j] ) { Error( "in weightsfile, Feature index " + TiCC::toString(j+1) + " is not mentioned" ); result = false; } } - delete [] done; return result; } @@ -1533,7 +1471,7 @@ namespace Timbl { } // make shure all weights are correct // Paranoid? - for ( const auto& feat : Features ){ + for ( const auto& feat : features.feats ){ feat->InfoGain( feat->Weight() ); feat->GainRatio( feat->Weight() ); feat->ChiSquare( feat->Weight() ); @@ -1545,37 +1483,20 @@ namespace Timbl { return true; } - void MBLClass::calculate_fv_entropy( bool always ){ - bool realy_first = DBEntropy < 0.0; - if ( always || realy_first ){ - // if it's the first time (DBEntropy == 0 ) or - // if always, we have to (re)calculate everything - double Entropy = 0.0; - // first get the Database Entropy - size_t totval = Targets->TotalValues(); - for ( const auto& it : Targets->values_array ){ - double Ratio = it->ValFreq() / (double)totval; - if ( Ratio > 0 ){ - Entropy += Ratio * Log2(Ratio); - } - } - DBEntropy = fabs(-Entropy); - allocate_arrays(); // create ValueClassProb arrays.. - } - // Loop over the Features, see if the numerics are non-singular - // and do the statistics for those features where the metric is changed. 
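// DBEntropy, computed in this hunk, is the usual Shannon entropy over the target-class
// frequencies: -sum( p_i * log2(p_i) ). A small self-contained sketch of that calculation,
// assuming plain integer class counts (the counts in the comment are illustrative only):
#include <cmath>
#include <cstddef>
#include <vector>
double class_entropy( const std::vector<std::size_t>& counts ){
  std::size_t total = 0;
  for ( auto c : counts ) total += c;
  double entropy = 0.0;
  for ( auto c : counts ){
    if ( c == 0 ) continue;                  // empty classes contribute nothing
    double ratio = c / (double)total;
    entropy += ratio * std::log2( ratio );   // Ratio * Log2(Ratio), as in the code here
  }
  return std::fabs( -entropy );              // e.g. counts {50,25,25} give 1.5 bits
}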
- vector feat_status(num_of_features); - bool nothing_changed = true; - for ( size_t g = 0; g < num_of_features; ++g ) { + bool MBLClass::recalculate_stats( Feature_List& feats, + vector& feat_status, + bool check_change ){ + bool changed = false; + for ( size_t g = 0; g < NumOfFeatures(); ++g ) { feat_status[g] = Unknown; - if ( Features[g]->Ignore() ){ + if ( feats.feats[g]->Ignore() ){ continue; } bool metricChanged = false; MetricType TmpMetricType = UserOptions[g+1]; metricClass *tmpMetric = getMetricClass( TmpMetricType ); if ( tmpMetric->isNumerical() ){ - feat_status[g] = Features[g]->prepare_numeric_stats(); + feat_status[g] = feats[g]->prepare_numeric_stats(); if ( feat_status[g] == SingletonNumeric && input_format == SparseBin && GlobalMetric->isSimilarityMetric( ) ){ @@ -1590,60 +1511,88 @@ namespace Timbl { } } } - else if ( Features[g]->values_array.size() == 1 ){ + else if ( feats[g]->values_array.size() == 1 ){ feat_status[g] = Singleton; } delete tmpMetric; - if ( always || realy_first ){ + if ( check_change ){ bool isRead; - if ( Features[g]->metric && - Features[g]->getMetricType() != TmpMetricType && - Features[g]->isStorableMetric() && - Features[g]->matrixPresent( isRead ) && + if ( feats.feats[g]->metric && + feats.feats[g]->getMetricType() != TmpMetricType && + feats.feats[g]->isStorableMetric() && + feats.feats[g]->matrixPresent( isRead ) && isRead ){ - Error( "The metric " + TiCC::toString(Features[g]->getMetricType()) + + Error( "The metric " + TiCC::toString(feats.feats[g]->getMetricType()) + " for feature " + TiCC::toString( g+1 ) + " is set from a file. It cannot be changed!" ); - return; + abort(); } - metricChanged = !Features[g]->setMetricType(TmpMetricType); + metricChanged = !feats.feats[g]->setMetricType(TmpMetricType); } if ( metricChanged ){ - nothing_changed = false; + changed = true; } } // end g + return changed; + } + + + void MBLClass::calculate_fv_entropy( bool always ){ + bool realy_first = DBEntropy < 0.0; + bool redo = always || realy_first; + if ( redo ){ + // if it's the first time (DBEntropy == 0 ) or + // if always, we have to (re)calculate everything + double Entropy = 0.0; + // first get the Database Entropy + size_t totval = targets.TotalValues(); + for ( const auto& it : targets.values_array ){ + double Ratio = it->ValFreq() / (double)totval; + if ( Ratio > 0 ){ + Entropy += Ratio * Log2(Ratio); + } + } + DBEntropy = fabs(-Entropy); + allocate_arrays(); // create ValueClassProb arrays.. + } + // Loop over the Features, see if the numerics are non-singular + // and do the statistics for those features where the metric is changed. + vector feat_status(NumOfFeatures()); + bool changed = recalculate_stats( features, + feat_status, + redo ); if ( ( CurrentWeighting() == SD_w || GlobalMetric->isSimilarityMetric() ) - && !nothing_changed ){ + && changed ){ // check to see if ALL features are still Numeric. // otherwise we can't do Standard Deviation weighting, // or Similarity Metrics! 
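// The reporting loops below collapse runs of consecutive feature indices into "a-b" ranges
// when building the warning strings (e.g. features 2, 3, 4 and 7 become "2-4, 7"). A
// self-contained sketch of just that formatting step, using 1-based indices as in the
// messages (the helper name and the flag vector are made up for illustration):
#include <cstddef>
#include <string>
#include <vector>
std::string compress_indices( const std::vector<bool>& flagged ){
  std::string result;
  for ( std::size_t ff = 0; ff < flagged.size(); ++ff ){
    if ( !flagged[ff] ) continue;
    if ( !result.empty() ) result += ", ";
    std::size_t n = ff;
    while ( ff < flagged.size()-1 && flagged[ff+1] ) ++ff;   // extend the run of flagged indices
    if ( n != ff ){
      result += std::to_string(n+1) + "-" + std::to_string(ff+1);
    }
    else {
      result += std::to_string(ff+1);
    }
  }
  return result;   // {false,true,true,true,false,false,true} -> "2-4, 7"
}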
bool first = true; - ostringstream ostr1; - for ( size_t ff = 0; ff < num_of_features; ++ff ){ + string str1; + for ( size_t ff = 0; ff < NumOfFeatures(); ++ff ){ if ( feat_status[ff] == NotNumeric ){ if ( first ){ - ostr1 << "The following feature(s) have non numeric value: "; + str1 += "The following feature(s) have non numeric value: "; first = false; } else { - ostr1 << ", "; + str1 += ", "; } size_t n = ff; - while ( ff < num_of_features-1 && + while ( ff < NumOfFeatures()-1 && feat_status[ff+1] == NotNumeric ){ ++ff; } if ( n != ff ){ - ostr1 << n+1 << "-" << ff+1; + str1 += to_string(n+1) + "-" + to_string(ff+1); } else { - ostr1 << ff+1; + str1 += to_string(ff+1); } } } if ( !first ){ - Error( ostr1.str() ); + Error( str1 ); if ( GlobalMetric->isSimilarityMetric() ){ Error( "Therefore InnerProduct/Cosine operations are impossible" ); } @@ -1657,72 +1606,73 @@ // a result of a forced recalculation if ( realy_first ){ bool first = true; - ostringstream ostr1; - ostringstream ostr2; - for ( size_t ff = 0; ff < num_of_features; ++ff ) { + string str1; + for ( size_t ff = 0; ff < NumOfFeatures(); ++ff ) { if ( feat_status[ff] == Singleton || feat_status[ff] == SingletonNumeric ){ if ( first ){ - ostr1 << "The following feature(s) have only 1 value: "; + str1 += "The following feature(s) have only 1 value: "; first = false; } else { - ostr1 << ", "; + str1 += ", "; } size_t n = ff; - while ( ff < num_of_features-1 && + while ( ff < NumOfFeatures()-1 && ( feat_status[ff+1] == Singleton || feat_status[ff+1] == SingletonNumeric ) ){ ++ff; } if ( n != ff ){ - ostr1 << n+1 << "-" << ff+1; + str1 += to_string(n+1) + "-" + to_string(ff+1); } else { - ostr1 << ff+1; + str1 += to_string(ff+1); } } } if ( !first && !is_copy ){ - Warning( ostr1.str() ); + Warning( str1 ); } + string str2; first = true; - for ( size_t ff = 0; ff < num_of_features; ++ff ){ + for ( size_t ff = 0; ff < NumOfFeatures(); ++ff ){ if ( feat_status[ff] == NotNumeric ){ if ( first ){ - ostr2 << "The following feature(s) contained non-numeric values and\nwill be treated as NON-Numeric: "; + str2 += "The following feature(s) contained non-numeric values and" + "\nwill be treated as NON-Numeric: "; first = false; } else { - ostr2 << ", "; + str2 += ", "; } size_t n = ff; - while ( ff < num_of_features-1 && + while ( ff < NumOfFeatures()-1 && feat_status[ff+1] == NotNumeric ) ff++; if ( n != ff ){ - ostr2 << n+1 << "-" << ff+1; + str2 += to_string(n+1) + "-" + to_string(ff+1); } else { - ostr2 << ff+1; + str2 += to_string(ff+1); } } } if ( !first ){ - Warning( ostr2.str() ); + Warning( str2 ); } } - if ( always || realy_first ){ - for ( const auto& feat : Features ){ + if ( redo ){ + for ( const auto& feat : features.feats ){ if ( Weighting != UserDefined_w ){ if ( CurrentWeighting() == SD_w ){ feat->StandardDeviationStatistics( ); } else if ( feat->isNumerical() ){ - feat->NumStatistics( DBEntropy, Targets, Bin_Size, + feat->NumStatistics( DBEntropy, targets, Bin_Size, need_all_weights ); } else { - feat->Statistics( DBEntropy, Targets, need_all_weights ); + feat->Statistics( DBEntropy, targets, need_all_weights ); } } } @@ -1737,15 +1687,15 @@ else { // Print the possible classes. // - for ( const auto& it : Targets->values_array ){ + for ( const auto& it : targets.values_array ){ os << it; - if ( &it != &Targets->values_array.back() ){ + if ( &it != &targets.values_array.back() ){ os << ","; } } os << "."
<< endl << endl; size_t pos = 0; - for ( auto const& feat : Features ){ + for ( auto const& feat : features.feats ){ os << "a" << ++pos << ": "; if ( feat->Ignore() ){ os << "Ignore" << endl; @@ -1771,7 +1721,7 @@ namespace Timbl { bool MBLClass::Chop( const UnicodeString& line ) { try { - return ChopInput->chop( line, num_of_features ); + return ChopInput->chop( line, NumOfFeatures() ); } catch ( const exception& e ){ Warning( e.what() ); @@ -1792,8 +1742,8 @@ namespace Timbl { return false; } - const ValueDistribution *MBLClass::ExactMatch( const Instance& inst ) const { - const ValueDistribution *result = NULL; + const ClassDistribution *MBLClass::ExactMatch( const Instance& inst ) const { + const ClassDistribution *result = NULL; if ( !GlobalMetric->isSimilarityMetric() && ( do_exact_match || ( num_of_neighbors == 1 && @@ -1807,12 +1757,12 @@ namespace Timbl { return nSet.bestDistance(); } - WValueDistribution *MBLClass::getBestDistribution( unsigned int k ){ + WClassDistribution *MBLClass::getBestDistribution( unsigned int k ){ return nSet.bestDistribution( decay, k ); } UnicodeString MBLClass::formatInstance( const vector& OrgFV, - vector& RedFV, + const vector& RedFV, size_t OffSet, size_t Size ) const { UnicodeString result; @@ -1823,61 +1773,60 @@ namespace Timbl { for ( size_t j=OffSet; j< Size; ++j ){ inst.FV[j] = RedFV[j-OffSet]; } - size_t *InvPerm = new size_t[num_of_features]; - for ( size_t i=0; i< num_of_features; ++i ){ - InvPerm[permutation[i]] = i; + vector InvPerm(NumOfFeatures(),0); + for ( size_t i=0; i< NumOfFeatures(); ++i ){ + InvPerm[features.permutation[i]] = i; } - for ( size_t j=0; j< num_of_features; ++j ){ + for ( size_t j=0; j< NumOfFeatures(); ++j ){ switch ( input_format ) { case C4_5: // fall through case ARFF: - if ( Features[j]->Ignore() ){ + if ( features[j]->Ignore() ){ result += "-*-,"; } else { - result += inst.FV[InvPerm[j]]->name_u() + ","; + result += inst.FV[InvPerm[j]]->name() + ","; } break; case Sparse: - if ( inst.FV[InvPerm[j]]->name_u() != DefaultSparseString ){ + if ( inst.FV[InvPerm[j]]->name() != DefaultSparseString ){ result += "(" + TiCC::toUnicodeString(j+1) + "," - + CodeToStr( inst.FV[InvPerm[j]]->name_u() ) + + CodeToStr( inst.FV[InvPerm[j]]->name() ) + ")"; } break; case SparseBin: - if ( inst.FV[InvPerm[j]]->name_u()[0] == '1' ){ + if ( inst.FV[InvPerm[j]]->name()[0] == '1' ){ result += TiCC::toUnicodeString( j+1 ) + ","; } break; case Columns: - if ( Features[j]->Ignore() ){ + if ( features[j]->Ignore() ){ result += "-*- "; } else { - result += inst.FV[InvPerm[j]]->name_u() + " "; + result += inst.FV[InvPerm[j]]->name() + " "; } break; case Tabbed: - if ( Features[j]->Ignore() ){ + if ( features[j]->Ignore() ){ result += "-*- "; } else { - result += inst.FV[InvPerm[j]]->name_u() + "\t"; + result += inst.FV[InvPerm[j]]->name() + "\t"; } break; default: - if ( Features[j]->Ignore() ){ + if ( features[j]->Ignore() ){ result += UnicodeString( F_length, '*', F_length ); } else { - result += inst.FV[InvPerm[j]]->name_u(); + result += inst.FV[InvPerm[j]]->name(); } break; } } - delete [] InvPerm; return result; } @@ -1889,19 +1838,19 @@ namespace Timbl { void MBLClass::test_instance_ex( const Instance& Inst, InstanceBase_base *IB, size_t ib_offset ){ - vector CurrentFV(num_of_features); - const ValueDistribution *best_distrib = IB->InitGraphTest( CurrentFV, + vector CurrentFV(NumOfFeatures()); + const ClassDistribution *best_distrib = IB->InitGraphTest( CurrentFV, &Inst.FV, ib_offset, - effective_feats ); + EffectiveFeatures() ); if 
( !best_distrib ){ // no use to do more work then return; } - tester->init( Inst, effective_feats, ib_offset ); + tester->init( Inst, EffectiveFeatures(), ib_offset ); auto lastpos = best_distrib->begin(); Vfield *Bpnt = lastpos->second; - size_t EffFeat = effective_feats - ib_offset; + size_t EffFeat = EffectiveFeatures() - ib_offset; size_t CurPos = 0; while ( Bpnt ) { // call test() with a maximum threshold, to prevent stepping out early @@ -1911,13 +1860,13 @@ namespace Timbl { if ( EndPos != EffFeat ){ throw( logic_error( "Exemplar testing: test should not stop before last feature" ) ); } - ValueDistribution ResultDist; + ClassDistribution ResultDist; ResultDist.SetFreq( Bpnt->Value(), Bpnt->Freq() ); UnicodeString origI; if ( Verbosity(NEAR_N) ){ origI = formatInstance( Inst.FV, CurrentFV, ib_offset, - num_of_features ); + NumOfFeatures() ); } double Distance = WeightFun( tester->getDistance(EndPos), Bpnt->Weight() ); @@ -1966,29 +1915,26 @@ namespace Timbl { delete GlobalMetric; GlobalMetric = getMetricClass( globalMetricOption ); delete tester; - tester = getTester( globalMetricOption, Features, permutation, mvd_threshold ); + tester = getTester( globalMetricOption, + features, mvd_threshold ); } void MBLClass::test_instance( const Instance& Inst, InstanceBase_base *IB, size_t ib_offset ){ - vector CurrentFV(num_of_features); + vector CurrentFV(NumOfFeatures()); double Threshold = DBL_MAX; - size_t EffFeat = effective_feats - ib_offset; - const ValueDistribution *best_distrib = IB->InitGraphTest( CurrentFV, + size_t EffFeat = EffectiveFeatures() - ib_offset; + const ClassDistribution *best_distrib = IB->InitGraphTest( CurrentFV, &Inst.FV, ib_offset, - effective_feats ); - tester->init( Inst, effective_feats, ib_offset ); - // cerr << "start test Instance = " << &Inst << " met " << TiCC::toString(CurrentFV) << endl; - // cerr << "BA at start = " << bestArray << endl; + EffectiveFeatures() ); + tester->init( Inst, EffectiveFeatures(), ib_offset ); size_t CurPos = 0; while ( best_distrib ){ - // cerr << "test:" << TiCC::toString(CurrentFV) << endl; size_t EndPos = tester->test( CurrentFV, CurPos, Threshold + Epsilon ); - // cerr << "EndPos = " << EndPos << endl; if ( EndPos == EffFeat ){ // we finished with a certain amount of succes double Distance = tester->getDistance(EndPos); @@ -1997,11 +1943,9 @@ namespace Timbl { if ( Verbosity(NEAR_N) ){ origI = formatInstance( Inst.FV, CurrentFV, ib_offset, - num_of_features ); + NumOfFeatures() ); } - // cerr << "Ok add " << best_distrib << "at distance " << Distance << endl; Threshold = bestArray.addResult( Distance, best_distrib, origI ); - // cerr << "BA = " << bestArray << endl; if ( do_silly_testing ){ Threshold = DBL_MAX; } @@ -2012,18 +1956,15 @@ namespace Timbl { } } else { - EndPos++; // out of luck, compensate for roll-back + ++EndPos; // out of luck, compensate for roll-back } size_t pos=EndPos-1; - // cerr << "start rollback " << pos << endl; while ( true ){ - // cerr << "rollback " << pos << endl; + // rollback if ( tester->getDistance(pos) <= Threshold ){ CurPos = pos; - // cerr << "voor next test " << endl; best_distrib = IB->NextGraphTest( CurrentFV, CurPos ); - // cerr << "na next test, curpos=" << CurPos << "-" << TiCC::toString(CurrentFV) << endl; break; } if ( pos == 0 ){ @@ -2032,19 +1973,18 @@ namespace Timbl { --pos; } } - // cerr << "BA at end = " << bestArray << endl; } void MBLClass::test_instance_sim( const Instance& Inst, InstanceBase_base *IB, size_t ib_offset ){ - vector CurrentFV(num_of_features); - size_t 
EffFeat = effective_feats - ib_offset; - const ValueDistribution *best_distrib = IB->InitGraphTest( CurrentFV, + vector CurrentFV(NumOfFeatures()); + size_t EffFeat = EffectiveFeatures() - ib_offset; + const ClassDistribution *best_distrib = IB->InitGraphTest( CurrentFV, &Inst.FV, ib_offset, - effective_feats ); - tester->init( Inst, effective_feats, ib_offset ); + EffectiveFeatures() ); + tester->init( Inst, EffectiveFeatures(), ib_offset ); while ( best_distrib ){ double dummy_t = -1.0; size_t dummy_p = 0; @@ -2061,7 +2001,7 @@ namespace Timbl { if ( Verbosity(NEAR_N) ){ origI = formatInstance( Inst.FV, CurrentFV, ib_offset, - num_of_features ); + NumOfFeatures() ); } bestArray.addResult( Distance, best_distrib, origI ); } @@ -2104,7 +2044,7 @@ namespace Timbl { const InputFormatType IF ) const { size_t result = 0; if ( IF == Sparse || IF == SparseBin ){ - return num_of_features; + return NumOfFeatures(); } else { try { @@ -2126,7 +2066,7 @@ namespace Timbl { } size_t MBLClass::examineData( const string& FileName ){ - // Looks at the data files, counts num_of_features. + // Looks at the data files, counts number of features. // and sets input_format variables. // size_t NumF = 0; @@ -2237,52 +2177,26 @@ namespace Timbl { return NumF; } - void MBLClass::Initialize( size_t n ){ - if ( n > 0 ) { - num_of_features = n; - } + void MBLClass::Initialize( size_t numF ){ // Allocate memory. Will be reused again and again .... // if ( target_pos == std::numeric_limits::max() ){ - target_pos = num_of_features; // the default + target_pos = numF; // the default } - else if ( target_pos > num_of_features ){ + else if ( target_pos > numF ){ FatalError( "Initialize: TARGET_POS cannot exceed NUM_OF_FEATURES+1 " + - TiCC::toString( num_of_features+1 ) ); - } - Features.resize(num_of_features,NULL); - PermFeatures.resize(num_of_features,NULL); - FeatureStrings = new Hash::UnicodeHash(); // all features share the same hash - TargetStrings = new Hash::UnicodeHash(); // targets has it's own hash - Targets = new Target( TargetStrings ); - for ( size_t i=0; i< num_of_features; ++i ){ - Features[i] = new Feature( FeatureStrings ); - PermFeatures[i] = NULL; - } - CurrInst.Init( num_of_features ); - effective_feats = num_of_features; - num_of_num_features = 0; + TiCC::toString( numF+1 ) ); + } + targets.init(); + features.init( numF, UserOptions ); + CurrInst.Init( numF ); delete GlobalMetric; GlobalMetric = getMetricClass( globalMetricOption ); - // the user thinks about features running from 1 to Num - // we know better, so shift the UserOptions one down. 
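// Initialize() above now delegates the per-feature setup to targets.init() and
// features.init( numF, UserOptions ); the loop being removed below is what that init step
// presumably replaces. The real Feature_List implementation lives in the new Features.cxx
// (not shown in this hunk), so the following is only a stand-in sketch with simplified
// placeholder types, not the actual classes:
#include <cstddef>
#include <vector>
enum class MiniMetric { Ignore, Overlap, Numeric };     // placeholder for MetricType
struct MiniFeatureList {
  std::vector<MiniMetric> metrics;                      // placeholder for the Feature objects
  std::size_t effective = 0;
  std::size_t numeric = 0;
  // user_options is 1-based, as in the original loop ("shift the UserOptions one down"),
  // so it is expected to hold num_feats+1 entries
  void init( std::size_t num_feats, const std::vector<MiniMetric>& user_options ){
    metrics.resize( num_feats );
    effective = num_feats;
    for ( std::size_t j = 0; j < num_feats; ++j ){
      MiniMetric m = user_options[j+1];
      metrics[j] = m;
      if ( m == MiniMetric::Ignore ){
        --effective;                                    // ignored features don't count
      }
      else if ( m == MiniMetric::Numeric ){
        ++numeric;                                      // keep track of numeric features
      }
    }
  }
};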
- for ( size_t j = 0; j < num_of_features; ++j ){ - MetricType m = UserOptions[j+1]; - if ( m == Ignore ){ - Features[j]->Ignore( true ); - effective_feats--; - } - else { - Features[j]->setMetricType( m ); - if ( Features[j]->isNumerical() ){ - num_of_num_features++; - } - } - } Options.FreezeTable(); if ( Weighting > IG_w || - TreeOrder >= X2Order ) + TreeOrder >= X2Order ){ need_all_weights = true; + } } } // namespace diff --git a/src/Makefile.am b/src/Makefile.am index d087690..e62858f 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -13,10 +13,11 @@ LDADD = libtimbl.la timbl_SOURCES = Timbl.cxx lib_LTLIBRARIES = libtimbl.la -libtimbl_la_LDFLAGS= -version-info 6:0:0 +libtimbl_la_LDFLAGS= -version-info 7:0:0 libtimbl_la_SOURCES = Common.cxx \ - GetOptClass.cxx IBtree.cxx IBprocs.cxx Instance.cxx \ + GetOptClass.cxx IBtree.cxx IBprocs.cxx \ + Targets.cxx Features.cxx Instance.cxx \ MBLClass.cxx MsgClass.cxx \ StringOps.cxx TimblAPI.cxx Choppers.cxx\ TimblExperiment.cxx IGExperiment.cxx Metrics.cxx Testers.cxx \ diff --git a/src/Metrics.cxx b/src/Metrics.cxx index 2e1b6eb..87b18ae 100644 --- a/src/Metrics.cxx +++ b/src/Metrics.cxx @@ -27,17 +27,14 @@ #include #include #include -#include +#include #include -#include -#include -#include #include "timbl/Common.h" -#include "timbl/MsgClass.h" #include "timbl/Types.h" #include "timbl/Instance.h" #include "timbl/Metrics.h" +#include "unicode/schriter.h" using namespace std; using Common::Epsilon; @@ -47,7 +44,8 @@ using Common::Log2; namespace Timbl{ - double lv_distance( const string& source, const string& target ){ + double lv_distance( const icu::UnicodeString& source, + const icu::UnicodeString& target ){ // code taken from: http://www.merriampark.com/ldcpp.htm // Levenshtein Distance Algorithm: C++ Implementation // by Anders Sewerin Johansen @@ -113,7 +111,8 @@ namespace Timbl{ return (double)matrix[n][m]; } - double dc_distance( const string& string1, const string& string2 ){ + double dc_distance( const icu::UnicodeString& string1, + const icu::UnicodeString& string2 ){ // code taken from: // http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Dice's_coefficient unsigned int ls1 = string1.length(); @@ -123,14 +122,18 @@ namespace Timbl{ int total = 0; if ( ls1 <= 1 || ls2 <= 1 ){ // back-off naar unigrammen - set string1_unigrams; - set string2_unigrams; + set string1_unigrams; + set string2_unigrams; - for ( const auto& c : string1 ){ - string1_unigrams.insert(c); + icu::StringCharacterIterator it1(string1); + while ( it1.hasNext() ){ + string1_unigrams.insert(it1.current32()); + it1.next32(); } - for ( const auto& c : string2 ){ - string2_unigrams.insert(c); + icu::StringCharacterIterator it2(string2); + while ( it2.hasNext() ){ + string2_unigrams.insert(it2.current32()); + it2.next32(); } for ( const auto& ug : string2_unigrams ){ @@ -141,16 +144,16 @@ namespace Timbl{ total = string1_unigrams.size() + string2_unigrams.size(); } else { - set string1_bigrams; - set string2_bigrams; + set string1_bigrams; + set string2_bigrams; for ( unsigned int i = 0; i < (ls1 - 1); ++i ) { // extract character bigrams from string1 - string1_bigrams.insert(string1.substr(i, 2)); + string1_bigrams.insert( icu::UnicodeString( string1, i, 2 ) ); } for ( unsigned int i = 0; i < (ls2 - 1); ++i ) { // extract character bigrams from string2 - string2_bigrams.insert(string2.substr(i, 2)); + string2_bigrams.insert( icu::UnicodeString( string2, i, 2 ) ); } for ( const auto& bg : string2_bigrams ){ @@ -340,7 +343,7 @@ namespace Timbl{ inline bool 
FV_to_real( FeatureValue *FV, double &result ){ if ( FV ){ - if ( TiCC::stringTo( FV->name_u(), result ) ){ + if ( TiCC::stringTo( FV->name(), result ) ){ return true; } } @@ -393,7 +396,7 @@ namespace Timbl{ size_t, double) const { double result = 0.0; if ( G != F ){ - result = lv_distance( F->Name(), G->Name() ); + result = lv_distance( F->name(), G->name() ); } return result; } @@ -402,7 +405,7 @@ namespace Timbl{ size_t, double ) const { double result = 0.0; if ( G != F ){ - result = dc_distance( F->Name(), G->Name() ); + result = dc_distance( F->name(), G->name() ); } return result; } diff --git a/src/MsgClass.cxx b/src/MsgClass.cxx index 0b5cef7..479e9f1 100644 --- a/src/MsgClass.cxx +++ b/src/MsgClass.cxx @@ -45,6 +45,7 @@ namespace Timbl { } void MsgClass::Error( const string& out_line ) const { + ++err_cnt; cerr << "Error:" << out_line << endl; } diff --git a/src/Statistics.cxx b/src/Statistics.cxx index 20c8adb..0d04c53 100644 --- a/src/Statistics.cxx +++ b/src/Statistics.cxx @@ -79,10 +79,11 @@ namespace Timbl { } } - void ConfusionMatrix::Print( ostream& os, const Target *tg ) const { + void ConfusionMatrix::Print( ostream& os, + const Targets& targets ) const { os << "Confusion Matrix:" << endl; os << " "; - for ( const auto& val : tg->values_array ){ + for ( const auto& val : targets.values_array ){ // Print the class names. os.width(6); os.setf(ios::right, ios::adjustfield); @@ -94,17 +95,17 @@ namespace Timbl { os << "-------"; } os << endl; - for ( unsigned int i=0; i < tg->values_array.size(); ++i ){ + for ( unsigned int i=0; i < targets.values_array.size(); ++i ){ os.width(6); os.setf(ios::right, ios::adjustfield); - os << tg->values_array[i] << " | "; + os << targets.values_array[i] << " | "; for ( const auto& mv : mat[i] ){ os.width(6); os.setf(ios::right, ios::adjustfield); os << mv << " "; } os << endl; - if ( i == tg->values_array.size() - 1 ){ + if ( i == targets.values_array.size() - 1 ){ os << " -*- | "; for ( const auto& mv : mat[size] ){ os.width(6); @@ -133,7 +134,8 @@ namespace Timbl { } void ConfusionMatrix::FScore( ostream& os, - const Target* tg, bool cs_too ) const { + const Targets& targets, + bool cs_too ) const { double maf = 0.0; double mif = 0.0; double maa = 0.0; @@ -147,13 +149,13 @@ namespace Timbl { os << "Scores per Value Class:" << endl; os << "class |\tTP\tFP\tTN\tFN\tprecision\trecall(TPR)\tFPR\t\tF-score\t\tAUC" << endl; } - for ( unsigned int i=0; i < tg->values_array.size(); ++i ){ + for ( unsigned int i=0; i < targets.values_array.size(); ++i ){ // so we loop over all known (trained) target values size_t TP = 0; size_t FP = 0; size_t FN = 0; size_t TN = 0; - ValueClass *tv = tg->values_array[i]; + ValueClass *tv = targets.values_array[i]; size_t testCount = 0; for ( unsigned int j=0; j < size; ++j ){ testCount += mat[i][j]; diff --git a/src/TRIBLExperiments.cxx b/src/TRIBLExperiments.cxx index 161b795..0954ad3 100644 --- a/src/TRIBLExperiments.cxx +++ b/src/TRIBLExperiments.cxx @@ -28,27 +28,21 @@ #include #include +#include #include -#include -#include +#include // for srand() #include -#include -#include "timbl/MsgClass.h" #include "timbl/Common.h" #include "timbl/Types.h" #include "timbl/Options.h" #include "timbl/Instance.h" -#include "timbl/Statistics.h" -#include "timbl/neighborSet.h" -#include "timbl/BestArray.h" #include "timbl/IBtree.h" #include "timbl/MBLClass.h" #include "timbl/TimblExperiment.h" namespace Timbl { using namespace std; - using namespace icu; void TRIBL_Experiment::InitInstanceBase(){ srand( RandomSeed() ); @@ 
-107,11 +101,11 @@ namespace Timbl { const TargetValue *Res = NULL; bool Tie = false; exact = false; - if ( !bestResult.reset( beamSize, normalisation, norm_factor, Targets ) ){ + if ( !bestResult.reset( beamSize, normalisation, norm_factor, targets ) ){ Warning( "no normalisation possible because a BeamSize is specified\n" "output is NOT normalized!" ); } - const ValueDistribution *ExResultDist = ExactMatch( Inst ); + const ClassDistribution *ExResultDist = ExactMatch( Inst ); if ( ExResultDist ){ Distance = 0.0; Res = ExResultDist->BestTarget( Tie, (RandomSeed() >= 0) ); @@ -120,7 +114,7 @@ namespace Timbl { } else { size_t level = 0; - const ValueDistribution *TrResultDist = 0; + const ClassDistribution *TrResultDist = 0; initExperiment(); IB_InstanceBase *SubTree = InstanceBase->TRIBL_test( Inst, TRIBL_offset(), @@ -142,13 +136,13 @@ namespace Timbl { else { testInstance( Inst, SubTree, TRIBL_offset() ); bestArray.initNeighborSet( nSet ); - WValueDistribution *ResultDist = getBestDistribution(); + WClassDistribution *ResultDist = getBestDistribution(); Res = ResultDist->BestTarget( Tie, (RandomSeed() >= 0) ); if ( Tie ){ ++num_of_neighbors; testInstance( Inst, SubTree, TRIBL_offset() ); bestArray.addToNeighborSet( nSet, num_of_neighbors ); - WValueDistribution *ResultDist2 = getBestDistribution(); + WClassDistribution *ResultDist2 = getBestDistribution(); bool Tie2 = false; const TargetValue *Res2 = ResultDist2->BestTarget( Tie2, (RandomSeed() >= 0) ); --num_of_neighbors; @@ -189,7 +183,7 @@ namespace Timbl { return Res; } - bool TRIBL_Experiment::checkLine( const UnicodeString& line ){ + bool TRIBL_Experiment::checkLine( const icu::UnicodeString& line ){ if ( !TimblExperiment::checkLine( line ) ){ return false; } @@ -201,7 +195,7 @@ namespace Timbl { return true; } - bool TRIBL2_Experiment::checkLine( const UnicodeString& line ){ + bool TRIBL2_Experiment::checkLine( const icu::UnicodeString& line ){ if ( !TimblExperiment::checkLine( line ) ){ return false; } @@ -218,12 +212,12 @@ namespace Timbl { bool& exact ){ const TargetValue *Res = NULL; exact = false; - if ( !bestResult.reset( beamSize, normalisation, norm_factor, Targets ) ){ + if ( !bestResult.reset( beamSize, normalisation, norm_factor, targets ) ){ Warning( "no normalisation possible because a BeamSize is specified\n" "output is NOT normalized!" 
); } bool Tie = false; - const ValueDistribution *ExResultDist = ExactMatch( Inst ); + const ClassDistribution *ExResultDist = ExactMatch( Inst ); if ( ExResultDist ){ Distance = 0.0; Res = ExResultDist->BestTarget( Tie, (RandomSeed() >= 0) ); @@ -232,21 +226,22 @@ namespace Timbl { } else { size_t level = 0; - const ValueDistribution *TrResultDist = 0; + const ClassDistribution *TrResultDist = 0; IB_InstanceBase *SubTree = InstanceBase->TRIBL2_test( Inst, TrResultDist, level ); if ( SubTree ){ testInstance( Inst, SubTree, level ); bestArray.initNeighborSet( nSet ); - WValueDistribution *ResultDist1 = getBestDistribution(); + WClassDistribution *ResultDist1 = getBestDistribution(); Res = ResultDist1->BestTarget( Tie, (RandomSeed() >= 0) ); if ( Tie ){ ++num_of_neighbors; testInstance( Inst, SubTree, level ); bestArray.addToNeighborSet( nSet, num_of_neighbors ); - WValueDistribution *ResultDist2 = getBestDistribution(); + WClassDistribution *ResultDist2 = getBestDistribution(); bool Tie2 = false; - const TargetValue *Res2 = ResultDist2->BestTarget( Tie2, (RandomSeed() >= 0) ); + const TargetValue *Res2 = ResultDist2->BestTarget( Tie2, + (RandomSeed() >= 0) ); --num_of_neighbors; if ( !Tie2 ){ delete ResultDist1; @@ -317,7 +312,8 @@ namespace Timbl { bool Hashed; int Version; string range_buf; - if ( !get_IB_Info( is, Pruned, Version, Hashed, range_buf ) ){ + size_t numF = get_IB_Info( is, Pruned, Version, Hashed, range_buf ); + if ( numF == 0 ){ return false; } else if ( Pruned ){ @@ -326,7 +322,7 @@ namespace Timbl { } else { TreeOrder = DataFile; - Initialize(); + Initialize( numF ); if ( !get_ranges( range_buf ) ){ Warning( "couldn't retrieve ranges..." ); } @@ -338,24 +334,24 @@ namespace Timbl { KeepDistributions() ); int pos=0; for ( size_t i=0; i < NumOfFeatures(); ++i ){ - Features[i]->SetWeight( 1.0 ); - if ( Features[permutation[i]]->Ignore() ){ - PermFeatures[i] = NULL; + features[i]->SetWeight( 1.0 ); + if ( features[features.permutation[i]]->Ignore() ){ + features.perm_feats[i] = NULL; } else { - PermFeatures[pos++] = Features[permutation[i]]; + features.perm_feats[pos++] = features[features.permutation[i]]; } } if ( Hashed ){ - result = InstanceBase->ReadIB( is, PermFeatures, - *Targets, - *Targets->hash(), - *Features[0]->hash(), - Version ); + result = InstanceBase->ReadIB_hashed( is, + features, + targets, + Version ); } else { - result = InstanceBase->ReadIB( is, PermFeatures, - *Targets, + result = InstanceBase->ReadIB( is, + features, + targets, Version ); } } @@ -369,7 +365,8 @@ namespace Timbl { bool Hashed; int Version; string range_buf; - if ( !get_IB_Info( is, Pruned, Version, Hashed, range_buf ) ){ + size_t numF = get_IB_Info( is, Pruned, Version, Hashed, range_buf ); + if ( numF == 0 ){ return false; } else if ( Pruned ){ @@ -378,7 +375,7 @@ namespace Timbl { } else { TreeOrder = DataFile; - Initialize(); + Initialize( numF ); if ( !get_ranges( range_buf ) ){ Warning( "couldn't retrieve ranges..." 
); } @@ -390,24 +387,24 @@ namespace Timbl { KeepDistributions() ); int pos=0; for ( size_t i=0; i < NumOfFeatures(); ++i ){ - Features[i]->SetWeight( 1.0 ); - if ( Features[permutation[i]]->Ignore() ){ - PermFeatures[i] = NULL; + features[i]->SetWeight( 1.0 ); + if ( features[features.permutation[i]]->Ignore() ){ + features.perm_feats[i] = NULL; } else { - PermFeatures[pos++] = Features[permutation[i]]; + features.perm_feats[pos++] = features[features.permutation[i]]; } } if ( Hashed ){ - result = InstanceBase->ReadIB( is, PermFeatures, - *Targets, - *Targets->hash(), - *Features[0]->hash(), - Version ); + result = InstanceBase->ReadIB_hashed( is, + features, + targets, + Version ); } else { - result = InstanceBase->ReadIB( is, PermFeatures, - *Targets, + result = InstanceBase->ReadIB( is, + features, + targets, Version ); } } diff --git a/src/Targets.cxx b/src/Targets.cxx new file mode 100644 index 0000000..5aba02e --- /dev/null +++ b/src/Targets.cxx @@ -0,0 +1,886 @@ +/* + Copyright (c) 1998 - 2023 + ILK - Tilburg University + CLST - Radboud University + CLiPS - University of Antwerp + + This file is part of timbl + + timbl is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + timbl is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + For questions and suggestions, see: + https://github.com/LanguageMachines/timbl/issues + or send mail to: + lamasoftware (at ) science.ru.nl +*/ + +#include +#include +#include +#include +#include // for find_if() +#include // for accumulate() +#include +#include + +#include "ticcutils/StringOps.h" +#include "ticcutils/PrettyPrint.h" +#include "ticcutils/UniHash.h" + +#include "timbl/Common.h" +#include "timbl/Types.h" +#include "timbl/Targets.h" + +using namespace std; +using namespace icu; + + +namespace Timbl { + using namespace Common; + using TiCC::operator<<; + + size_t Vfield::Index() { return value->Index(); } + + ostream& operator<<(ostream& os, const Vfield *vd ) { + return vd->put( os ); + } + + ostream& operator<<(ostream& os, const Vfield& vd ) { + return vd.put( os ); + } + + ostream& Vfield::put( ostream& os ) const { + os << value << " " << weight; + return os; + } + + inline int random_number( int Min, int Max ){ + // calculate a random integer within the interval [min,max] + if ( Min == Max ){ + return Min; + } + double randnum = (double)rand()/(double)RAND_MAX; + randnum *= (Max-Min); + randnum += Min; + return (int)floor(randnum+0.5); + } + + void ClassDistribution::clear(){ + for ( const auto& d : distribution ){ + delete d.second; + } + distribution.clear(); + total_items = 0; + } + + double ClassDistribution::Confidence( const TargetValue *tv ) const { + auto it = find_if( distribution.begin(), distribution.end(), + [tv]( const std::pair& v ){ + return v.second->Value() == tv ; } ); + if ( it != distribution.end() ){ + return it->second->Weight(); + } + return 0.0; + } + + void ClassDistribution::DistToString( string& DistStr, double minf ) const { + ostringstream oss; + oss.setf(ios::showpoint); + bool first = true; + oss << "{ "; + for ( const auto& it : distribution ){ 
+      Vfield *f = it.second;
+      if ( f->frequency >= minf ){
+        if ( !first ){
+          oss << ", ";
+        }
+        oss << f->value << " " << double(f->frequency);
+        first = false;
+      }
+    }
+    oss << " }";
+    DistStr = oss.str();
+  }
+
+  void WClassDistribution::DistToString( string& DistStr, double minw ) const {
+    ostringstream oss;
+    oss.setf(ios::showpoint);
+    bool first = true;
+    oss << "{ ";
+    for( const auto& it : distribution ){
+      Vfield *f = it.second;
+      if ( abs(f->weight) < minw ){
+        continue;
+      }
+      if ( abs(f->weight) < Epsilon ){
+        continue;
+      }
+      if ( !first ){
+        oss << ", ";
+      }
+      oss << f->value << " " << f->weight;
+      first = false;
+    }
+    oss << " }";
+    DistStr = oss.str();
+  }
+
+  class dblCmp {
+  public:
+    bool operator() ( const double d1, const double d2 ) const {
+      return d1 - d2 > Epsilon;
+    }
+  };
+
+  void ClassDistribution::DistToStringWW( string& DistStr, int beam ) const {
+    double minw = 0.0;
+    if ( beam > 0 ){
+      set freqs;
+      for ( const auto& it : distribution ){
+        Vfield *f = it.second;
+        freqs.insert( f->frequency );
+      }
+      int cnt=0;
+      for ( const auto& rit : freqs ){
+        if ( ++cnt == beam ) {
+          minw = rit;
+          break;
+        }
+      }
+    }
+    DistToString( DistStr, minw );
+  }
+
+  void WClassDistribution::DistToStringWW( string& DistStr,
+                                           int beam ) const {
+    double minw = 0.0;
+    if ( beam > 0 ){
+      set wgths;
+      for ( const auto& it : distribution ){
+        Vfield *f = it.second;
+        wgths.insert( f->weight );
+      }
+      int cnt=0;
+      for ( const auto& rit : wgths ){
+        if ( ++cnt == beam ) {
+          minw = rit;
+          break;
+        }
+      }
+    }
+    DistToString( DistStr, minw );
+  }
+
+  const string ClassDistribution::DistToString() const {
+    string result;
+    DistToString( result );
+    return result;
+  }
+
+  const string ClassDistribution::DistToStringW( int beam ) const {
+    string result;
+    DistToStringWW( result, beam );
+    return result;
+  }
+
+  double ClassDistribution::Entropy() const {
+    double entropy = 0.0;
+    size_t TotalVals = total_items;
+    if ( TotalVals > 0 ){
+      // Loop over the classes in the distribution
+      for ( const auto& it : distribution ){
+        size_t Freq = it.second->Freq();
+        if ( Freq > 0 ){
+          double Prob = Freq / (double)TotalVals;
+          entropy += Prob * Log2(Prob);
+        }
+      }
+    }
+    return fabs(entropy);
+  }
+
+  void WClassDistribution::Normalize() {
+    double sum = accumulate( distribution.begin(), distribution.end(),
+                             0.0,
+                             []( double r, const std::pair& v ){
+                               return r + v.second->Weight(); } );
+    for ( auto& it : distribution ){
+      it.second->SetWeight( it.second->Weight() / sum );
+    }
+  }
+
+  void WClassDistribution::Normalize_1( double factor,
+                                        const Targets& targ ) {
+    for ( const auto& val : targ.values_array ){
+      // search for val, if not there: add entry with frequency factor;
+      // otherwise increment the ExemplarWeight
+      size_t id = val->Index();
+      auto const& it = distribution.find( id );
+      if ( it != distribution.end() ){
+        it->second->SetWeight( it->second->Weight() + factor );
+      }
+      else {
+        distribution[id] = new Vfield( val, 1, factor );
+      }
+    }
+    total_items += targ.num_of_values();
+    Normalize();
+  }
+
+  void WClassDistribution::Normalize_2( ) {
+    for ( const auto& d : distribution ){
+      d.second->SetWeight( log1p( d.second->Weight() ) );
+    }
+    Normalize();
+  }
+
+  ClassDistribution *ClassDistribution::to_VD_Copy( ) const {
+    ClassDistribution *res = new ClassDistribution();
+    for ( const auto& d : distribution ){
+      size_t key = d.first;
+      Vfield *vdf = d.second;
+      res->distribution[key] = new Vfield( vdf->Value(),
+                                           vdf->Freq(),
+                                           vdf->Freq() );
+    }
+    res->total_items = total_items;
+    return res;
+  }
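To make the arithmetic of WClassDistribution::Normalize() above concrete: every stored weight is divided by the sum of all weights, so the distribution ends up summing to 1. The following is a minimal sketch only, not part of the patch; it goes through the Targets factory and the SetFreq()/operator<< helpers defined further down in this file, and it assumes Targets is default-constructible and that init() has to be called by hand.

    // Sketch only, not part of the patch. Builds a tiny weighted distribution
    // and normalizes it: each weight is divided by the sum of all weights.
    #include <iostream>
    #include <unicode/unistr.h>
    #include "timbl/Targets.h"

    int main() {
      Timbl::Targets targets;                  // assumption: default constructor
      targets.init();                          // allocates the UnicodeHash (see Targets::init further down)
      Timbl::TargetValue *yes = targets.add_value( icu::UnicodeString("yes"), 3 );
      Timbl::TargetValue *no  = targets.add_value( icu::UnicodeString("no"), 1 );

      Timbl::WClassDistribution dist;
      dist.SetFreq( yes, 3, 3.0 );             // frequency 3, sample weight 3.0
      dist.SetFreq( no,  1, 1.0 );             // frequency 1, sample weight 1.0
      dist.Normalize();                        // weights become 3.0/4.0 = 0.75 and 1.0/4.0 = 0.25
      std::cout << dist << std::endl;          // prints the distribution via the operator<< defined further down
      return 0;
    }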
+ + WClassDistribution *ClassDistribution::to_WVD_Copy() const { + WClassDistribution *res = new WClassDistribution(); + for ( const auto& d : distribution ){ + size_t key = d.first; + Vfield *vdf = d.second; + res->distribution[key] = new Vfield( vdf->Value(), + vdf->Freq(), + vdf->Freq() ); + } + res->total_items = total_items; + return res; + } + + WClassDistribution *WClassDistribution::to_WVD_Copy( ) const { + WClassDistribution *result = new WClassDistribution(); + for ( const auto& d : distribution ){ + size_t key = d.first; + Vfield *vdf = d.second; + result->distribution[key] = new Vfield( vdf->Value(), + vdf->Freq(), + vdf->Weight() ); + } + result->total_items = total_items; + return result; + } + + + // + // special functions to serialize distibutions including both frequency + // AND weight information. Needed for store/retrieve InstanceBases + // + // First hashed variant: + // + + const string ClassDistribution::SaveHashed() const{ + ostringstream oss; + oss << "{ "; + bool first = true; + for ( const auto& it : distribution ){ + Vfield *f = it.second; + if ( f->frequency > 0 ){ + if ( !first ){ + oss << ", "; + } + oss << f->value->Index() << " " << f->frequency; + first = false; + } + } + oss << " }"; + return oss.str(); + } + + const string WClassDistribution::SaveHashed() const{ + ostringstream oss; + bool first = true; + oss << "{ "; + for ( const auto& it : distribution ){ + Vfield *f = it.second; + if ( f->frequency > 0 ){ + if ( !first ){ + oss << ", "; + } + oss << f->Value()->Index() << " " + << f->frequency << " " << f->weight; + first = false; + } + } + oss << " }"; + return oss.str(); + } + + // + // non-hashed variant: + // + + const string ClassDistribution::Save() const{ + ostringstream oss; + oss << "{ "; + bool first = true; + for ( const auto& it : distribution ){ + Vfield *f = it.second; + if ( f->frequency > 0 ){ + if ( !first ){ + oss << ", "; + } + oss << f->value << " " << f->frequency; + first = false; + } + } + oss << " }"; + return oss.str(); + } + + const string WClassDistribution::Save() const{ + ostringstream oss; + oss << "{ "; + bool first = true; + for ( const auto& it : distribution ){ + Vfield *f = it.second; + if ( f->frequency > 0 ){ + if ( !first ){ + oss << ", "; + } + oss.setf(ios::showpoint); + oss << f->value << " " << f->frequency << " " << f->weight; + first = false; + } + } + oss << " }"; + return oss.str(); + } + + void ClassDistribution::SetFreq( const TargetValue *val, const int freq, + double ){ + // add entry with frequency freq; + Vfield *temp = new Vfield( val, freq, freq ); + distribution[val->Index()] = temp; + total_items += freq; + } + + void WClassDistribution::SetFreq( const TargetValue *val, const int freq, + double sw ){ + // add entry with frequency freq; + // also sets the sample_weight + Vfield *temp = new Vfield( val, freq, sw ); + distribution[val->Index()] = temp; + total_items += freq; + } + + bool ClassDistribution::IncFreq( const TargetValue *val, + size_t occ, + double ){ + // search for val, if not there: add entry with frequency 'occ'; + // otherwise increment the freqency + size_t id = val->Index(); + auto const& it = distribution.find( id ); + if ( it != distribution.end() ){ + it->second->IncFreq( occ ); + } + else { + distribution[id] = new Vfield( val, occ, 1.0 ); + } + total_items += occ; + return true; + } + + bool WClassDistribution::IncFreq( const TargetValue *val, + size_t occ, + double sw ){ + // search for val, if not there: add entry with frequency 'occ'; + // otherwise increment the 
freqency + // also set sample weight + size_t id = val->Index(); + auto const& it = distribution.find( id ); + if ( it != distribution.end() ){ + it->second->IncFreq( occ ); + } + else { + distribution[id] = new Vfield( val, occ, sw ); + } + total_items += occ; + return fabs( distribution[id]->Weight() - sw ) > Epsilon; + } + + void ClassDistribution::DecFreq( const TargetValue *val ){ + // search for val, if not there, just forget + // otherwise decrement the freqency + auto const& it = distribution.find( val->Index() ); + if ( it != distribution.end() ){ + it->second->DecFreq(); + total_items -= 1; + } + } + + void ClassDistribution::Merge( const ClassDistribution& VD ){ + for ( const auto& it : VD.distribution ){ + size_t key = it.first; + Vfield *vd = it.second; + if ( distribution.find(key) != distribution.end() ){ + distribution[key]->AddFreq( vd->Freq() ); + } + else { + // VD might be weighted. But we don't need/want that info here + // Weight == Freq is more convenient + distribution[key] = new Vfield( vd->Value(), vd->Freq(), + vd->Freq() ); + } + } + total_items += VD.total_items; + } + + void WClassDistribution::MergeW( const ClassDistribution& VD, + double Weight ){ + for ( const auto& it : VD.distribution ){ + Vfield *vd = it.second; + size_t key = it.first; + if ( distribution.find(key) != distribution.end() ){ + distribution[key]->SetWeight( distribution[key]->Weight() + vd->Weight() *Weight ); + } + else { + distribution[key] = new Vfield( vd->Value(), 1, + vd->Weight() * Weight); + } + } + total_items += VD.total_items; + } + + const TargetValue *ClassDistribution::BestTarget( bool& tie, + bool do_rand ) const { + // get the most frequent target from the distribution. + // In case of a tie take the one which is GLOBALLY the most frequent, + // OR (if do_rand) take random one of the most frequents + // and signal if this ties also! + const TargetValue *best = NULL; + tie = false; + auto It = distribution.begin(); + if ( It != distribution.end() ){ + Vfield *pnt = It->second; + size_t Max = pnt->Freq(); + if ( do_rand ){ + int nof_best=1, pick=1; + ++It; + while ( It != distribution.end() ){ + pnt = It->second; + if ( pnt->Freq() > Max ){ + Max = pnt->Freq(); + nof_best = 1; + } + else { + if ( pnt->Freq() == Max ){ + nof_best++; + } + } + ++It; + } + tie = ( nof_best > 1 ); + pick = random_number( 1, nof_best ); + It = distribution.begin(); + nof_best = 0; + while ( It != distribution.end() ){ + pnt = It->second; + if ( pnt->Freq() == Max ){ + if ( ++nof_best == pick ){ + return pnt->Value(); + } + } + ++It; + } + return NULL; + } + else { + best = pnt->Value(); + ++It; + while ( It != distribution.end() ){ + pnt = It->second; + if ( pnt->Freq() > Max ){ + tie = false; + best = pnt->Value(); + Max = pnt->Freq(); + } + else { + if ( pnt->Freq() == Max ) { + tie = true; + if ( pnt->Value()->ValFreq() > best->ValFreq() ){ + best = pnt->Value(); + } + } + } + ++It; + } + return best; + } + } + return best; + } + + const TargetValue *WClassDistribution::BestTarget( bool& tie, + bool do_rand ) const { + // get the most frequent target from the distribution. + // In case of a tie take the one which is GLOBALLY the most frequent, + // OR (if do_rand) take random one of the most frequents + // and signal if this ties also! 
+ const TargetValue *best = NULL; + auto It = distribution.begin(); + tie = false; + if ( It != distribution.end() ){ + double Max = It->second->Weight(); + if ( do_rand ){ + int nof_best=1, pick=1; + ++It; + while ( It != distribution.end() ){ + if ( It->second->Weight() > Max ){ + Max = It->second->Weight(); + nof_best = 1; + } + else { + if ( abs(It->second->Weight()- Max) < Epsilon ){ + nof_best++; + } + } + ++It; + } + tie = ( nof_best > 1 ); + pick = random_number( 1, nof_best ); + It = distribution.begin(); + nof_best = 0; + while ( It != distribution.end() ){ + if ( abs(It->second->Weight() - Max) < Epsilon ){ + if ( ++nof_best == pick ){ + return It->second->Value(); + } + } + ++It; + } + return NULL; + } + else { + best = It->second->Value(); + ++It; + while ( It != distribution.end() ){ + if ( It->second->Weight() > Max ){ + tie = false; + best = It->second->Value(); + Max = It->second->Weight(); + } + else { + if ( abs(It->second->Weight() - Max) < Epsilon ) { + tie = true; + if ( It->second->Value()->ValFreq() > best->ValFreq() ){ + best = It->second->Value(); + } + } + } + ++It; + } + return best; + } + } + return best; + } + + ostream& operator<<(ostream& os, const ClassDistribution& vd ) { + string tmp; + vd.DistToString( tmp ); + os << tmp; + return os; + } + + ostream& operator<<(ostream& os, const ClassDistribution *vd ) { + string tmp = "{null}"; + if ( vd ){ + vd->DistToString( tmp ); + } + os << tmp; + return os; + } + + ClassDistribution *ClassDistribution::read_distribution( istream &is, + Targets& Targ, + bool do_fr ){ + // read a distribution from stream is into Target + // if do_f we also adjust the value of Frequency of the Target, which is + // otherwise 1. Special case when reading the TopDistribution. + // + ClassDistribution *result = 0; + char nextCh; + is >> nextCh; // skip { + if ( nextCh != '{' ){ + throw runtime_error( "missing '{' in distribution string." ); + } + else { + int next; + do { + size_t freq; + UnicodeString buf; + is >> ws >> buf; + is >> freq; + TargetValue *target; + if ( do_fr ){ + target = Targ.add_value( buf, freq ); + } + else { + target = Targ.Lookup( buf ); + } + if ( !target ){ + delete result; + result = 0; + break; + } + next = look_ahead(is); + if ( next == ',' ){ + if ( !result ) { + result = new ClassDistribution(); + } + result->SetFreq( target, freq ); + is >> nextCh; + next = look_ahead(is); + } + else if ( next == '}' ){ + if ( !result ){ + result = new ClassDistribution(); + } + result->SetFreq( target, freq ); + } + else if ( isdigit(next) ){ + if ( !result ){ + result = new WClassDistribution(); + } + double sw; + is >> sw; + result->SetFreq( target, freq, sw ); + next = look_ahead(is); + if ( next == ',' ){ + is >> nextCh; + next = look_ahead(is); + } + } + } while ( is && next != '}' ); + if ( is ){ + is >> nextCh; // skip } + } + else { + delete result; + throw runtime_error( "missing '}' in distribution string." ); + } + } + return result; + } + + + ClassDistribution *ClassDistribution::read_distribution_hashed( istream &is, + Targets& Targ, + bool do_fr ){ + + ClassDistribution *result = 0; + // read a distribution from stream is into Target + // if do_f we also adjust the value of Frequency of the Target, which is + // otherwise 1. Special case when reading the TopDistribution. + // + char nextCh; + is >> nextCh; // skip { + if ( nextCh != '{' ){ + throw runtime_error( "missing '{' in distribution string." 
); + } + else { + int next; + do { + unsigned int index; + size_t freq; + is >> index; + is >> freq; + TargetValue *target; + if ( do_fr ){ + target = Targ.add_value( index, freq ); + } + else { + target = Targ.ReverseLookup( index ); + } + if ( !target ){ + delete result; + result = 0; + break; + } + next = look_ahead(is); + if ( next == ',' ){ + if ( !result ){ + result = new ClassDistribution(); + } + result->SetFreq( target, freq ); + is >> nextCh; + next = look_ahead(is); + } + else if ( next == '}' ){ + if ( !result ){ + result = new ClassDistribution(); + } + result->SetFreq( target, freq ); + } + else if ( isdigit(next) ){ + double sw; + is >> sw; + if ( !result ){ + result = new WClassDistribution(); + } + result->SetFreq( target, freq, sw ); + next = look_ahead(is); + if ( next == ',' ){ + is >> nextCh; + next = look_ahead(is); + } + } + } while ( is && next != '}' ); + if ( is ){ + is >> nextCh; // skip thr '}' + } + else { + delete result; + throw runtime_error( "missing '}' in distribution string" ); + } + } + return result; + } + + + ostream& operator<<( std::ostream& os, ValueClass const *vc ){ + if ( vc ){ + os << vc->name(); + } + else { + os << "*FV-NF*"; + } + return os; + } + + TargetValue::TargetValue( const UnicodeString& value, + size_t value_hash ): + ValueClass( value, value_hash ){} + + size_t Targets::EffectiveValues() const { + return count_if( values_array.begin(), values_array.end(), + [&]( const TargetValue* v ){ + return (v->ValFreq() > 0); } ); + } + + size_t Targets::TotalValues() const { + return accumulate( values_array.begin(), values_array.end(), + 0, + [&]( size_t r, const TargetValue *v ){ + return r + v->ValFreq(); } ); + } + + Targets &Targets::operator=( const Targets& t ){ + if ( this != &t ){ + values_array = t.values_array; + reverse_values = t.reverse_values; + target_hash = t.target_hash; // shared ?? 
+ is_reference =true; + } + return *this; + } + + + Targets::~Targets() { + if ( !is_reference ){ + for ( const auto& it : values_array ){ + delete it; + } + delete target_hash; + } + reverse_values.clear(); + } + + void Targets::init(){ + target_hash = new Hash::UnicodeHash(); + } + + TargetValue *Targets::Lookup( const UnicodeString& str ) const { + TargetValue *result = 0; + size_t index = target_hash->lookup( str ); + if ( index ) { + auto const& it = reverse_values.find( index ); + result = it->second; + } + return result; + } + + TargetValue *Targets::ReverseLookup( size_t index ) const { + auto const& it = reverse_values.find( index ); + return it->second; + } + + TargetValue *Targets::add_value( const UnicodeString& valstr, int freq ){ + unsigned int hash_val = target_hash->hash( valstr ); + // cerr << "target hash(" << valstr << ") geeft: " << hash_val << endl; + return add_value( hash_val, freq ); + } + + TargetValue *Targets::add_value( size_t index, int freq ){ + auto const& it = reverse_values.find( index ); + if ( it == reverse_values.end() ){ + const UnicodeString& name = target_hash->reverse_lookup( index ); + // cerr << "target lookup(" << index << ") geeft: " << name << endl; + // we want to store the singleton value for this index + // so we MUST reverse lookup the index + TargetValue *tv = new TargetValue( name, index ); + tv->ValFreq( freq ); + reverse_values[index] = tv; + values_array.push_back( tv ); + } + else { + it->second->IncValFreq( freq ); + } + return reverse_values[index]; + } + + TargetValue *Targets::MajorityClass() const { + TargetValue *result = 0; + size_t freq = 0; + for ( const auto& it : values_array ){ + if ( it->ValFreq() > freq ){ + result = it; + freq = result->ValFreq(); + } + } + return result; + } + + bool Targets::increment_value( TargetValue *TV ){ + bool result = false; + if ( TV ){ + TV->incr_val_freq(); + result = true; + } + return result; + } + + bool Targets::decrement_value( TargetValue *TV ){ + bool result = false; + if ( TV ){ + TV->decr_val_freq(); + result = true; + } + return result; + } + +} diff --git a/src/Testers.cxx b/src/Testers.cxx index 7cee02e..f61469b 100644 --- a/src/Testers.cxx +++ b/src/Testers.cxx @@ -25,13 +25,8 @@ lamasoftware (at ) science.ru.nl */ #include -#include #include -#include -#include -#include -#include -#include +#include #include "timbl/Common.h" #include "timbl/Types.h" @@ -83,34 +78,33 @@ namespace Timbl{ } TesterClass* getTester( MetricType m, - const std::vector& features, - const std::vector& permutation, + const Feature_List& features, int mvdThreshold ){ if ( m == Cosine ){ - return new CosineTester( features, permutation ); + return new CosineTester( features ); } else if ( m == DotProduct ){ - return new DotProductTester( features, permutation ); + return new DotProductTester( features ); } else { - return new DistanceTester( features, permutation, mvdThreshold ); + return new DistanceTester( features, mvdThreshold ); } } - TesterClass::TesterClass( const vector& feat, - const vector& perm ): - _size(feat.size()), + TesterClass::TesterClass( const Feature_List& features ): + _size(features.feats.size()), effSize(_size), offSet(0), FV(0), - features(feat), - permutation(perm) { + features(features.feats), + permutation(features.permutation) + { permFeatures.resize(_size,0); #ifdef DBGTEST cerr << "created TesterClass(" << _size << ")" << endl; #endif for ( size_t j=0; j < _size; ++j ){ - permFeatures[j] = feat[perm[j]]; + permFeatures[j] = features.feats[features.permutation[j]]; } 
distances.resize(_size+1, 0.0); } @@ -127,24 +121,21 @@ namespace Timbl{ } DistanceTester::~DistanceTester(){ - for ( size_t i=0; i < _size; ++i ){ - delete metricTest[i]; + for ( const auto& it : metricTest ){ + delete it; } - delete [] metricTest; } - DistanceTester::DistanceTester( const vector& feat, - const vector& perm, + DistanceTester::DistanceTester( const Feature_List& features, int mvdmThreshold ): - TesterClass( feat, perm ){ + TesterClass( features ){ #ifdef DBGTEST cerr << "create a tester with threshold = " << mvdmThreshold << endl; #endif - metricTest = new metricTestFunction*[_size]; + metricTest.resize(_size,0); for ( size_t i=0; i < _size; ++i ){ - metricTest[i] = 0; #ifdef DBGTEST - cerr << "set metric[" << i+1 << "]=" << TiCC::toString(features[i]->getMetricType()) << endl; + cerr << "set metric[" << i+1 << "]=" << TiCC::toString(features.feats[i]->getMetricType()) << endl; #endif if ( features[i]->Ignore() ) continue; @@ -197,7 +188,7 @@ namespace Timbl{ inline bool FV_to_real( FeatureValue *FV, double &result ){ if ( FV ){ - if ( TiCC::stringTo( FV->name_u(), result ) ){ + if ( TiCC::stringTo( FV->name(), result ) ){ return true; } } @@ -278,7 +269,7 @@ namespace Timbl{ } double DotProductTester::getDistance( size_t pos ) const{ -#ifdef DBGTEST +#ifdef DBGTEST_DOT cerr << "getDistance, maxSim = " << std::numeric_limits::max() << endl; cerr << " distances[" << pos << "]= " << distances[pos] << endl; #endif diff --git a/src/Timbl.cxx b/src/Timbl.cxx index 335381f..b4f76fa 100644 --- a/src/Timbl.cxx +++ b/src/Timbl.cxx @@ -31,11 +31,6 @@ #include #include #include -#include -#include -#include -#include -#include #include "config.h" #include "ticcutils/CommandLine.h" diff --git a/src/TimblAPI.cxx b/src/TimblAPI.cxx index f91bfca..c54b0bd 100644 --- a/src/TimblAPI.cxx +++ b/src/TimblAPI.cxx @@ -397,7 +397,7 @@ namespace Timbl { } const TargetValue *TimblAPI::Classify( const string& s, - const ValueDistribution *& db, + const ClassDistribution *& db, double& di ){ if ( Valid() ){ return pimpl->Classify( TiCC::UnicodeFromUTF8(s), db, di ); @@ -410,7 +410,7 @@ namespace Timbl { } const TargetValue *TimblAPI::Classify( const icu::UnicodeString& s, - const ValueDistribution *& db, + const ClassDistribution *& db, double& di ){ if ( Valid() ){ return pimpl->Classify( s, db, di ); @@ -437,7 +437,7 @@ namespace Timbl { } const TargetValue *TimblAPI::Classify( const string& s, - const ValueDistribution *& db ){ + const ClassDistribution *& db ){ if ( Valid() ){ return pimpl->Classify( TiCC::UnicodeFromUTF8(s), db ); } @@ -448,7 +448,7 @@ namespace Timbl { } const TargetValue *TimblAPI::Classify( const icu::UnicodeString& s, - const ValueDistribution *& db ){ + const ClassDistribution *& db ){ if ( Valid() ){ return pimpl->Classify( s, db ); } @@ -505,11 +505,11 @@ namespace Timbl { return 0; } - const Target *TimblAPI::myTargets() const{ + const Targets& TimblAPI::myTargets() const{ if ( Valid() ){ - return pimpl->Targets; + return pimpl->targets; } - return 0; + abort(); } bool TimblAPI::Classify( const string& s, string& cls ){ diff --git a/src/TimblExperiment.cxx b/src/TimblExperiment.cxx index b77b9db..633a3ae 100644 --- a/src/TimblExperiment.cxx +++ b/src/TimblExperiment.cxx @@ -29,20 +29,9 @@ #include #include #include -#include #include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include #include #include "config.h" @@ -59,12 +48,12 @@ #include "timbl/BestArray.h" #include "timbl/IBtree.h" #include 
"timbl/MBLClass.h" -#include "ticcutils/CommandLine.h" #include "timbl/GetOptClass.h" #include "timbl/TimblExperiment.h" #include "ticcutils/XMLtools.h" #include "ticcutils/Timer.h" #include "ticcutils/PrettyPrint.h" +#include "ticcutils/CommandLine.h" #ifdef HAVE_OPENMP #include @@ -75,6 +64,7 @@ using namespace icu; using namespace nlohmann; namespace Timbl { + using TiCC::operator<<; const string timbl_short_opts = "a:b:B:c:C:d:De:f:F:G::hHi:I:k:l:L:m:M:n:N:o:O:p:P:q:QR:s::t:T:u:U:v:Vw:W:xX:Z%"; const string timbl_long_opts = ",Beam:,clones:,Diversify,occurrences:,sloppy::,silly::,Threshold:,Treeorder:,matrixin:,matrixout:,version,help,limit:"; @@ -86,12 +76,12 @@ namespace Timbl { } bool resultStore::reset( int _beam, normType _norm, - double _factor, const Target *_targets ) { + double _factor, const Targets& _targets ) { clear(); beam = _beam; norm = _norm; factor = _factor; - targets = _targets; + targets = &_targets; bool result = true; if ( norm != noNorm && beam != 0 ){ @@ -114,7 +104,7 @@ namespace Timbl { resultCache.clear(); } - const WValueDistribution *resultStore::getResultDist() { + const WClassDistribution *resultStore::getResultDist() { if ( rawDist && !dist ){ prepare(); } @@ -144,14 +134,14 @@ namespace Timbl { return resultCache; } - void resultStore::addConstant( const ValueDistribution *vd, + void resultStore::addConstant( const ClassDistribution *vd, const TargetValue *best_result ) { rawDist = vd; best_target = best_result; disposable = false; } - void resultStore::addTop( const ValueDistribution *vd, + void resultStore::addTop( const ClassDistribution *vd, const TargetValue *best_result ) { rawDist = vd; best_target = best_result; @@ -159,7 +149,7 @@ namespace Timbl { isTop = true; } - void resultStore::addDisposable( ValueDistribution *vd, + void resultStore::addDisposable( ClassDistribution *vd, const TargetValue *best_result ) { rawDist = vd; best_target = best_result; @@ -175,7 +165,7 @@ namespace Timbl { dist = rawDist->to_WVD_Copy(); } else { - dist = dynamic_cast( const_cast(rawDist) ); + dist = dynamic_cast( const_cast(rawDist) ); rawDist = 0; } } @@ -190,7 +180,7 @@ namespace Timbl { dist->Normalize(); break; case addFactorNorm: - dist->Normalize_1( factor, targets ); + dist->Normalize_1( factor, *targets ); break; case logProbNorm: dist->Normalize_2(); @@ -214,8 +204,6 @@ namespace Timbl { Initialized( false ), OptParams( NULL ), algorithm( Alg ), - CurrentDataFile( "" ), - WFileName( "" ), ibCount( 0 ), confusionInfo( 0 ), match_depth(-1), @@ -267,7 +255,7 @@ namespace Timbl { result->OptParams = OptParams->Clone( 0 ); } result->WFileName = WFileName; - result->CurrentDataFile = ""; + result->CurrentDataFile.clear(); result->InstanceBase->CleanPartition( false ); result->InstanceBase = 0; result->is_synced = true; @@ -282,7 +270,7 @@ namespace Timbl { delete confusionInfo; confusionInfo = 0; if ( Verbosity(ADVANCED_STATS) ){ - confusionInfo = new ConfusionMatrix( Targets->num_of_values() ); + confusionInfo = new ConfusionMatrix( targets.num_of_values() ); } initDecay(); calculate_fv_entropy( true ); @@ -295,8 +283,8 @@ namespace Timbl { // // invalidate MVDM matrices, they might be changing in size for ( size_t j=0; j < NumOfFeatures(); ++j ){ - if ( !Features[j]->Ignore() ){ - Features[j]->clear_matrix(); + if ( !features[j]->Ignore() ){ + features[j]->clear_matrix(); } } } @@ -376,7 +364,8 @@ namespace Timbl { bool expand ){ assert( runningPhase == LearnWords ); bool result = false; - if ( FileName != "" && ConfirmOptions() ){ + if ( 
!FileName.empty() + && ConfirmOptions() ){ if ( !ExpInvalid() ){ if ( !expand && ( Options.TableFrozen() || @@ -473,7 +462,7 @@ namespace Timbl { if ( !Verbosity(SILENT) ){ Info( "Preparation took " + prepT.toString() ); } - if ( warnOnSingleTarget && Targets->EffectiveValues() <=1 ){ + if ( warnOnSingleTarget && targets.EffectiveValues() <=1 ){ Warning( "Training file contains only 1 class." ); } result = true; @@ -494,7 +483,6 @@ namespace Timbl { } ostream& operator<< ( ostream& os, const fileIndex& fi ){ - using TiCC::operator<<; for ( const auto& it : fi ){ os << "<"; os << it.first << "," << it.second; @@ -545,8 +533,8 @@ namespace Timbl { if ( is_synced ){ CurrentDataFile = FileName; // assume magic! } - if ( CurrentDataFile == "" ) { - if ( FileName == "" ){ + if ( CurrentDataFile.empty() ) { + if ( FileName.empty() ){ Warning( "unable to build an InstanceBase: No datafile defined yet" ); result = false; } @@ -554,7 +542,7 @@ namespace Timbl { result = false; } } - else if ( FileName != "" && + else if ( !FileName.empty() && CurrentDataFile != FileName ){ Error( "Unable to Learn from file '" + FileName + "'\n" "while previously instantiated from file '" + @@ -644,7 +632,7 @@ namespace Timbl { const bool init ): TimblExperiment( IB1_a, s ){ if ( init ) { - InitClass( N ); + init_options_table(N); } TreeOrder = GRoverFeature; } @@ -661,21 +649,19 @@ namespace Timbl { Warning( "unable to Increment, No InstanceBase available" ); result = false; } + else if ( !Chop( InstanceString ) ){ + Error( "Couldn't convert to Instance: " + + TiCC::UnicodeToUTF8(InstanceString) ); + result = false; // No more input + } else { - if ( !Chop( InstanceString ) ){ - Error( "Couldn't convert to Instance: " - + TiCC::UnicodeToUTF8(InstanceString) ); - result = false; // No more input - } - else { - chopped_to_instance( TrainLearnWords ); - MBL_init = false; - bool happy = InstanceBase->AddInstance( CurrInst ); - if ( !happy ){ - Warning( "deviating exemplar weight in:\n" + - TiCC::UnicodeToUTF8(InstanceString) - + "\nIgnoring the new weight" ); - } + chopped_to_instance( TrainLearnWords ); + MBL_init = false; + bool happy = InstanceBase->AddInstance( CurrInst ); + if ( !happy ){ + Warning( "deviating exemplar weight in:\n" + + TiCC::UnicodeToUTF8(InstanceString) + + "\nIgnoring the new weight" ); } } return result; @@ -719,7 +705,7 @@ namespace Timbl { Warning( "unable to expand the InstanceBase: Not there" ); result = false; } - else if ( FileName == "" ){ + else if ( FileName.empty() ){ Warning( "unable to expand the InstanceBase: No inputfile specified" ); result = false; } @@ -800,7 +786,7 @@ namespace Timbl { Warning( "unable to remove from InstanceBase: Not there" ); result = false; } - else if ( FileName == "" ){ + else if ( FileName.empty() ){ Warning( "unable to remove from InstanceBase: No input specified" ); result = false; } @@ -908,7 +894,7 @@ namespace Timbl { Progress( 10000 ); } } - if ( exp_name != "" ){ + if ( !exp_name.empty() ){ os << "-" << exp_name << "-"; } os << "Tested: "; @@ -962,7 +948,7 @@ namespace Timbl { Progress( 10000 ); } } - if ( exp_name != "" ){ + if ( !exp_name.empty() ){ os << "-" << exp_name << "-"; } os << "Learning: "; @@ -1010,7 +996,7 @@ namespace Timbl { bool TimblExperiment::showStatistics( ostream& os ) const { os << endl; if ( confusionInfo ){ - confusionInfo->FScore( os, Targets, Verbosity(CLASS_STATS) ); + confusionInfo->FScore( os, targets, Verbosity(CLASS_STATS) ); } os << "overall accuracy: " << stats.testedCorrect()/(double) stats.dataLines() @@ 
-1042,13 +1028,13 @@ namespace Timbl { } if ( confusionInfo && Verbosity(CONF_MATRIX) ){ os << endl; - confusionInfo->Print( os, Targets ); + confusionInfo->Print( os, targets ); } return true; } bool TimblExperiment::createPercFile( const string& fileName ) const { - if ( fileName != "" ) { + if ( !fileName.empty() ) { ofstream outfile( fileName, ios::out | ios::trunc); if (!outfile) { Warning( "can't open: " + fileName ); @@ -1099,7 +1085,7 @@ namespace Timbl { const string& dString, const TargetValue *Best, const double Distance ) { - outfile << get_org_input() << CodeToStr(Best->name_u()); + outfile << get_org_input() << CodeToStr(Best->name()); if ( Verbosity(CONFIDENCE) ){ outfile << " [" << confidence << "]"; } @@ -1496,24 +1482,22 @@ namespace Timbl { + " vs. " + TiCC::toString(NumOfFeatures()) + ")" ); } } + else if ( Initialized ){ + result = true; + } + else if ( IBStatus() == Invalid ){ + Warning( "no Instance Base is available yet" ); + } + else if ( !setInputFormat( IF ) ){ + Error( "Couldn't set input format to " + TiCC::toString( IF ) ); + } else { - if ( Initialized ){ - result = true; - } - else if ( IBStatus() == Invalid ){ - Warning( "no Instance Base is available yet" ); - } - else if ( !setInputFormat( IF ) ){ - Error( "Couldn't set input format to " + TiCC::toString( IF ) ); - } - else { - if ( Verbosity(NEAR_N) ){ - Do_Exact( false ); - } - initExperiment(); - Initialized = true; - result = true; + if ( Verbosity(NEAR_N) ){ + Do_Exact( false ); } + initExperiment(); + Initialized = true; + result = true; } } return result; @@ -1543,7 +1527,7 @@ namespace Timbl { const TargetValue *targ = classifyString( TiCC::UnicodeFromUTF8(inst), distance ); if ( targ ){ - string cat = targ->Name(); + string cat = targ->name_string(); normalizeResult(); result["category"] = cat; if ( Verbosity(NEAR_N) ){ @@ -1602,7 +1586,7 @@ namespace Timbl { const TargetValue *targ = classifyString( TiCC::UnicodeFromUTF8(Line), Distance ); if ( targ ){ - Result = TiCC::UnicodeToUTF8(targ->name_u()); + Result = targ->name_string(); normalizeResult(); Dist = bestResult.getResult(); return true; @@ -1618,7 +1602,7 @@ namespace Timbl { Dist.remove(); const TargetValue *targ = classifyString( Line, Distance ); if ( targ ){ - Result = targ->name_u(); + Result = targ->name(); normalizeResult(); Dist = TiCC::UnicodeFromUTF8(bestResult.getResult()); return true; @@ -1649,12 +1633,12 @@ namespace Timbl { bool recurse = true; bool Tie = false; exact = false; - if ( !bestResult.reset( beamSize, normalisation, norm_factor, Targets ) ){ + if ( !bestResult.reset( beamSize, normalisation, norm_factor, targets ) ){ Warning( "no normalisation possible because a BeamSize is specified\n" "output is NOT normalized!" 
); } - const ValueDistribution *ExResultDist = ExactMatch( Inst ); - WValueDistribution *ResultDist = 0; + const ClassDistribution *ExResultDist = ExactMatch( Inst ); + WClassDistribution *ResultDist = 0; nSet.clear(); const TargetValue *Res; if ( ExResultDist ){ @@ -1684,7 +1668,7 @@ namespace Timbl { ++num_of_neighbors; testInstance( Inst, InstanceBase ); bestArray.addToNeighborSet( nSet, num_of_neighbors ); - WValueDistribution *ResultDist2 = getBestDistribution(); + WClassDistribution *ResultDist2 = getBestDistribution(); const TargetValue *Res2 = ResultDist2->BestTarget( Tie2, (RandomSeed() >= 0) ); --num_of_neighbors; if ( !Tie2 ){ @@ -1758,7 +1742,7 @@ namespace Timbl { double TimblExperiment::sum_remaining_weights( size_t level ) const { double result = 0.0; for ( size_t i = level; i < EffectiveFeatures(); ++i ){ - result += PermFeatures[i]->Weight(); + result += features.perm_feats[i]->Weight(); } return result; } @@ -1774,20 +1758,20 @@ namespace Timbl { os << endl; os << "Deviant Feature Metrics:"; int cnt = 0; - size_t *InvPerm = new size_t[NumOfFeatures()]; + vector InvPerm( NumOfFeatures() ); for ( size_t i = 0; i < NumOfFeatures(); ++i ){ - InvPerm[permutation[i]] = i; + InvPerm[features.permutation[i]] = i; } for ( size_t i = 0; i < NumOfFeatures(); ++i ){ - if ( !Features[i]->Ignore() && + if ( !features[i]->Ignore() && InvPerm[i]+1 > TRIBL_offset() ){ - MetricType mt = Features[i]->getMetricType(); + MetricType mt = features[i]->getMetricType(); if ( mt != globalMetricOption ){ ++cnt; os << endl << " Feature[" << i+1 << "] : " << TiCC::toString( mt, true ); - if ( Features[i]->isStorableMetric() ){ + if ( features[i]->isStorableMetric() ){ bool readM = false; - if ( Features[i]->matrixPresent( readM ) ){ + if ( features[i]->matrixPresent( readM ) ){ if ( readM ){ os << " (User Defined)"; } @@ -1802,7 +1786,6 @@ namespace Timbl { } } } - delete [] InvPerm; if ( cnt ){ os << endl; } @@ -1832,7 +1815,7 @@ namespace Timbl { void TimblExperiment::show_ignore_info( ostream& os ) const{ bool first = true; for ( size_t i=0; i< NumOfFeatures(); ++i ){ - if ( Features[i]->Ignore() ){ + if ( features[i]->Ignore() ){ if ( first ){ first = false; os << "Ignored features : { "; @@ -2461,7 +2444,8 @@ namespace Timbl { bool Hashed; int Version; string range_buf; - if ( !get_IB_Info( is, Pruned, Version, Hashed, range_buf ) ){ + size_t numF = get_IB_Info( is, Pruned, Version, Hashed, range_buf ); + if ( numF == 0 ){ return false; } else if ( Pruned ){ @@ -2470,7 +2454,7 @@ namespace Timbl { } else { TreeOrder = DataFile; - Initialize(); + Initialize( numF ); if ( !get_ranges( range_buf ) ){ Warning( "couldn't retrieve ranges..." 
); } @@ -2478,27 +2462,27 @@ namespace Timbl { srand( RandomSeed() ); int pos=0; for ( size_t i=0; i < NumOfFeatures(); ++i ){ - Features[i]->SetWeight( 1.0 ); - if ( Features[permutation[i]]->Ignore() ){ - PermFeatures[i] = NULL; + features[i]->SetWeight( 1.0 ); + if ( features[features.permutation[i]]->Ignore() ){ + features.perm_feats[i] = NULL; } else { - PermFeatures[pos++] = Features[permutation[i]]; + features.perm_feats[pos++] = features[features.permutation[i]]; } } InstanceBase = new IB_InstanceBase( EffectiveFeatures(), ibCount, (RandomSeed()>=0) ); if ( Hashed ){ - result = InstanceBase->ReadIB( is, PermFeatures, - *Targets, - *Targets->hash(), - *Features[0]->hash(), - Version ); + result = InstanceBase->ReadIB_hashed( is, + features, + targets, + Version ); } else { - result = InstanceBase->ReadIB( is, PermFeatures, - *Targets, + result = InstanceBase->ReadIB( is, + features, + targets, Version ); } } @@ -2564,7 +2548,7 @@ namespace Timbl { else { initExperiment(); for ( size_t i=0; i< NumOfFeatures(); ++i ){ - res.push_back( Features[i]->Weight() ); + res.push_back( features[i]->Weight() ); } } return true; @@ -2698,5 +2682,4 @@ namespace Timbl { return result; } - } diff --git a/src/neighborSet.cxx b/src/neighborSet.cxx index 8ce6d59..b9e562b 100644 --- a/src/neighborSet.cxx +++ b/src/neighborSet.cxx @@ -25,12 +25,9 @@ lamasoftware (at ) science.ru.nl */ -#include #include -#include #include "timbl/Common.h" -#include "timbl/MsgClass.h" #include "timbl/Types.h" #include "timbl/Instance.h" #include "timbl/neighborSet.h" @@ -89,7 +86,7 @@ namespace Timbl { } } - void neighborSet::push_back( double d, const ValueDistribution &dist ){ + void neighborSet::push_back( double d, const ClassDistribution &dist ){ distances.push_back( d ); distributions.push_back( dist.to_VD_Copy() ); } @@ -167,7 +164,7 @@ namespace Timbl { } break; default: - throw "wrong value in switch"; + throw std::logic_error( "wrong value in switch" ); } return result; } @@ -179,20 +176,20 @@ namespace Timbl { return distances[n]; } - const ValueDistribution *neighborSet::getDistribution( size_t n ) const { + const ClassDistribution *neighborSet::getDistribution( size_t n ) const { if ( size() <= n ){ throw std::range_error( "getDistribution() parameter exceeds size of neighborSet" ); } return distributions[n]; } - WValueDistribution *neighborSet::bestDistribution( const decayStruct *d, + WClassDistribution *neighborSet::bestDistribution( const decayStruct *d, size_t max ) const { - // Analyse the set to find THE best ValueDistribution. + // Analyse the set to find THE best ClassDistribution. // For each neighbor, we loop over the number of bests in that // bin, and merge that distribution into the result // - WValueDistribution *result = new WValueDistribution(); + WClassDistribution *result = new WClassDistribution(); size_t stop = distributions.size(); stop = ( max > 0 && max < stop ? max : stop ); for ( size_t k = 0; k < stop; ++k ) {
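At the public API level the change is a mechanical rename: callers receive a ClassDistribution (or WClassDistribution) wherever they used to receive a ValueDistribution, as the TimblAPI.cxx hunks above show. A minimal sketch of the updated Classify() call follows; the options string, experiment name, training file and test instance are placeholders, not taken from this patch.

    // Sketch only, not part of the patch. Exercises the renamed
    // TimblAPI::Classify( string, const ClassDistribution *&, double& ) overload.
    #include <iostream>
    #include <string>
    #include "timbl/TimblAPI.h"

    int main() {
      Timbl::TimblAPI exp( "-a IB1 -k 3", "rename-demo" );   // placeholder options and name
      exp.Learn( "example.train" );                          // placeholder training file
      const Timbl::ClassDistribution *dist = 0;              // was: const ValueDistribution *
      double distance = 0.0;
      const Timbl::TargetValue *best =
        exp.Classify( std::string( "f1,f2,f3,?" ), dist, distance );
      if ( best ) {
        std::cout << "class=" << best
                  << " distance=" << distance
                  << " distribution=" << dist << std::endl;
      }
      return 0;
    }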