OpenWalnut
1.4.0
|
00001 //--------------------------------------------------------------------------- 00002 // 00003 // Project: OpenWalnut ( http://www.openwalnut.org ) 00004 // 00005 // Copyright 2009 OpenWalnut Community, BSV@Uni-Leipzig and CNCF@MPI-CBS 00006 // For more information see http://www.openwalnut.org/copying 00007 // 00008 // This file is part of OpenWalnut. 00009 // 00010 // OpenWalnut is free software: you can redistribute it and/or modify 00011 // it under the terms of the GNU Lesser General Public License as published by 00012 // the Free Software Foundation, either version 3 of the License, or 00013 // (at your option) any later version. 00014 // 00015 // OpenWalnut is distributed in the hope that it will be useful, 00016 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00018 // GNU Lesser General Public License for more details. 00019 // 00020 // You should have received a copy of the GNU Lesser General Public License 00021 // along with OpenWalnut. If not, see <http://www.gnu.org/licenses/>. 00022 // 00023 //--------------------------------------------------------------------------- 00024 00025 #ifndef WVALUESETHISTOGRAM_H 00026 #define WVALUESETHISTOGRAM_H 00027 00028 #include <map> 00029 #include <list> 00030 #include <utility> 00031 00032 #include <boost/shared_ptr.hpp> 00033 #include <boost/scoped_array.hpp> 00034 #include <boost/shared_array.hpp> 00035 00036 #include "../../common/WHistogram.h" 00037 #include "../WValueSet.h" 00038 00039 00040 /** 00041 * Used to find the occurrence frequencies of values in a value set. It implements a classical histogram but allows easy modification of bucket 00042 * sizes without unnecessary recalculation of the whole histogram. This histogram uses right-open intervals for counting, which is why there 00043 * always is a bucket at the end from max to infinity which holds all the max values. 00044 * 00045 * \note This histogram is different from from WValueSetHistogram which is a generic histogram class. 00046 */ 00047 class WValueSetHistogram: public WHistogram // NOLINT 00048 { 00049 friend class WValueSetHistogramTest; 00050 public: 00051 /** 00052 * Constructor. Creates the histogram for the specified value set. 00053 * 00054 * \param valueSet source of the data for the histogram 00055 * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1. 00056 */ 00057 explicit WValueSetHistogram( boost::shared_ptr< WValueSetBase > valueSet, size_t buckets = 1000 ); 00058 00059 /** 00060 * Constructor. Creates the histogram for the specified value set. 00061 * 00062 * \param valueSet source of the data for the histogram 00063 * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1. 00064 */ 00065 explicit WValueSetHistogram( const WValueSetBase& valueSet, size_t buckets = 1000 ); 00066 00067 /** 00068 * Constructor. Creates a histogram from the specified value set but allows cropping of values below the given min and above the given max. 00069 * It actually interprets all values below min and above max to be exactly min and exactly max and sorts them into the appropriate bin. This 00070 * is especially useful to filter out outliers in data. 00071 * 00072 * \param valueSet source data 00073 * \param min the new minimum to use 00074 * \param max the maximum to use 00075 * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1. 00076 */ 00077 WValueSetHistogram( boost::shared_ptr< WValueSetBase > valueSet, double min, double max, size_t buckets = 1000 ); 00078 00079 /** 00080 * Constructor. Creates a histogram from the specified value set but allows cropping of values below the given min and above the given max. 00081 * It actually interprets all values below min and above max to be exactly min and exactly max and sorts them into the appropriate bin. This 00082 * is especially useful to filter out outliers in data. 00083 * 00084 * \param valueSet source data 00085 * \param min the new minimum to use 00086 * \param max the maximum to use 00087 * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1. 00088 */ 00089 WValueSetHistogram( const WValueSetBase& valueSet, double min, double max, size_t buckets = 1000 ); 00090 00091 /** 00092 * Copy constructor. If another interval size is given the histogram gets matched to it using the initial bucket data. 00093 * \note this does not deep copy the m_initialBuckets and m_mappedBuckets array as these are shared_array instances. 00094 * 00095 * \param histogram another WValueSetHistogram 00096 * \param buckets the new number of buckets. Must be larger than 1 if specified. 00097 */ 00098 WValueSetHistogram( const WValueSetHistogram& histogram, size_t buckets = 0 ); 00099 00100 /** 00101 * Destructor. 00102 */ 00103 ~WValueSetHistogram(); 00104 00105 /** 00106 * Copy assignment. Copies the contents of the specified histogram to this instance. 00107 * 00108 * \param other the other instance 00109 * 00110 * \return this instance with the contents of the other one. 00111 * \note this does not deep copy the m_initialBuckets and m_mappedBuckets array as these are shared_array instances. 00112 */ 00113 WValueSetHistogram& operator=( const WValueSetHistogram& other ); 00114 00115 /** 00116 * Get the count of the bucket. 00117 * 00118 * \param index which buckets count is to be returned; starts with 0 which is the bucket containing the smallest values. 00119 * 00120 * \return elements in the bucket. 00121 */ 00122 virtual size_t operator[]( size_t index ) const; 00123 00124 /** 00125 * Get the count of the bucket. Testing if the position is valid. 00126 * 00127 * \param index which buckets count is to be returned; starts with 0 which is the bucket containing the smallest values. 00128 * 00129 * \return elements in the bucket 00130 */ 00131 virtual size_t at( size_t index ) const; 00132 00133 /** 00134 * Returns the number of buckets in the histogram with the actual mapping. 00135 * 00136 * \return number of buckets 00137 */ 00138 virtual size_t size() const; 00139 00140 /** 00141 * Return the size of one bucket. 00142 * 00143 * \param index the width for this bucket is queried. 00144 * 00145 * \return the size of a bucket. 00146 */ 00147 virtual double getBucketSize( size_t index = 0 ) const; 00148 00149 /** 00150 * Returns the actual interval associated with the given index. The interval is open, meaning that 00151 * getIntervalForIndex( i ).second == getIntervalForIndex( i + 1 ).first but does not belong anymore to the interval itself but every value 00152 * smaller than getIntervalForIndex( i ).second. 00153 * 00154 * \param index the intex 00155 * 00156 * \return the open interval. 00157 */ 00158 virtual std::pair< double, double > getIntervalForIndex( size_t index ) const; 00159 00160 /** 00161 * Returns the right index to the bucket containing the given value. If a value larger than the maximum, the maximum index is returned. Same 00162 * for minimum; if the value is smaller than the minimum, 0 is returned. 00163 * 00164 * \param value the value to search the index for 00165 * 00166 * \return the index of the bucket 00167 */ 00168 virtual size_t getIndexForValue( double value ) const; 00169 00170 /** 00171 * This returns the number of value set entries added to the histogram. This is especially useful to normalize the histogram counts. 00172 * 00173 * \return the number of elements distributed in the buckets. 00174 */ 00175 virtual size_t getTotalElementCount() const; 00176 00177 /** 00178 * Sums up the buckets in the specified interval. Especially useful for cumulative distribution functions or similar. 00179 * 00180 * \param startIndex the index where to start counting including this one 00181 * \param endIndex the index where to end summing up excluding this one. 00182 * 00183 * \return the sum of all buckets in the interval. 00184 * \throw WOutOfBounds if one of the indices is invalid. 00185 */ 00186 virtual size_t accumulate( size_t startIndex, size_t endIndex ) const; 00187 00188 protected: 00189 /** 00190 * Return the initial buckets. 00191 * 00192 * \return m_initialBuckets 00193 */ 00194 boost::shared_array< size_t > getInitialBuckets() const; 00195 00196 /** 00197 * Return the number of initial buckets. 00198 * 00199 * \return m_nInitialBuckets 00200 */ 00201 size_t getNInitialBuckets() const; 00202 00203 /** 00204 * Return the size of one initial bucket. 00205 * 00206 * \return m_bucketSize 00207 */ 00208 double getInitialBucketSize() const; 00209 00210 /** 00211 * increment the value by one, contains the logic to find the element place in the array. 00212 * Should only be used in the constructor i.e. while iterating over WValueSet. 00213 * 00214 * \param value value to increment 00215 */ 00216 virtual void insert( double value ); 00217 00218 private: 00219 /** 00220 * Size of one bucket in the initial histogram. 00221 */ 00222 double m_initialBucketSize; 00223 00224 /** 00225 * Pointer to all initial buckets of the histogram. 00226 */ 00227 boost::shared_array< size_t > m_initialBuckets; 00228 00229 /** 00230 * Number of buckets in the initial histogram. 00231 */ 00232 size_t m_nInitialBuckets; 00233 00234 /** 00235 * Pointer to all initial buckets of the histogram. 00236 */ 00237 boost::shared_array< size_t > m_mappedBuckets; 00238 00239 /** 00240 * Tracks the number of a buckets in the mapped histogram. 00241 */ 00242 size_t m_nMappedBuckets; 00243 00244 /** 00245 * Size of one bucket in the mapped histogram. 00246 */ 00247 double m_mappedBucketSize; 00248 00249 /** 00250 * The number of elements distributed in the buckets. 00251 */ 00252 size_t m_nbTotalElements; 00253 00254 /** 00255 * Actually builds the histogram. This function is simply used for avoiding code duplication in all these constructors. 00256 * 00257 * \param valueSet the value set. 00258 */ 00259 void buildHistogram( const WValueSetBase& valueSet ); 00260 }; 00261 00262 /** 00263 * Write a histogram in string representation to the given output stream. 00264 */ 00265 std::ostream& operator<<( std::ostream& out, const WValueSetHistogram& h ); 00266 00267 inline size_t WValueSetHistogram::getIndexForValue( double value ) const 00268 { 00269 // the position on the scala 00270 double pos = ( value - m_minimum ) / static_cast< double >( m_mappedBucketSize ); 00271 // the index is the floor( position ) 00272 size_t idx = static_cast< size_t >( std::floor( pos ) ); 00273 00274 // is the index larger than the size? 00275 bool inU = ( idx < m_nMappedBuckets ); 00276 // is the index smaller than the size? 00277 bool inL = ( pos >= 0.0 ); 00278 00279 // the trick done here is to clamp value into [m_minimum,m_maximum] without using if statements. The C++ Standard says that booleans are 00280 // always 1 if true. 00281 // NOTE: this is integral arithmetic 00282 return ( inL && inU ) * idx + ( !inU && inL ) * ( m_nMappedBuckets - 1 ); 00283 } 00284 00285 #endif // WVALUESETHISTOGRAM_H 00286