00001 //--------------------------------------------------------------------------- 00002 // 00003 // Project: OpenWalnut ( http://www.openwalnut.org ) 00004 // 00005 // Copyright 2009 OpenWalnut Community, BSV@Uni-Leipzig and CNCF@MPI-CBS 00006 // For more information see http://www.openwalnut.org/copying 00007 // 00008 // This file is part of OpenWalnut. 00009 // 00010 // OpenWalnut is free software: you can redistribute it and/or modify 00011 // it under the terms of the GNU Lesser General Public License as published by 00012 // the Free Software Foundation, either version 3 of the License, or 00013 // (at your option) any later version. 00014 // 00015 // OpenWalnut is distributed in the hope that it will be useful, 00016 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00018 // GNU Lesser General Public License for more details. 00019 // 00020 // You should have received a copy of the GNU Lesser General Public License 00021 // along with OpenWalnut. If not, see <http://www.gnu.org/licenses/>. 00022 // 00023 //--------------------------------------------------------------------------- 00024 00025 #ifndef WVALUESETHISTOGRAM_H 00026 #define WVALUESETHISTOGRAM_H 00027 00028 #include <map> 00029 #include <list> 00030 #include <utility> 00031 00032 #include <boost/shared_ptr.hpp> 00033 #include <boost/scoped_array.hpp> 00034 #include <boost/shared_array.hpp> 00035 00036 #include "../../common/WHistogram.h" 00037 #include "../WValueSet.h" 00038 #include "../WExportDataHandler.h" 00039 00040 /** 00041 * Used to find the occurrence frequencies of values in a value set. It implements a classical histogram but allows easy modification of bucket 00042 * sizes without unnecessary recalculation of the whole histogram. This histogram uses right-open intervals for counting, which is why there 00043 * always is a bucket at the end from max to infinity which holds all the max values. 00044 * 00045 * \note This histogram is different from from WValueSetHistogram which is a generic histogram class. 00046 */ 00047 class OWDATAHANDLER_EXPORT WValueSetHistogram: public WHistogram // NOLINT 00048 { 00049 friend class WValueSetHistogramTest; 00050 public: 00051 /** 00052 * Constructor. Creates the histogram for the specified value set. 00053 * 00054 * \param valueSet source of the data for the histogram 00055 * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1. 00056 */ 00057 explicit WValueSetHistogram( boost::shared_ptr< WValueSetBase > valueSet, size_t buckets = 1000 ); 00058 00059 /** 00060 * Constructor. Creates the histogram for the specified value set. 00061 * 00062 * \param valueSet source of the data for the histogram 00063 * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1. 00064 */ 00065 explicit WValueSetHistogram( const WValueSetBase& valueSet, size_t buckets = 1000 ); 00066 00067 /** 00068 * Constructor. Creates a histogram from the specified value set but allows cropping of values below the given min and above the given max. 00069 * It actually interprets all values below min and above max to be exactly min and exactly max and sorts them into the appropriate bin. This 00070 * is especially useful to filter out outliers in data. 00071 * 00072 * \param valueSet source data 00073 * \param min the new minimum to use 00074 * \param max the maximum to use 00075 * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1. 00076 */ 00077 WValueSetHistogram( boost::shared_ptr< WValueSetBase > valueSet, double min, double max, size_t buckets = 1000 ); 00078 00079 /** 00080 * Constructor. Creates a histogram from the specified value set but allows cropping of values below the given min and above the given max. 00081 * It actually interprets all values below min and above max to be exactly min and exactly max and sorts them into the appropriate bin. This 00082 * is especially useful to filter out outliers in data. 00083 * 00084 * \param valueSet source data 00085 * \param min the new minimum to use 00086 * \param max the maximum to use 00087 * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1. 00088 */ 00089 WValueSetHistogram( const WValueSetBase& valueSet, double min, double max, size_t buckets = 1000 ); 00090 00091 /** 00092 * Copy constructor. If another interval size is given the histogram gets matched to it using the initial bucket data. 00093 * \note this does not deep copy the m_initialBuckets and m_mappedBuckets array as these are shared_array instances. 00094 * 00095 * \param histogram another WValueSetHistogram 00096 * \param buckets the new number of buckets. Must be larger than 1 if specified. 00097 */ 00098 WValueSetHistogram( const WValueSetHistogram& histogram, size_t buckets = 0 ); 00099 00100 /** 00101 * Destructor. 00102 */ 00103 ~WValueSetHistogram(); 00104 00105 /** 00106 * Copy assignment. Copies the contents of the specified histogram to this instance. 00107 * 00108 * \param other the other instance 00109 * 00110 * \return this instance with the contents of the other one. 00111 * \note this does not deep copy the m_initialBuckets and m_mappedBuckets array as these are shared_array instances. 00112 */ 00113 WValueSetHistogram& operator=( const WValueSetHistogram& other ); 00114 00115 /** 00116 * Get the count of the bucket. 00117 * 00118 * \param index which buckets count is to be returned; starts with 0 which is the bucket containing the smallest values. 00119 * 00120 * \return elements in the bucket. 00121 */ 00122 virtual size_t operator[]( size_t index ) const; 00123 00124 /** 00125 * Get the count of the bucket. Testing if the position is valid. 00126 * 00127 * \param index which buckets count is to be returned; starts with 0 which is the bucket containing the smallest values. 00128 * 00129 * \return elements in the bucket 00130 */ 00131 virtual size_t at( size_t index ) const; 00132 00133 /** 00134 * Returns the number of buckets in the histogram with the actual mapping. 00135 * 00136 * \return number of buckets 00137 */ 00138 virtual size_t size() const; 00139 00140 /** 00141 * Return the size of one bucket. 00142 * 00143 * \param index the width for this bucket is queried. 00144 * 00145 * \return the size of a bucket. 00146 */ 00147 virtual double getBucketSize( size_t index = 0 ) const; 00148 00149 /** 00150 * Returns the actual interval associated with the given index. The interval is open, meaning that 00151 * getIntervalForIndex( i ).second == getIntervalForIndex( i + 1 ).first but does not belong anymore to the interval itself but every value 00152 * smaller than getIntervalForIndex( i ).second. 00153 * 00154 * \param index the intex 00155 * 00156 * \return the open interval. 00157 */ 00158 virtual std::pair< double, double > getIntervalForIndex( size_t index ) const; 00159 00160 /** 00161 * Returns the right index to the bucket containing the given value. If a value larger than the maximum, the maximum index is returned. Same 00162 * for minimum; if the value is smaller than the minimum, 0 is returned. 00163 * 00164 * \param value the value to search the index for 00165 * 00166 * \return the index of the bucket 00167 */ 00168 virtual size_t getIndexForValue( double value ) const; 00169 00170 /** 00171 * This returns the number of value set entries added to the histogram. This is especially useful to normalize the histogram counts. 00172 * 00173 * \return the number of elements distributed in the buckets. 00174 */ 00175 virtual size_t getTotalElementCount() const; 00176 00177 /** 00178 * Sums up the buckets in the specified interval. Especially useful for cumulative distribution functions or similar. 00179 * 00180 * \param startIndex the index where to start counting including this one 00181 * \param endIndex the index where to end summing up excluding this one. 00182 * 00183 * \return the sum of all buckets in the interval. 00184 * \throw WOutOfBounds if one of the indices is invalid. 00185 */ 00186 virtual size_t accumulate( size_t startIndex, size_t endIndex ) const; 00187 00188 protected: 00189 /** 00190 * Return the initial buckets. 00191 * 00192 * \return m_initialBuckets 00193 */ 00194 boost::shared_array< size_t > getInitialBuckets() const; 00195 00196 /** 00197 * Return the number of initial buckets. 00198 * 00199 * \return m_nInitialBuckets 00200 */ 00201 size_t getNInitialBuckets() const; 00202 00203 /** 00204 * Return the size of one initial bucket. 00205 * 00206 * \return m_bucketSize 00207 */ 00208 double getInitialBucketSize() const; 00209 00210 /** 00211 * increment the value by one, contains the logic to find the element place in the array. 00212 * Should only be used in the constructor i.e. while iterating over WValueSet. 00213 * 00214 * \param value value to increment 00215 */ 00216 virtual void insert( double value ); 00217 00218 private: 00219 00220 /** 00221 * Size of one bucket in the initial histogram. 00222 */ 00223 double m_initialBucketSize; 00224 00225 /** 00226 * Pointer to all initial buckets of the histogram. 00227 */ 00228 boost::shared_array< size_t > m_initialBuckets; 00229 00230 /** 00231 * Number of buckets in the initial histogram. 00232 */ 00233 size_t m_nInitialBuckets; 00234 00235 /** 00236 * Pointer to all initial buckets of the histogram. 00237 */ 00238 boost::shared_array< size_t > m_mappedBuckets; 00239 00240 /** 00241 * Tracks the number of a buckets in the mapped histogram. 00242 */ 00243 size_t m_nMappedBuckets; 00244 00245 /** 00246 * Size of one bucket in the mapped histogram. 00247 */ 00248 double m_mappedBucketSize; 00249 00250 /** 00251 * The number of elements distributed in the buckets. 00252 */ 00253 size_t m_nbTotalElements; 00254 00255 /** 00256 * Actually builds the histogram. This function is simply used for avoiding code duplication in all these constructors. 00257 * 00258 * \param valueSet the value set. 00259 */ 00260 void buildHistogram( const WValueSetBase& valueSet ); 00261 }; 00262 00263 /** 00264 * Write a histogram in string representation to the given output stream. 00265 */ 00266 std::ostream& operator<<( std::ostream& out, const WValueSetHistogram& h ); 00267 00268 inline size_t WValueSetHistogram::getIndexForValue( double value ) const 00269 { 00270 // the position on the scala 00271 double pos = ( value - m_minimum ) / static_cast< double >( m_mappedBucketSize ); 00272 // the index is the floor( position ) 00273 size_t idx = static_cast< size_t >( std::floor( pos ) ); 00274 00275 // is the index larger than the size? 00276 bool inU = ( idx < m_nMappedBuckets ); 00277 // is the index smaller than the size? 00278 bool inL = ( pos >= 0.0 ); 00279 00280 // the trick done here is to clamp value into [m_minimum,m_maximum] without using if statements. The C++ Standard says that booleans are 00281 // always 1 if true. 00282 // NOTE: this is integral arithmetic 00283 return ( inL && inU ) * idx + ( !inU && inL ) * ( m_nMappedBuckets - 1 ); 00284 } 00285 00286 #endif // WVALUESETHISTOGRAM_H 00287