OpenWalnut  1.4.0
WValueSetHistogram.h
00001 //---------------------------------------------------------------------------
00002 //
00003 // Project: OpenWalnut ( http://www.openwalnut.org )
00004 //
00005 // Copyright 2009 OpenWalnut Community, BSV@Uni-Leipzig and CNCF@MPI-CBS
00006 // For more information see http://www.openwalnut.org/copying
00007 //
00008 // This file is part of OpenWalnut.
00009 //
00010 // OpenWalnut is free software: you can redistribute it and/or modify
00011 // it under the terms of the GNU Lesser General Public License as published by
00012 // the Free Software Foundation, either version 3 of the License, or
00013 // (at your option) any later version.
00014 //
00015 // OpenWalnut is distributed in the hope that it will be useful,
00016 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00018 // GNU Lesser General Public License for more details.
00019 //
00020 // You should have received a copy of the GNU Lesser General Public License
00021 // along with OpenWalnut. If not, see <http://www.gnu.org/licenses/>.
00022 //
00023 //---------------------------------------------------------------------------
00024 
00025 #ifndef WVALUESETHISTOGRAM_H
00026 #define WVALUESETHISTOGRAM_H
00027 
00028 #include <map>
00029 #include <list>
00030 #include <utility>
00031 
00032 #include <boost/shared_ptr.hpp>
00033 #include <boost/scoped_array.hpp>
00034 #include <boost/shared_array.hpp>
00035 
00036 #include "../../common/WHistogram.h"
00037 #include "../WValueSet.h"
00038 
00039 
00040 /**
00041  * Used to find the occurrence frequencies of values in a value set. It implements a classical histogram but allows easy modification of bucket
00042  * sizes without unnecessary recalculation of the whole histogram. This histogram uses right-open intervals for counting, which is why there
00043  * always is a bucket at the end from max to infinity which holds all the max values.
00044  *
00045  * \note This histogram is different from from WValueSetHistogram which is a generic histogram class.
00046  */
00047 class WValueSetHistogram: public WHistogram // NOLINT
00048 {
00049 friend class WValueSetHistogramTest;
00050 public:
00051     /**
00052      * Constructor. Creates the histogram for the specified value set.
00053      *
00054      * \param valueSet source of the data for the histogram
00055      * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
00056      */
00057     explicit WValueSetHistogram( boost::shared_ptr< WValueSetBase > valueSet, size_t buckets = 1000 );
00058 
00059     /**
00060      * Constructor. Creates the histogram for the specified value set.
00061      *
00062      * \param valueSet source of the data for the histogram
00063      * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
00064      */
00065     explicit WValueSetHistogram( const WValueSetBase& valueSet, size_t buckets = 1000 );
00066 
00067     /**
00068      * Constructor. Creates a histogram from the specified value set but allows cropping of values below the given min and above the given max.
00069      * It actually interprets all values below min and above max to be exactly min and exactly max and sorts them into the appropriate bin. This
00070      * is especially useful to filter out outliers in data.
00071      *
00072      * \param valueSet source data
00073      * \param min the new minimum to use
00074      * \param max the maximum to use
00075      * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
00076      */
00077      WValueSetHistogram( boost::shared_ptr< WValueSetBase > valueSet, double min, double max, size_t buckets = 1000 );
00078 
00079     /**
00080      * Constructor. Creates a histogram from the specified value set but allows cropping of values below the given min and above the given max.
00081      * It actually interprets all values below min and above max to be exactly min and exactly max and sorts them into the appropriate bin. This
00082      * is especially useful to filter out outliers in data.
00083      *
00084      * \param valueSet source data
00085      * \param min the new minimum to use
00086      * \param max the maximum to use
00087      * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
00088      */
00089     WValueSetHistogram( const WValueSetBase& valueSet, double min, double max, size_t buckets = 1000 );
00090 
00091     /**
00092      * Copy constructor. If another interval size is given the histogram gets matched to it using the initial bucket data.
00093      * \note this does not deep copy the m_initialBuckets and m_mappedBuckets array as these are shared_array instances.
00094      *
00095      * \param histogram another WValueSetHistogram
00096      * \param buckets the new number of buckets. Must be larger than 1 if specified.
00097      */
00098     WValueSetHistogram( const WValueSetHistogram& histogram, size_t buckets = 0 );
00099 
00100     /**
00101      * Destructor.
00102      */
00103     ~WValueSetHistogram();
00104 
00105     /**
00106      * Copy assignment. Copies the contents of the specified histogram to this instance.
00107      *
00108      * \param other the other instance
00109      *
00110      * \return this instance with the contents of the other one.
00111      * \note this does not deep copy the m_initialBuckets and m_mappedBuckets array as these are shared_array instances.
00112      */
00113     WValueSetHistogram& operator=( const WValueSetHistogram& other );
00114 
00115     /**
00116      * Get the count of the bucket.
00117      *
00118      * \param index which buckets count is to be returned; starts with 0 which is the bucket containing the smallest values.
00119      *
00120      * \return elements in the bucket.
00121      */
00122     virtual size_t operator[]( size_t index ) const;
00123 
00124     /**
00125      * Get the count of the bucket. Testing if the position is valid.
00126      *
00127      * \param index which buckets count is to be returned; starts with 0 which is the bucket containing the smallest values.
00128      *
00129      * \return elements in the bucket
00130      */
00131     virtual size_t at( size_t index ) const;
00132 
00133     /**
00134      * Returns the number of buckets in the histogram with the actual mapping.
00135      *
00136      * \return number of buckets
00137      */
00138     virtual size_t size() const;
00139 
00140     /**
00141      * Return the size of one bucket.
00142      *
00143      * \param index the width for this bucket is queried.
00144      *
00145      * \return the size of a bucket.
00146      */
00147     virtual double getBucketSize( size_t index = 0 ) const;
00148 
00149     /**
00150      * Returns the actual interval associated with the given index. The interval is open, meaning that
00151      * getIntervalForIndex( i ).second == getIntervalForIndex( i + 1 ).first but does not belong anymore to the interval itself but every value
00152      * smaller than getIntervalForIndex( i ).second.
00153      *
00154      * \param index the intex
00155      *
00156      * \return the open interval.
00157      */
00158     virtual std::pair< double, double > getIntervalForIndex( size_t index ) const;
00159 
00160     /**
00161      * Returns the right index to the bucket containing the given value. If a value larger than the maximum, the maximum index is returned. Same
00162      * for minimum; if the value is smaller than the minimum, 0 is returned.
00163      *
00164      * \param value the value to search the index for
00165      *
00166      * \return the index of the bucket
00167      */
00168     virtual size_t getIndexForValue( double value ) const;
00169 
00170     /**
00171      * This returns the number of value set entries added to the histogram. This is especially useful to normalize the histogram counts.
00172      *
00173      * \return the number of elements distributed in the buckets.
00174      */
00175     virtual size_t getTotalElementCount() const;
00176 
00177     /**
00178      * Sums up the buckets in the specified interval. Especially useful for cumulative distribution functions or similar.
00179      *
00180      * \param startIndex the index where to start counting including this one
00181      * \param endIndex the index where to end summing up excluding this one.
00182      *
00183      * \return the sum of all buckets in the interval.
00184      * \throw WOutOfBounds if one of the indices is invalid.
00185      */
00186     virtual size_t accumulate( size_t startIndex, size_t endIndex ) const;
00187 
00188 protected:
00189     /**
00190      * Return the initial buckets.
00191      *
00192      * \return m_initialBuckets
00193      */
00194     boost::shared_array< size_t > getInitialBuckets() const;
00195 
00196     /**
00197      * Return the number of initial buckets.
00198      *
00199      * \return m_nInitialBuckets
00200      */
00201     size_t getNInitialBuckets() const;
00202 
00203     /**
00204      * Return the size of one initial bucket.
00205      *
00206      * \return m_bucketSize
00207      */
00208     double getInitialBucketSize() const;
00209 
00210     /**
00211      * increment the value by one, contains the logic to find the element place in the array.
00212      * Should only be used in the constructor i.e. while iterating over WValueSet.
00213      *
00214      * \param value value to increment
00215      */
00216     virtual void insert( double value );
00217 
00218 private:
00219     /**
00220      * Size of one bucket in the initial histogram.
00221      */
00222     double m_initialBucketSize;
00223 
00224     /**
00225      * Pointer to all initial buckets of the histogram.
00226      */
00227     boost::shared_array< size_t > m_initialBuckets;
00228 
00229     /**
00230      * Number of buckets in the initial histogram.
00231      */
00232     size_t m_nInitialBuckets;
00233 
00234     /**
00235      * Pointer to all initial buckets of the histogram.
00236      */
00237     boost::shared_array< size_t > m_mappedBuckets;
00238 
00239     /**
00240      * Tracks the number of a buckets in the mapped histogram.
00241      */
00242     size_t m_nMappedBuckets;
00243 
00244     /**
00245      * Size of one bucket in the mapped histogram.
00246      */
00247     double m_mappedBucketSize;
00248 
00249     /**
00250      * The number of elements distributed in the buckets.
00251      */
00252     size_t m_nbTotalElements;
00253 
00254     /**
00255      * Actually builds the histogram. This function is simply used for avoiding code duplication in all these constructors.
00256      *
00257      * \param valueSet the value set.
00258      */
00259     void buildHistogram( const WValueSetBase& valueSet );
00260 };
00261 
00262 /**
00263  * Write a histogram in string representation to the given output stream.
00264  */
00265 std::ostream& operator<<( std::ostream& out, const WValueSetHistogram& h );
00266 
00267 inline size_t WValueSetHistogram::getIndexForValue( double value ) const
00268 {
00269     // the position on the scala
00270     double pos = ( value - m_minimum ) / static_cast< double >( m_mappedBucketSize );
00271     // the index is the floor( position )
00272     size_t idx = static_cast< size_t >( std::floor( pos ) );
00273 
00274     // is the index larger than the size?
00275     bool inU = ( idx < m_nMappedBuckets );
00276     // is the index smaller than the size?
00277     bool inL = ( pos >= 0.0 );
00278 
00279     // the trick done here is to clamp value into [m_minimum,m_maximum] without using if statements. The C++ Standard says that booleans are
00280     // always 1 if true.
00281     // NOTE: this is integral arithmetic
00282     return ( inL && inU ) * idx + ( !inU && inL ) * ( m_nMappedBuckets - 1 );
00283 }
00284 
00285 #endif  // WVALUESETHISTOGRAM_H
00286