OpenWalnut  1.4.0
WValueSetHistogram.h
1 //---------------------------------------------------------------------------
2 //
3 // Project: OpenWalnut ( http://www.openwalnut.org )
4 //
5 // Copyright 2009 OpenWalnut Community, BSV@Uni-Leipzig and CNCF@MPI-CBS
6 // For more information see http://www.openwalnut.org/copying
7 //
8 // This file is part of OpenWalnut.
9 //
10 // OpenWalnut is free software: you can redistribute it and/or modify
11 // it under the terms of the GNU Lesser General Public License as published by
12 // the Free Software Foundation, either version 3 of the License, or
13 // (at your option) any later version.
14 //
15 // OpenWalnut is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 // GNU Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public License
21 // along with OpenWalnut. If not, see <http://www.gnu.org/licenses/>.
22 //
23 //---------------------------------------------------------------------------
24 
25 #ifndef WVALUESETHISTOGRAM_H
26 #define WVALUESETHISTOGRAM_H
27 
28 #include <map>
29 #include <list>
30 #include <utility>
31 
32 #include <boost/shared_ptr.hpp>
33 #include <boost/scoped_array.hpp>
34 #include <boost/shared_array.hpp>
35 
36 #include "../../common/WHistogram.h"
37 #include "../WValueSet.h"
38 
39 
40 /**
41  * Used to find the occurrence frequencies of values in a value set. It implements a classical histogram but allows easy modification of bucket
42  * sizes without unnecessary recalculation of the whole histogram. This histogram uses right-open intervals for counting, which is why there
43  * always is a bucket at the end from max to infinity which holds all the max values.
44  *
45  * \note This histogram is different from from WValueSetHistogram which is a generic histogram class.
46  */
47 class WValueSetHistogram: public WHistogram // NOLINT
48 {
49 friend class WValueSetHistogramTest;
50 public:
51  /**
52  * Constructor. Creates the histogram for the specified value set.
53  *
54  * \param valueSet source of the data for the histogram
55  * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
56  */
57  explicit WValueSetHistogram( boost::shared_ptr< WValueSetBase > valueSet, size_t buckets = 1000 );
58 
59  /**
60  * Constructor. Creates the histogram for the specified value set.
61  *
62  * \param valueSet source of the data for the histogram
63  * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
64  */
65  explicit WValueSetHistogram( const WValueSetBase& valueSet, size_t buckets = 1000 );
66 
67  /**
68  * Constructor. Creates a histogram from the specified value set but allows cropping of values below the given min and above the given max.
69  * It actually interprets all values below min and above max to be exactly min and exactly max and sorts them into the appropriate bin. This
70  * is especially useful to filter out outliers in data.
71  *
72  * \param valueSet source data
73  * \param min the new minimum to use
74  * \param max the maximum to use
75  * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
76  */
77  WValueSetHistogram( boost::shared_ptr< WValueSetBase > valueSet, double min, double max, size_t buckets = 1000 );
78 
79  /**
80  * Constructor. Creates a histogram from the specified value set but allows cropping of values below the given min and above the given max.
81  * It actually interprets all values below min and above max to be exactly min and exactly max and sorts them into the appropriate bin. This
82  * is especially useful to filter out outliers in data.
83  *
84  * \param valueSet source data
85  * \param min the new minimum to use
86  * \param max the maximum to use
87  * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
88  */
89  WValueSetHistogram( const WValueSetBase& valueSet, double min, double max, size_t buckets = 1000 );
90 
91  /**
92  * Copy constructor. If another interval size is given the histogram gets matched to it using the initial bucket data.
93  * \note this does not deep copy the m_initialBuckets and m_mappedBuckets array as these are shared_array instances.
94  *
95  * \param histogram another WValueSetHistogram
96  * \param buckets the new number of buckets. Must be larger than 1 if specified.
97  */
98  WValueSetHistogram( const WValueSetHistogram& histogram, size_t buckets = 0 );
99 
100  /**
101  * Destructor.
102  */
104 
105  /**
106  * Copy assignment. Copies the contents of the specified histogram to this instance.
107  *
108  * \param other the other instance
109  *
110  * \return this instance with the contents of the other one.
111  * \note this does not deep copy the m_initialBuckets and m_mappedBuckets array as these are shared_array instances.
112  */
114 
115  /**
116  * Get the count of the bucket.
117  *
118  * \param index which buckets count is to be returned; starts with 0 which is the bucket containing the smallest values.
119  *
120  * \return elements in the bucket.
121  */
122  virtual size_t operator[]( size_t index ) const;
123 
124  /**
125  * Get the count of the bucket. Testing if the position is valid.
126  *
127  * \param index which buckets count is to be returned; starts with 0 which is the bucket containing the smallest values.
128  *
129  * \return elements in the bucket
130  */
131  virtual size_t at( size_t index ) const;
132 
133  /**
134  * Returns the number of buckets in the histogram with the actual mapping.
135  *
136  * \return number of buckets
137  */
138  virtual size_t size() const;
139 
140  /**
141  * Return the size of one bucket.
142  *
143  * \param index the width for this bucket is queried.
144  *
145  * \return the size of a bucket.
146  */
147  virtual double getBucketSize( size_t index = 0 ) const;
148 
149  /**
150  * Returns the actual interval associated with the given index. The interval is open, meaning that
151  * getIntervalForIndex( i ).second == getIntervalForIndex( i + 1 ).first but does not belong anymore to the interval itself but every value
152  * smaller than getIntervalForIndex( i ).second.
153  *
154  * \param index the intex
155  *
156  * \return the open interval.
157  */
158  virtual std::pair< double, double > getIntervalForIndex( size_t index ) const;
159 
160  /**
161  * Returns the right index to the bucket containing the given value. If a value larger than the maximum, the maximum index is returned. Same
162  * for minimum; if the value is smaller than the minimum, 0 is returned.
163  *
164  * \param value the value to search the index for
165  *
166  * \return the index of the bucket
167  */
168  virtual size_t getIndexForValue( double value ) const;
169 
170  /**
171  * This returns the number of value set entries added to the histogram. This is especially useful to normalize the histogram counts.
172  *
173  * \return the number of elements distributed in the buckets.
174  */
175  virtual size_t getTotalElementCount() const;
176 
177  /**
178  * Sums up the buckets in the specified interval. Especially useful for cumulative distribution functions or similar.
179  *
180  * \param startIndex the index where to start counting including this one
181  * \param endIndex the index where to end summing up excluding this one.
182  *
183  * \return the sum of all buckets in the interval.
184  * \throw WOutOfBounds if one of the indices is invalid.
185  */
186  virtual size_t accumulate( size_t startIndex, size_t endIndex ) const;
187 
188 protected:
189  /**
190  * Return the initial buckets.
191  *
192  * \return m_initialBuckets
193  */
194  boost::shared_array< size_t > getInitialBuckets() const;
195 
196  /**
197  * Return the number of initial buckets.
198  *
199  * \return m_nInitialBuckets
200  */
201  size_t getNInitialBuckets() const;
202 
203  /**
204  * Return the size of one initial bucket.
205  *
206  * \return m_bucketSize
207  */
208  double getInitialBucketSize() const;
209 
210  /**
211  * increment the value by one, contains the logic to find the element place in the array.
212  * Should only be used in the constructor i.e. while iterating over WValueSet.
213  *
214  * \param value value to increment
215  */
216  virtual void insert( double value );
217 
218 private:
219  /**
220  * Size of one bucket in the initial histogram.
221  */
223 
224  /**
225  * Pointer to all initial buckets of the histogram.
226  */
227  boost::shared_array< size_t > m_initialBuckets;
228 
229  /**
230  * Number of buckets in the initial histogram.
231  */
233 
234  /**
235  * Pointer to all initial buckets of the histogram.
236  */
237  boost::shared_array< size_t > m_mappedBuckets;
238 
239  /**
240  * Tracks the number of a buckets in the mapped histogram.
241  */
243 
244  /**
245  * Size of one bucket in the mapped histogram.
246  */
248 
249  /**
250  * The number of elements distributed in the buckets.
251  */
253 
254  /**
255  * Actually builds the histogram. This function is simply used for avoiding code duplication in all these constructors.
256  *
257  * \param valueSet the value set.
258  */
259  void buildHistogram( const WValueSetBase& valueSet );
260 };
261 
262 /**
263  * Write a histogram in string representation to the given output stream.
264  */
265 std::ostream& operator<<( std::ostream& out, const WValueSetHistogram& h );
266 
267 inline size_t WValueSetHistogram::getIndexForValue( double value ) const
268 {
269  // the position on the scala
270  double pos = ( value - m_minimum ) / static_cast< double >( m_mappedBucketSize );
271  // the index is the floor( position )
272  size_t idx = static_cast< size_t >( std::floor( pos ) );
273 
274  // is the index larger than the size?
275  bool inU = ( idx < m_nMappedBuckets );
276  // is the index smaller than the size?
277  bool inL = ( pos >= 0.0 );
278 
279  // the trick done here is to clamp value into [m_minimum,m_maximum] without using if statements. The C++ Standard says that booleans are
280  // always 1 if true.
281  // NOTE: this is integral arithmetic
282  return ( inL && inU ) * idx + ( !inU && inL ) * ( m_nMappedBuckets - 1 );
283 }
284 
285 #endif // WVALUESETHISTOGRAM_H
286 
Used to find the occurrence frequencies of values in a value set.
virtual size_t getTotalElementCount() const
This returns the number of value set entries added to the histogram.
size_t m_nInitialBuckets
Number of buckets in the initial histogram.
size_t getNInitialBuckets() const
Return the number of initial buckets.
~WValueSetHistogram()
Destructor.
virtual std::pair< double, double > getIntervalForIndex(size_t index) const
Returns the actual interval associated with the given index.
double m_mappedBucketSize
Size of one bucket in the mapped histogram.
double m_minimum
The smallest value.
Definition: WHistogram.h:123
Test WValueSetHistogram.
virtual size_t size() const
Returns the number of buckets in the histogram with the actual mapping.
WValueSetHistogram & operator=(const WValueSetHistogram &other)
Copy assignment.
boost::shared_array< size_t > m_initialBuckets
Pointer to all initial buckets of the histogram.
Container which associate values with (uniform width) bins (aka intervals or buckets).
Definition: WHistogram.h:36
size_t m_nMappedBuckets
Tracks the number of a buckets in the mapped histogram.
virtual void insert(double value)
increment the value by one, contains the logic to find the element place in the array.
double m_initialBucketSize
Size of one bucket in the initial histogram.
double getInitialBucketSize() const
Return the size of one initial bucket.
virtual size_t operator[](size_t index) const
Get the count of the bucket.
boost::shared_array< size_t > m_mappedBuckets
Pointer to all initial buckets of the histogram.
WValueSetHistogram(boost::shared_ptr< WValueSetBase > valueSet, size_t buckets=1000)
Constructor.
void buildHistogram(const WValueSetBase &valueSet)
Actually builds the histogram.
virtual double getBucketSize(size_t index=0) const
Return the size of one bucket.
size_t m_nbTotalElements
The number of elements distributed in the buckets.
Abstract base class to all WValueSets.
Definition: WValueSetBase.h:59
virtual size_t at(size_t index) const
Get the count of the bucket.
virtual size_t getIndexForValue(double value) const
Returns the right index to the bucket containing the given value.
boost::shared_array< size_t > getInitialBuckets() const
Return the initial buckets.
virtual size_t accumulate(size_t startIndex, size_t endIndex) const
Sums up the buckets in the specified interval.