Commit 92a308dd authored by Sebastian Eichelbaum's avatar Sebastian Eichelbaum

[CHANGE] - histogram now can clamp values to a certain user defined min-max interval

parent 46ee1e7d
......@@ -23,10 +23,12 @@
//---------------------------------------------------------------------------
#include <cstring> // memset()
#include <numeric>
#include <utility>
#include "../../common/WAssert.h"
#include "../../common/WLimits.h"
#include "../../common/exceptions/WOutOfBounds.h"
#include "../WDataHandlerEnums.h"
#include "WValueSetHistogram.h"
......@@ -34,45 +36,36 @@
// Builds a histogram spanning the minimum and maximum reported by the value set.
//
// valueSet: source data; it is dereferenced without a null check, so it must point to a valid value set.
// buckets:  number of buckets, forwarded unchanged to the WHistogram base class.
WValueSetHistogram::WValueSetHistogram( boost::shared_ptr< WValueSetBase > valueSet, size_t buckets ):
    WHistogram( valueSet->getMinimumValue(), valueSet->getMaximumValue(), buckets )
{
    // All bucket setup and counting is done by buildHistogram(). Repeating the setup here with a locally allocated array would only
    // duplicate that work and leak the array, as nothing in this constructor takes ownership of it.
    buildHistogram( *valueSet );
}
// the array can be shared among several instances of WValueSetHistogram.
m_initialBuckets = boost::shared_array< size_t >( initialBuckets );
// Builds a histogram from a value set passed by reference. The interval handed to the WHistogram base class is the minimum and maximum
// reported by the value set itself; the buckets are then set up and filled by buildHistogram().
WValueSetHistogram::WValueSetHistogram( const WValueSetBase& valueSet, size_t buckets ):
WHistogram( valueSet.getMinimumValue(), valueSet.getMaximumValue(), buckets )
{
buildHistogram( valueSet );
}
// no mapping applied yet. Initial and mapped are equal
m_nMappedBuckets = m_nInitialBuckets;
m_mappedBuckets = m_initialBuckets;
m_mappedBucketSize = m_initialBucketSize;
// Builds a histogram over the caller-supplied interval: min and max are handed to the WHistogram base class unchanged instead of the
// value set's own minimum and maximum. The buckets are then set up and filled by buildHistogram().
// NOTE(review): valueSet is dereferenced without a null check - callers must pass a valid pointer.
WValueSetHistogram::WValueSetHistogram( boost::shared_ptr< WValueSetBase > valueSet, double min, double max, size_t buckets ):
WHistogram( min, max, buckets )
{
buildHistogram( *valueSet );
}
// and finally create the histogram
for( size_t i = 0; i < valueSet->size(); ++i )
{
double tmp = valueSet->getScalarDouble( i );
insert( tmp );
}
// Builds a histogram over the caller-supplied interval from a value set passed by reference: min and max are handed to the WHistogram
// base class unchanged instead of the value set's own minimum and maximum. The buckets are then set up and filled by buildHistogram().
WValueSetHistogram::WValueSetHistogram( const WValueSetBase& valueSet, double min, double max, size_t buckets ):
WHistogram( min, max, buckets )
{
buildHistogram( valueSet );
}
WValueSetHistogram::WValueSetHistogram( const WValueSetBase& valueSet, size_t buckets ):
WHistogram( valueSet.getMinimumValue(), valueSet.getMaximumValue(), buckets )
void WValueSetHistogram::buildHistogram( const WValueSetBase& valueSet )
{
m_nbTotalElements = 0;
// create base histogram
WAssert( buckets > 1, "WValueSetHistogram::WValueSetHistogram : number of buckets needs to be larger than 1." );
m_nInitialBuckets = buckets - 1;
WAssert( m_nbBuckets > 1, "WValueSetHistogram::buildHistogram : number of buckets needs to be larger than 1." );
m_nInitialBuckets = m_nbBuckets - 1;
m_initialBucketSize = ( m_maximum - m_minimum ) / static_cast< double >( m_nInitialBuckets );
WAssert( m_initialBucketSize > 0.0, "WValueSetHistogram::WValueSetHistogram() : m_initialBucketSize to small." );
WAssert( m_initialBucketSize > 0.0, "WValueSetHistogram::buildHistogram() : m_initialBucketSize to small." );
// NOTE: as all the intervals are right-open, we need an additional slot in our array for the last interval [m_maximum,\infinity). For the
// calculation of interval sizes, the value must not be incremented
......@@ -97,6 +90,8 @@ WValueSetHistogram::WValueSetHistogram( const WValueSetBase& valueSet, size_t bu
double tmp = valueSet.getScalarDouble( i );
insert( tmp );
}
m_nbTotalElements = valueSet.size();
}
WValueSetHistogram::WValueSetHistogram( const WValueSetHistogram& histogram, size_t buckets ):
......@@ -106,7 +101,8 @@ WValueSetHistogram::WValueSetHistogram( const WValueSetHistogram& histogram, siz
m_nInitialBuckets( histogram.m_nInitialBuckets ),
m_mappedBuckets( histogram.m_mappedBuckets ),
m_nMappedBuckets( histogram.m_nMappedBuckets ),
m_mappedBucketSize( histogram.m_mappedBucketSize )
m_mappedBucketSize( histogram.m_mappedBucketSize ),
m_nbTotalElements( histogram.m_nbTotalElements )
{
// apply modification of the histogram bucket size?
if( ( buckets == 0 ) || ( buckets == m_nMappedBuckets ) )
......@@ -193,8 +189,7 @@ double WValueSetHistogram::getBucketSize( size_t /* index */ ) const
// Counts one value: increments the bucket that value falls into.
void WValueSetHistogram::insert( double value )
{
    // getIndexForValue() maps values below the minimum to bucket 0 and values above the maximum to the last bucket, so the index is
    // always inside the array. Incrementing an additional, unclamped index here would count every value twice and could write outside
    // the bucket array for values outside [m_minimum, m_maximum].
    m_mappedBuckets[ getIndexForValue( value ) ]++;
}
size_t WValueSetHistogram::operator[]( size_t index ) const
......@@ -216,6 +211,11 @@ size_t WValueSetHistogram::size() const
return m_nMappedBuckets; // overwrite the WHistogram::size here as we have our own size.
}
// Returns the stored element count m_nbTotalElements, i.e. the number of value set entries distributed over the buckets. Useful for
// normalizing the bucket counts.
size_t WValueSetHistogram::getTotalElementCount() const
{
return m_nbTotalElements;
}
std::pair< double, double > WValueSetHistogram::getIntervalForIndex( size_t index ) const
{
double first = m_minimum + m_mappedBucketSize * index;
......@@ -223,6 +223,29 @@ std::pair< double, double > WValueSetHistogram::getIntervalForIndex( size_t inde
return std::make_pair( first, second );
}
// Sums the counts of all buckets in the half-open index range spanned by startIndex and endIndex. The two indices may be passed in
// either order; the smaller one is inclusive, the larger one exclusive.
//
// Throws WOutOfBounds if the larger index exceeds size().
size_t WValueSetHistogram::accumulate( size_t startIndex, size_t endIndex ) const
{
    // order the indices so that [first, last) is a proper range regardless of how the caller passed them
    const bool reversed = ( startIndex > endIndex );
    const size_t first = reversed ? endIndex : startIndex;
    const size_t last = reversed ? startIndex : endIndex;

    // last is exclusive, so it may equal size() but must not exceed it
    if( last > size() )
    {
        throw WOutOfBounds( "The specified endIndex is out of bounds." );
    }

    // plain loop over the shared bucket array
    size_t sum = 0;
    for( size_t i = first; i < last; ++i )
    {
        sum += m_mappedBuckets[ i ];
    }
    return sum;
}
std::ostream& operator<<( std::ostream& out, const WValueSetHistogram& h )
{
for ( size_t i = 0; i < h.size() - 1; ++i )
......
......@@ -63,6 +63,30 @@ public:
*/
explicit WValueSetHistogram( const WValueSetBase& valueSet, size_t buckets = 1000 );
/**
* Constructor. Creates a histogram from the specified value set but allows cropping of values below the given min and above the given max.
* It actually interprets all values below min and above max to be exactly min and exactly max and sorts them into the appropriate bin. This
* is especially useful to filter out outliers in data.
*
* \param valueSet source data
* \param min the new minimum to use
* \param max the maximum to use
* \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
*/
WValueSetHistogram( boost::shared_ptr< WValueSetBase > valueSet, double min, double max, size_t buckets = 1000 );
/**
* Constructor. Creates a histogram from the specified value set but allows cropping of values below the given min and above the given max.
* It actually interprets all values below min and above max to be exactly min and exactly max and sorts them into the appropriate bin. This
* is especially useful to filter out outliers in data.
*
* \param valueSet source data
* \param min the new minimum to use
* \param max the maximum to use
* \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
*/
WValueSetHistogram( const WValueSetBase& valueSet, double min, double max, size_t buckets = 1000 );
/**
* Copy constructor. If another interval size is given the histogram gets matched to it using the initial bucket data.
* \note this does not deep copy the m_initialBuckets and m_mappedBuckets array as these are shared_array instances.
......@@ -132,6 +156,34 @@ public:
*/
virtual std::pair< double, double > getIntervalForIndex( size_t index ) const;
/**
* Returns the index of the bucket containing the given value. If the value is larger than the maximum, the maximum index is returned. The
* same applies to the minimum: if the value is smaller than the minimum, 0 is returned.
*
* \param value the value to search the index for
*
* \return the index of the bucket
*/
virtual size_t getIndexForValue( double value ) const;
/**
* This returns the number of value set entries added to the histogram. This is especially useful to normalize the histogram counts.
*
* \return the number of elements distributed in the buckets.
*/
virtual size_t getTotalElementCount() const;
/**
* Sums up the buckets in the specified interval. Especially useful for cumulative distribution functions or similar.
*
* \param startIndex the index where to start counting including this one
* \param endIndex the index where to end summing up excluding this one.
*
* \return the sum of all buckets in the interval.
* \throw WOutOfBounds if one of the indices is invalid.
*/
virtual size_t accumulate( size_t startIndex, size_t endIndex ) const;
protected:
/**
* Return the initial buckets.
......@@ -193,6 +245,18 @@ private:
* Size of one bucket in the mapped histogram.
*/
double m_mappedBucketSize;
/**
* The number of elements distributed in the buckets.
*/
size_t m_nbTotalElements;
/**
* Actually builds the histogram. This function is simply used for avoiding code duplication in all these constructors.
*
* \param valueSet the value set.
*/
void buildHistogram( const WValueSetBase& valueSet );
};
/**
......@@ -200,5 +264,23 @@ private:
*/
std::ostream& operator<<( std::ostream& out, const WValueSetHistogram& h );
// Returns the index of the bucket containing value. Values at or below m_minimum map to bucket 0, values beyond the last bucket map to
// the last bucket index ( m_nMappedBuckets - 1 ).
inline size_t WValueSetHistogram::getIndexForValue( double value ) const
{
    // the position on the bucket scale: 0.0 corresponds to m_minimum and each bucket spans one unit
    const double pos = ( value - m_minimum ) / m_mappedBucketSize;

    // Clamp while still in floating point. Converting a double to size_t is undefined behavior if the truncated value is not
    // representable (negative, NaN or too large), so the cast below must only ever see values in [0, m_nMappedBuckets - 1].
    const double last = static_cast< double >( m_nMappedBuckets - 1 );
    // written as "pos > 0.0" so that a NaN position also falls back to 0.0
    const double lower = ( pos > 0.0 ) ? pos : 0.0;
    const double clamped = ( lower < last ) ? lower : last;

    // the index is floor( clamped position )
    return static_cast< size_t >( clamped );
}
#endif // WVALUESETHISTOGRAM_H
......@@ -132,6 +132,53 @@ class WValueSetHistogramTest : public CxxTest::TestSuite
TS_ASSERT_EQUALS( hist.getBucketSize(), 1.0 ); // 0.0, 1.0, 2.0, 3.0 and 4.0
}
/**
* Test getIndexForValue()
*/
void testIndex( void )
{
    // create some test data
    double a[5] = { 0.0, 4.0, 1.0, 2.0, 1.0 };
    const std::vector< double > v( a, a + sizeof( a ) / sizeof( double ) );
    // automatic storage: the value set is only needed while the histogram is constructed, and a heap allocation without a matching
    // delete would leak on every test run
    WValueSet< double > valueSet( 0, 1, v, W_DT_DOUBLE );

    // create histogram
    WValueSetHistogram hist( valueSet, 5 );

    // expected bucket layout (index = interval = count):
    // 0 = [0, 1) = 1
    // 1 = [1, 2) = 2
    // 2 = [2, 3) = 1
    // 3 = [3, 4) = 0
    // 4 = [4, inf) = 1
    TS_ASSERT_EQUALS( hist.getIndexForValue( 4.0 ), 4 );
    TS_ASSERT_EQUALS( hist.getIndexForValue( 3.999 ), 3 );
    TS_ASSERT_EQUALS( hist.getIndexForValue( 0.0 ), 0 );
    TS_ASSERT_EQUALS( hist.getIndexForValue( 122.0 ), 4 ); // test values above maximum
    TS_ASSERT_EQUALS( hist.getIndexForValue( -122.0 ), 0 ); // test values below minimum
}
/**
* Test accumulate
*/
void testAccum( void )
{
    // create some test data
    double a[5] = { 0.0, 4.0, 1.0, 2.0, 1.0 };
    const std::vector< double > v( a, a + sizeof( a ) / sizeof( double ) );
    // automatic storage: the value set is only needed while the histogram is constructed, and a heap allocation without a matching
    // delete would leak on every test run
    WValueSet< double > valueSet( 0, 1, v, W_DT_DOUBLE );

    // create histogram
    WValueSetHistogram hist( valueSet, 5 );
    std::cout << hist << std::endl;

    // buckets 0 and 1 hold 1 + 2 = 3 elements
    TS_ASSERT_EQUALS( hist.accumulate( 0, 2 ), 3 );
    TS_ASSERT_EQUALS( hist.accumulate( 2, 0 ), 3 ); // it also needs to handle switched indices
    TS_ASSERT_EQUALS( hist.accumulate( 2, 2 ), 0 ); // exclude second index properly?
    TS_ASSERT( hist.accumulate( 2, 2 ) != hist[ 2 ] ); // exclude second index properly?
    TS_ASSERT_THROWS_ANYTHING( hist.accumulate( 0, 123 ) );
}
/**
* Test copy construction.
**/
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment