ATLAS Offline Software
Loading...
Searching...
No Matches
FloatCompressor.cxx
Go to the documentation of this file.
1/*
2 Copyright (C) 2002-2022 CERN for the benefit of the ATLAS collaboration
3*/
4
5// $Id: FloatCompressor.cxx 789425 2016-12-13 10:50:12Z krasznaa $
6
7// System include(s):
8#include <cmath>
9
10// Local include(s):
12
13namespace CxxUtils {
14
/// Number of mantissa (fraction) bits stored in an IEEE754
/// single-precision float.
static constexpr unsigned int NMANTISSA = 23U;
17
18 FloatCompressor::FloatCompressor( unsigned int mantissaBits )
19 : m_mantissaBits( mantissaBits ), m_mantissaBitmask( 0 ) {
20
21 // IEEE754 single-precision float
22 // SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
23 // F F 8 0 0 0 7 F
24
25 // Definition:
26 //
27 // Assume that we'd like to keep only 7 bits in the mantissa
28 // In this case the memory layout of the bits will be:
29 //
30 // Sign | Exp (8 bits) | Frac (23 bits)
31 // S EEEEEEEE FFFFFFLRTTTTTTTTTTTTTTT
32 //
33 // where
34 //
35 // S : Sign bit
36 // E : Exponent bits
37 // F : Fraction bits
38 // L : Least significant bit (lsb) for 7 bits mantissa precision
39 // R : Rounding bit
40 // T : Sticky bits (i.e any bit after lsb + 1)
41 //
42 // In the current implementation there are essentially 4 cases:
43 //
44 // Case 1: L = 0 and R = 0
45 // In this case there'll be rounding down
46 //
47 // Case 2: L = 1 and R = 0
48 // In this case there'll be rounding down
49 //
50 // Case 3: L = 0 and R = 1
51 // In this case there'll be rounding up
52 //
53 // Note: This scenario can be different than bfloat16 implementation
54 // of TensorFlow, where they round down if all the Ts are zero.
55 // Otherwise, they also round up.
56 //
57 // Case 4: L = 1 and R = 1
58 // In this case there'll be rounding up
59 //
60 // In all cases, we do an extra check to avoid overflow.
61 //
62 // From a technical point of view, the rounding is computed
63 // to be the half of the lsb(=1) and added to the original value
64 // as long as the new value doesn't overflow. Then the
65 // undesired bits are masked. We never go below 5 bits in the
66 // mantissa.
67
68 // Adjust the received bit number to some reasonable value:
69 if( m_mantissaBits < 5 ) {
71 }
74 }
75
76 // Fill up the lower N bits:
77 for( unsigned int i = 0; i < ( NMANTISSA - m_mantissaBits ); ++i ) {
78 m_mantissaBitmask |= ( 0x1 << i );
79 }
80 // And now negate it to get the correct mask:
82
83 // Set the Magic numbers
85 m_rounding = 0;
86 }
87 else {
88 m_rounding = 0x1 << ( 32 - (1 + 8 + m_mantissaBits) - 1 );
89 }
90 // The part below is taken from AthenaPoolCnvSvc/Compressor
91 // and would work the same as long as the user doesn't
92 // compress lower than 3 mantissa bits, which is not allowed
93 // in any case.
94 m_vmax = 0x7f7 << 20;
95 m_vmax |= 0x000fffff xor (m_rounding);
96 }
97
98 float FloatCompressor::reduceFloatPrecision( float value ) const {
99
100 // Check if any compression is to be made:
101 if( m_mantissaBits == NMANTISSA ) {
102 return value;
103 }
104
105 // Check for NaN, etc:
106 if( ! std::isfinite( value ) ) {
107 return value;
108 }
109
110 // Create the helper object:
111 floatint_t fi;
112 fi.fvalue = value;
113
114 //safety-check if value (omitting the sign-bit) is lower than vmax
115 //(avoid overflow)
116 if( ( fi.ivalue & 0x7fffffff ) < m_vmax ) {
117 fi.ivalue += m_rounding;
118 }
119
120 // Do the compression:
122 return fi.fvalue;
123 }
124
125} // namespace CxxUtils
unsigned int m_mantissaBits
Number of mantissa bits to keep.
uint32_t m_vmax
Largest possible positive 32bit float minus the rounding.
uint32_t m_mantissaBitmask
Bitmask for zeroing out the non-interesting bits.
float reduceFloatPrecision(float value) const
Function returning a reduced precision float value.
FloatCompressor(unsigned int mantissaBits=7)
Constructor with the number of mantissa bits to retain.
static const unsigned int NMANTISSA
Total number of mantissa bits.
Type used in the compression.