331 lines
13 KiB
C++
331 lines
13 KiB
C++
#pragma once
|
|
#include "Defines.h"
|
|
#include "CompileTimeArray.h"
|
|
#include <vector>
|
|
#include <assert.h>
|
|
#include "../inc/tbb/parallel_for.h"
|
|
|
|
// Compile-time count of the set bits in the low 8 bits of x, exposed as BitCount<x>::value.
// Every step masks with 8-bit constants, so bits of x above bit 7 do not contribute.
template<size_t x> struct BitCount
{
	enum {
		// Add neighbouring bits: four 2-bit fields, each holding the count of its two bits.
		pairSums = (x & 0x55) + ((x >> 1) & 0x55),
		// Add neighbouring 2-bit fields: two 4-bit fields, each holding the count of its four bits.
		nibbleSums = (pairSums & 0x33) + ((pairSums >> 2) & 0x33),
		// Add the two nibbles to obtain the total number of set bits.
		value = (nibbleSums & 0x0F) + ((nibbleSums >> 4) & 0x0F)
	};
};
|
|
|
|
// Table type produced by generate_array for 256 entries of BitCount (generate_array is declared in
// CompileTimeArray.h; presumably entry i holds BitCount<i>::value - confirm there).
// GetSet(unsigned8) reads its static `data` member, indexed by the byte value.
using BitCountLookupTable = generate_array<256, BitCount>::result;
|
|
|
|
namespace BitHelper
|
|
{
|
|
// Returns the number of set bits in a 64-bit value.
// Works by repeatedly adding neighbouring bit fields, doubling the field width in every step
// (1 -> 2 -> 4 -> 8 -> 16 -> 32 -> 64 bits), so the final value is the total count.
inline static unsigned8 GetSet(unsigned64 x)
{
	const unsigned64 everyOtherBit   = 0x5555555555555555; // 0x55 = 01010101
	const unsigned64 everyOtherPair  = 0x3333333333333333; // 0x33 = 00110011
	const unsigned64 everyOtherNibble = 0x0F0F0F0F0F0F0F0F; // 0x0F = 00001111
	const unsigned64 everyOtherByte  = 0x00FF00FF00FF00FF;
	const unsigned64 everyOtherWord  = 0x0000FFFF0000FFFF;
	const unsigned64 lowerHalf       = 0x00000000FFFFFFFF;

	x = (x & everyOtherBit) + ((x >> 1) & everyOtherBit);
	x = (x & everyOtherPair) + ((x >> 2) & everyOtherPair);
	x = (x & everyOtherNibble) + ((x >> 4) & everyOtherNibble);
	x = (x & everyOtherByte) + ((x >> 8) & everyOtherByte);
	x = (x & everyOtherWord) + ((x >> 16) & everyOtherWord);
	x = (x & lowerHalf) + ((x >> 32) & lowerHalf);
	return static_cast<unsigned8>(x);
}
|
|
|
|
// Returns the number of set bits in a 32-bit value.
// Method from https://graphics.stanford.edu/~seander/bithacks.html (counting bits set, in parallel);
// it needs only 12 operations.
inline static unsigned8 GetSet(unsigned32 v)
{
	v -= (v >> 1) & 0x55555555;                     // every 2-bit field now holds the count of its two bits
	v = (v & 0x33333333) + ((v >> 2) & 0x33333333); // every 4-bit field now holds the count of its four bits
	v = (v + (v >> 4)) & 0xF0F0F0F;                 // every byte now holds the count of its eight bits
	v *= 0x1010101;                                 // the multiplication sums all four byte counts into bits 24..31
	return static_cast<unsigned8>(v >> 24);
}
|
|
|
|
// Returns the number of set bits in a 16-bit value.
// Neighbouring bit fields are added together, doubling the field width in every step.
inline static unsigned8 GetSet(unsigned16 x)
{
	const unsigned16 everyOtherBit = 0x5555;    // 0x55 = 01010101
	const unsigned16 everyOtherPair = 0x3333;   // 0x33 = 00110011
	const unsigned16 everyOtherNibble = 0x0F0F; // 0x0F = 00001111
	const unsigned16 lowerByte = 0x00FF;

	x = static_cast<unsigned16>((x & everyOtherBit) + ((x >> 1) & everyOtherBit));
	x = static_cast<unsigned16>((x & everyOtherPair) + ((x >> 2) & everyOtherPair));
	x = static_cast<unsigned16>((x & everyOtherNibble) + ((x >> 4) & everyOtherNibble));
	x = static_cast<unsigned16>((x & lowerByte) + ((x >> 8) & lowerByte));
	return static_cast<unsigned8>(x);
}
|
|
|
|
// Returns the number of set bits in a byte.
// The count is read from BitCountLookupTable rather than computed at run time.
inline static unsigned8 GetSet(unsigned8 x)
{
	const unsigned8 setBits = BitCountLookupTable::data[x];
	return setBits;
}
|
|
|
|
template<typename T>
|
|
inline T GetLSMaskUntil(const unsigned8 bit) { return (~((T)0)) << bit; }
|
|
|
|
// Returns a mask of type T with only the bit at LS index `bit` set (LS index 0 = least significant bit).
template<typename T>
inline T GetLSSingleBitMask(const unsigned8 bit)
{
	const T lowestBit = 1;
	return lowestBit << bit;
}
|
|
template<typename T, unsigned8 bitsInT = sizeof(T) * 8> inline T GetHSSingleBitMask(const unsigned8 bit) { return (T)1 << (bitsInT - 1 - bit); }
|
|
// Template specialization for HSSingleBitMask
|
|
template<> inline unsigned8 GetHSSingleBitMask(const unsigned8 pos) { return (unsigned8)0x80 >> pos; }
|
|
template<> inline unsigned16 GetHSSingleBitMask(const unsigned8 pos) { return (unsigned16)0x8000 >> pos; }
|
|
template<> inline unsigned32 GetHSSingleBitMask(const unsigned8 pos) { return (unsigned32)0x80000000 >> pos; }
|
|
template<> inline unsigned64 GetHSSingleBitMask(const unsigned8 pos) { return (unsigned64)0x8000000000000000 >> pos; }
|
|
|
|
template<typename T, unsigned8 bitsInT = sizeof(T) * 8>
|
|
inline T GetLSMask(const unsigned8 startIndex, const unsigned8 endIndex) {
|
|
if (startIndex == endIndex) return (T)0;
|
|
// Note that s needs to be defined first. If this happens in the same line, I couldn't get the compiler to make sure the size of the datatype is correct
|
|
T s = ~((T)0);
|
|
unsigned8 firstShift = bitsInT - endIndex;
|
|
s <<= firstShift;
|
|
s >>= firstShift + startIndex;
|
|
s <<= startIndex;
|
|
return s;
|
|
}
|
|
template<typename T, unsigned8 bitsInT = sizeof(T) * 8>
|
|
inline T GetHSMask(const unsigned8 startIndex, const unsigned8 endIndex) {
|
|
if (startIndex == endIndex) return (T)0;
|
|
// Note that s needs to be defined first. If this happens in the same line, I couldn't get the compiler to make sure the size of the datatype is correct
|
|
T s = ~((T)0);
|
|
unsigned8 firstShift = bitsInT - endIndex;
|
|
s >>= firstShift;
|
|
s <<= firstShift + startIndex;
|
|
s >>= startIndex;
|
|
return s;
|
|
}
|
|
|
|
template<typename T, unsigned8 bitsInT = sizeof(T) * 8>
|
|
inline unsigned8 GetHSSetBefore(const T value, const unsigned8 pos) { return GetSet((T)(value << (bitsInT - pos))); }
|
|
|
|
// Returns true when the bit at HS index `pos` of `value` is set (HS index 0 = most significant bit).
template<typename T>
inline bool GetHS(const T value, const unsigned8 pos)
{
	const T probe = GetHSSingleBitMask<T>(pos);
	return (value & probe) != 0;
}
|
|
// Returns true when the bit at LS index `pos` of `value` is set (LS index 0 = least significant bit).
template<typename T>
inline bool GetLS(const T value, const unsigned8 pos)
{
	const T probe = BitHelper::GetLSSingleBitMask<T>(pos);
	return (value & probe) != 0;
}
|
|
|
|
// Returns `value` with the bits selected by `setMask` replaced by the corresponding bits of
// `setValue`; bits outside the mask are taken from `value` unchanged.
template<typename T>
inline T SetLS(const T value, const T& setMask, const T& setValue)
{
	const T untouched = value & ~setMask;
	const T replaced = setMask & setValue;
	return untouched | replaced;
}
|
|
|
|
// Sets the bit at LS index `pos` of `value` in place (LS index 0 = least significant bit).
template<typename T>
inline void SetLS(T& value, const unsigned8 pos)
{
	const T bitToSet = GetLSSingleBitMask<T>(pos);
	value |= bitToSet;
}
|
|
// Sets the bit at HS index `pos` of `value` in place (HS index 0 = most significant bit).
template<typename T>
inline void SetHS(T& value, const unsigned8 pos)
{
	const T bitToSet = GetHSSingleBitMask<T>(pos);
	value |= bitToSet;
}
|
|
|
|
template<typename T> inline void SetLS(T& value, const unsigned8 pos, const bool& set) {
|
|
T bitmask = GetLSSingleBitMask<T>(pos);
|
|
value &= ~bitmask; // Clear the bit
|
|
if (set) value |= bitmask; // Set it if necessary
|
|
}
|
|
template<typename T> inline void SetHS(T& value, const unsigned8 pos, const bool& set)
|
|
{
|
|
T bitmask = GetHSSingleBitMask<T>(pos);
|
|
value &= ~bitmask;
|
|
if (set) value |= bitmask;
|
|
}
|
|
|
|
// Rounds v up to the next power of two; a value that already is a power of two is returned as is.
// An input of 0 yields 0: the initial decrement wraps around and the final increment wraps back.
inline unsigned32 CeilToNearestPowerOfTwo(unsigned32 v)
{
	--v;
	// Copy the highest set bit into every lower position (shifts of 1, 2, 4, 8 and 16).
	for (unsigned int shift = 1; shift <= 16; shift *= 2)
		v |= v >> shift;
	++v;
	return v;
}
|
|
// Returns the largest power of two that is <= value, or 0 when value is 0.
// A value that already is a power of two is returned unchanged (8 -> 8, 1 -> 1). Halving the
// rounded-up value would instead give 4 and 0 for these inputs, and 0 for any value above 2^31.
inline unsigned32 FloorToNearestPowerOfTwo(unsigned32 value)
{
	// Copy the highest set bit into every lower position ...
	value |= value >> 1;
	value |= value >> 2;
	value |= value >> 4;
	value |= value >> 8;
	value |= value >> 16;
	// ... then remove everything below it, which leaves only the highest set bit.
	return value - (value >> 1);
}
|
|
|
|
// Returns true when v is a power of two, i.e. exactly one bit of v is set. Zero is not a power of two.
template<typename T>
inline bool IsPowerOfTwo(T v)
{
	if (!v) return false;
	// Subtracting one flips the lowest set bit and everything below it, so the AND is zero only
	// when that lowest set bit was the only bit set.
	return (v & (v - 1)) == 0;
}
|
|
|
|
// Log2 = the index of the highest significant bit.
|
|
template<typename T>
|
|
inline unsigned8 Log2(T v) {
|
|
unsigned8 r = 0;
|
|
while (v >>= 1) r++;
|
|
return (unsigned8)r;
|
|
}
|
|
|
|
template<typename T>
|
|
inline unsigned8 Log2Ceil(T v) {
|
|
return Log2(v) + (IsPowerOfTwo(v) ? 0 : 1); // If v is not a power of two, the number was rounded down.
|
|
}
|
|
|
|
// 2^x is equal to taking the number 1 and shifting it x indices.
|
|
template<typename T>
|
|
inline unsigned64 Exp2(const T& v) { return ((unsigned64)1) << v; }
|
|
|
|
template<typename T>
|
|
inline T RoundToBytes(const T& v)
|
|
{
|
|
if ((v & GetLSMask<T>(0, 3)) != 0) return ((v >> 3) + 1) << 3;
|
|
else return v;
|
|
}
|
|
|
|
|
|
|
|
template<typename T, unsigned8 bitsInT = sizeof(T) * 8>
|
|
inline T CircularShiftLeft(T v, T shift) { shift %= bitsInT; return (v << shift) | (v >> (bitsInT - shift)); }
|
|
|
|
template<typename T, unsigned8 bitsInT = sizeof(T) * 8>
|
|
inline T CircularShiftRight(T v, T shift) { shift %= bitsInT; return (v >> shift) | (v << (bitsInT - shift)); }
|
|
|
|
template<typename T>
|
|
inline void SplitInBytesAndMove(T value, std::vector<unsigned8>& destination, size_t offset, size_t size = sizeof(T))
|
|
{
|
|
SplitInBytesAndMove<T>(value, &destination[0], offset, size);
|
|
}
|
|
|
|
template<typename T>
|
|
inline void SplitInBytesAndMove(T value, unsigned8* destination, size_t offset, size_t size = sizeof(T))
|
|
{
|
|
for (size_t byte = 0; byte < size; byte++)
|
|
{
|
|
size_t lsByte = size - 1 - byte;
|
|
destination[offset + byte] = (unsigned8)((GetLSMask<T>((unsigned8)(lsByte * 8), (unsigned8)((lsByte + 1) * 8)) & value) >> (lsByte << 3));
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
inline std::vector<unsigned8> SplitInBytes(T value, size_t size = sizeof(T))
|
|
{
|
|
std::vector<unsigned8> res(size);
|
|
SplitInBytesAndMove(value, res, 0, size);
|
|
return res;
|
|
}
|
|
|
|
template<typename T>
|
|
inline void JoinBytes(const std::vector<unsigned8>& source, T& dest, size_t offset = 0, size_t size = sizeof(T)) { JoinBytes(&source[0], dest, offset, size); }
|
|
|
|
template<typename T>
|
|
inline void JoinBytes(const unsigned8* source, T& dest, size_t offset = 0, size_t size = sizeof(T))
|
|
{
|
|
for (size_t byte = 0; byte < size; byte++)
|
|
dest |= source[offset + byte] << ((size - byte - 1) * 8);
|
|
}
|
|
|
|
template<typename T>
|
|
inline void JoinBytesLittleEndian(const std::vector<unsigned8>& source, T& dest, size_t offset = 0, size_t size = sizeof(T)) { JoinBytes(&source[0], dest, offset, size); }
|
|
|
|
template<typename T>
|
|
inline void JoinBytesLittleEndian(const unsigned8* source, T& dest, size_t offset = 0, size_t size = sizeof(T))
|
|
{
|
|
for (size_t byte = 0; byte < size; byte++)
|
|
dest |= source[offset + (size - byte - 1)] << ((size - byte - 1) * 8);
|
|
}
|
|
|
|
// Returns the HS indices of all set bits in `mask`, in ascending order (HS index 0 = most
// significant bit). The result has one entry per set bit.
template<typename T, unsigned8 bitsInT = ((unsigned8)sizeof(T) * 8)>
std::vector<unsigned8> GetBitMapHS(T mask)
{
	const unsigned setBitCount = BitHelper::GetSet(mask);
	std::vector<unsigned8> positions(setBitCount);

	unsigned nextSlot = 0;
	for (unsigned8 hsIndex = 0; hsIndex < bitsInT; hsIndex++)
	{
		if (!BitHelper::GetHS(mask, hsIndex)) continue;
		positions[nextSlot] = hsIndex;
		nextSlot++;
	}
	return positions;
}
|
|
|
|
// Packs the value v tightly, meaning that only the bits that are set in the mask will be kept, and they will be shifted so that they are next to each other
|
|
template<typename T, unsigned8 bitsInType = sizeof(T) * 8>
|
|
inline T PackTight(const T& v, const T& mask)
|
|
{
|
|
unsigned8 bitOffset = bitsInType - GetSet(mask);
|
|
T res = 0;
|
|
for (T i = 0; i < bitsInType; i++)
|
|
{
|
|
T curBitMask = GetHSSingleBitMask(i);
|
|
if ((mask & curBitMask) != 0)
|
|
{
|
|
if ((v & curBitMask) != 0) SetHS(res, bitOffset);
|
|
bitOffset++;
|
|
}
|
|
}
|
|
return res;
|
|
}
|
|
|
|
template<typename T, unsigned8 bitsInType = sizeof(T) * 8>
|
|
inline T UnpackTight(const T& v, const T& mask)
|
|
{
|
|
unsigned8 bitOffset = bitsInType - GetSet(mask);
|
|
T res = 0;
|
|
for (unsigned8 i = 0; i < bitsInType; i++)
|
|
if (GetHS(mask, i))
|
|
if (GetHS(v, bitOffset++)) SetHS(res, i);
|
|
return res;
|
|
}
|
|
|
|
// Used to tightly pack a whole vector of values at once. Offset should give the offset in bits within the first packed value
|
|
template<typename T_packed, typename T_unpacked, unsigned8 bitsInPacked = sizeof(T_packed) * 8, unsigned8 bitsInUnpacked = sizeof(T_unpacked) * 8>
|
|
std::vector<T_packed> PackTight(const std::vector<T_unpacked>& input, const std::vector<unsigned8>& bitMap, const unsigned8 offset = 0, const size_t startIndex = 0, size_t endIndex = ~size_t(0))
|
|
{
|
|
assert(offset < bitsInPacked);
|
|
if (endIndex > input.size()) endIndex = input.size();
|
|
size_t setBits = bitMap.size();
|
|
size_t requiredBits = setBits * (endIndex - startIndex) + offset;
|
|
size_t outputSize = requiredBits / bitsInPacked + (((requiredBits % bitsInPacked) == 0) ? 0 : 1);
|
|
std::vector<T_packed> packed(outputSize);
|
|
// Compress the new data (in parallel per packed byte)
|
|
tbb::parallel_for(size_t(0), outputSize, [&](size_t i)
|
|
{
|
|
size_t startBit = i * bitsInPacked;
|
|
T_packed cur = 0;
|
|
for (size_t j = 0; j < bitsInPacked; j++)
|
|
{
|
|
size_t bit = startBit + j;
|
|
if (bit < offset) continue;
|
|
bit -= offset;
|
|
size_t sourceIndex = bit / setBits;
|
|
size_t sourceBitIndex = bit % setBits;
|
|
size_t inputIndex = startIndex + sourceIndex;
|
|
if (inputIndex >= endIndex) break;
|
|
auto source = input[inputIndex];
|
|
if (GetHS(source, bitMap[sourceBitIndex]))
|
|
SetHS(cur, (unsigned8)j);
|
|
}
|
|
packed[i] = cur;
|
|
});
|
|
return packed;
|
|
}
|
|
|
|
// Used to tightly pack a whole vector of values at once. Offset should give the offset in bits within the first packed value
|
|
template<typename T_packed, typename T_unpacked, unsigned8 bitsInPacked = sizeof(T_packed) * 8, unsigned8 bitsInUnpacked = sizeof(T_unpacked) * 8>
|
|
std::vector<T_packed> PackTight(const std::vector<T_unpacked>& input, const T_unpacked& mask, const unsigned8 offset = 0, const size_t startIndex = 0, size_t endIndex = ~size_t(0))
|
|
{
|
|
return PackTight<T_packed, T_unpacked, bitsInPacked, bitsInUnpacked>(input, GetBitMapHS(mask), offset, startIndex, endIndex);
|
|
}
|
|
|
|
// Unpack a single packed byte at some location.
|
|
// packed should contain the vector with packed data
|
|
// i gives the index of the value to unpack (unpacked value index)
|
|
// bitMap should contain the bitMap for the mask to be used (GetBitMapHS(mask))
|
|
// offset gives the offset from which the data starts
|
|
// packedArrayOffset gives the index where the packed data with the current mask starts
|
|
template<typename T_packed, typename T_unpacked, unsigned8 bitsInPacked = sizeof(T_packed) * 8, unsigned8 bitsInUnpacked = sizeof(T_unpacked) * 8>
|
|
T_unpacked UnpackTightAt(const std::vector<T_packed>& packed, const size_t& i, const std::vector<unsigned8>& bitMap, const unsigned8& offset = 0, const size_t& packedArrayOffset = 0)
|
|
{
|
|
assert(offset < bitsInPacked);
|
|
size_t setBits = bitMap.size();
|
|
size_t divisionShift = (bitsInPacked >> 3) * 3; // (bitsInPacked / 8) * 3
|
|
size_t modulusMask = BitHelper::GetLSMask<size_t>(0, (unsigned8)divisionShift); // Mask that captures the bits that are shifted out by the divisionShift
|
|
|
|
T_unpacked value = 0;
|
|
for (size_t j = 0; j < bitMap.size(); j++)
|
|
{
|
|
size_t bit = i * setBits + j + offset;
|
|
size_t byteIndex = bit >> divisionShift; // Divide by the number of bits in the packed type (gives the index of the packed value to look at)
|
|
T_packed bitInByte = (T_packed)(bit & modulusMask); // Modulus by the number of bits in the packed type (gives the bit index within the packed type)
|
|
if (BitHelper::GetHS(packed[packedArrayOffset + byteIndex], bitInByte)) // If the bit in the packed type is set, it should also be in the unpacked type
|
|
SetHS(value, bitMap[j]);
|
|
}
|
|
return value;
|
|
}
|
|
|
|
template<typename T_packed, typename T_unpacked, unsigned8 bitsInPacked = sizeof(T_packed) * 8, unsigned8 bitsInUnpacked = sizeof(T_unpacked) * 8>
|
|
T_unpacked UnpackTightAt(const std::vector<T_packed>& packed, const size_t& i, const T_unpacked mask, const unsigned8& offset = 0, const size_t& packedArrayOffset = 0)
|
|
{
|
|
return UnpackTightAt<T_packed, T_unpacked, bitsInPacked, bitsInUnpacked>(packed, i, GetBitMapHS(mask), offset, packedArrayOffset);
|
|
}
|
|
};
|
|
|