vegaevo_doc/html/_ice_revisited_radix_8cpp_source.html

/*

To do:

    - add an offset parameter between two input values (avoid some data recopy sometimes)

    - unroll ? asm ?

    - 11 bits trick & 3 passes as Michael did

    - prefetch stuff the day I have a P3

    - make a version with 16-bits indices ?

*/


// Precompiled Header

#include "Stdafx.h"


using namespace Opcode;


#define INVALIDATE_RANKS    mCurrentSize|=0x80000000

#define VALIDATE_RANKS      mCurrentSize&=0x7fffffff

#define CURRENT_SIZE        (mCurrentSize&0x7fffffff)

#define INVALID_RANKS       (mCurrentSize&0x80000000)


#define CHECK_RESIZE(n)                                                                     \

    if(n!=mPreviousSize)                                                                    \

    {                                                                                       \

                if(n>mCurrentSize)  Resize(n);                                              \

        else                        ResetRanks();                                           \

        mPreviousSize = n;                                                                  \

    }


#define CREATE_HISTOGRAMS(type, buffer)                                                     \

    /* Clear counters/histograms */                                                         \

    ZeroMemory(mHistogram, 256*4*sizeof(udword));                                           \

                                                                                            \

    /* Prepare to count */                                                                  \

    ubyte* p = (ubyte*)input;                                                               \

    ubyte* pe = &p[nb*4];                                                                   \

    udword* h0= &mHistogram[0];     /* Histogram for first pass (LSB)   */                  \

    udword* h1= &mHistogram[256];   /* Histogram for second pass        */                  \

    udword* h2= &mHistogram[512];   /* Histogram for third pass         */                  \

    udword* h3= &mHistogram[768];   /* Histogram for last pass (MSB)    */                  \

                                                                                            \

    bool AlreadySorted = true;  /* Optimism... */                                           \

                                                                                            \

    if(INVALID_RANKS)                                                                       \

    {                                                                                       \

        /* Prepare for temporal coherence */                                                \

        type* Running = (type*)buffer;                                                      \

        type PrevVal = *Running;                                                            \

                                                                                            \

        while(p!=pe)                                                                        \

        {                                                                                   \

            /* Read input buffer in previous sorted order */                                \

            type Val = *Running++;                                                          \

            /* Check whether already sorted or not */                                       \

            if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */               \

            /* Update for next iteration */                                                 \

            PrevVal = Val;                                                                  \

                                                                                            \

            /* Create histograms */                                                         \

            h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++;                                 \

        }                                                                                   \

                                                                                            \

        /* If all input values are already sorted, we just have to return and leave the */  \

        /* previous list unchanged. That way the routine may take advantage of temporal */  \

        /* coherence, for example when used to sort transparent faces.                  */  \

        if(AlreadySorted)                                                                   \

        {                                                                                   \

            mNbHits++;                                                                      \

            for(udword i=0;i<nb;i++)    mRanks[i] = i;                                      \

            return *this;                                                                   \

        }                                                                                   \

    }                                                                                       \

    else                                                                                    \

    {                                                                                       \

        /* Prepare for temporal coherence */                                                \

        udword* Indices = mRanks;                                                           \

        type PrevVal = (type)buffer[*Indices];                                              \

                                                                                            \

        while(p!=pe)                                                                        \

        {                                                                                   \

            /* Read input buffer in previous sorted order */                                \

            type Val = (type)buffer[*Indices++];                                            \

            /* Check whether already sorted or not */                                       \

            if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */               \

            /* Update for next iteration */                                                 \

            PrevVal = Val;                                                                  \

                                                                                            \

            /* Create histograms */                                                         \

            h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++;                                 \

        }                                                                                   \

                                                                                            \

        /* If all input values are already sorted, we just have to return and leave the */  \

        /* previous list unchanged. That way the routine may take advantage of temporal */  \

        /* coherence, for example when used to sort transparent faces.                  */  \

        if(AlreadySorted)   { mNbHits++; return *this;  }                                   \

    }                                                                                       \

                                                                                            \

    /* Else there has been an early out and we must finish computing the histograms */      \

    while(p!=pe)                                                                            \

    {                                                                                       \

        /* Create histograms without the previous overhead */                               \

        h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++;                                     \

    }


#define CHECK_PASS_VALIDITY(pass)                                                           \

    /* Shortcut to current counters */                                                      \

    udword* CurCount = &mHistogram[pass<<8];                                                \

                                                                                            \

    /* Reset flag. The sorting pass is supposed to be performed. (default) */               \

    bool PerformPass = true;                                                                \

                                                                                            \

    /* Check pass validity */                                                               \

                                                                                            \

    /* If all values have the same byte, sorting is useless. */                             \

    /* It may happen when sorting bytes or words instead of dwords. */                      \

    /* This routine actually sorts words faster than dwords, and bytes */                   \

    /* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */             \

    /* for words and O(n) for bytes. Running time for floats depends on actual values... */ \

                                                                                            \

    /* Get first byte */                                                                    \

    ubyte UniqueVal = *(((ubyte*)input)+pass);                                              \

                                                                                            \

    /* Check that byte's counter */                                                         \

    if(CurCount[UniqueVal]==nb) PerformPass=false;


RadixSort::RadixSort() : mCurrentSize(0), mRanks(null), mRanks2(null), mTotalCalls(0), mNbHits(0)

{

#ifndef RADIX_LOCAL_RAM

    // Allocate input-independent ram

    mHistogram  = new udword[256*4];

    mOffset     = new udword[256];

#endif

    // Initialize indices

    INVALIDATE_RANKS;

}


RadixSort::~RadixSort()

{

    // Release everything

#ifndef RADIX_LOCAL_RAM

    DELETEARRAY(mOffset);

    DELETEARRAY(mHistogram);

#endif

    DELETEARRAY(mRanks2);

    DELETEARRAY(mRanks);

}


bool RadixSort::Resize(udword nb)

{

    // Free previously used ram

    DELETEARRAY(mRanks2);

    DELETEARRAY(mRanks);


    // Get some fresh one

    mRanks  = new udword[nb];   CHECKALLOC(mRanks);

    mRanks2 = new udword[nb];   CHECKALLOC(mRanks2);


    return true;

}


inline_ void RadixSort::CheckResize(udword nb)

{

    udword CurSize = CURRENT_SIZE;

    if(nb!=CurSize)

    {

        if(nb>CurSize)  Resize(nb);

        mCurrentSize = nb;

        INVALIDATE_RANKS;

    }

}


RadixSort& RadixSort::Sort(const udword* input, udword nb, RadixHint hint)

{

    // Checkings

    if(!input || !nb || nb&0x80000000)  return *this;


    // Stats

    mTotalCalls++;


    // Resize lists if needed

    CheckResize(nb);


#ifdef RADIX_LOCAL_RAM

    // Allocate histograms & offsets on the stack

    udword mHistogram[256*4];

//  udword mOffset[256];

    udword* mLink[256];

#endif


    // Create histograms (counters). Counters for all passes are created in one run.

    // Pros:    read input buffer once instead of four times

    // Cons:    mHistogram is 4Kb instead of 1Kb

    // We must take care of signed/unsigned values for temporal coherence.... I just

    // have 2 code paths even if just a single opcode changes. Self-modifying code, someone?

    if(hint==RADIX_UNSIGNED)    { CREATE_HISTOGRAMS(udword, input); }

    else                        { CREATE_HISTOGRAMS(sdword, input); }


    // Compute #negative values involved if needed

    udword NbNegativeValues = 0;

    if(hint==RADIX_SIGNED)

    {

        // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128

        // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,

        // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.

        udword* h3= &mHistogram[768];

        for(udword i=128;i<256;i++) NbNegativeValues += h3[i];  // 768 for last histogram, 128 for negative part

    }


    // Radix sort, j is the pass number (0=LSB, 3=MSB)

    for(udword j=0;j<4;j++)

    {

        CHECK_PASS_VALIDITY(j);


        // Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is

        // not a problem, numbers are correctly sorted anyway.

        if(PerformPass)

        {

            // Should we care about negative values?

            if(j!=3 || hint==RADIX_UNSIGNED)

            {

                // Here we deal with positive values only


                // Create offsets

//              mOffset[0] = 0;

//              for(udword i=1;i<256;i++)       mOffset[i] = mOffset[i-1] + CurCount[i-1];

                mLink[0] = mRanks2;

                for(udword i=1;i<256;i++)       mLink[i] = mLink[i-1] + CurCount[i-1];

            }

            else

            {

                // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.


                udword i;

                // Create biased offsets, in order for negative numbers to be sorted as well

//              mOffset[0] = NbNegativeValues;                                              // First positive number takes place after the negative ones

                mLink[0] = &mRanks2[NbNegativeValues];                                      // First positive number takes place after the negative ones

//              for(udword i=1;i<128;i++)       mOffset[i] = mOffset[i-1] + CurCount[i-1];  // 1 to 128 for positive numbers

                for(i=1;i<128;i++)      mLink[i] = mLink[i-1] + CurCount[i-1];      // 1 to 128 for positive numbers


                // Fixing the wrong place for negative values

//              mOffset[128] = 0;

                mLink[128] = mRanks2;

//              for(i=129;i<256;i++)            mOffset[i] = mOffset[i-1] + CurCount[i-1];

                for(i=129;i<256;i++)        mLink[i] = mLink[i-1] + CurCount[i-1];

            }


            // Perform Radix Sort

            ubyte* InputBytes   = (ubyte*)input;

            InputBytes += j;

            if(INVALID_RANKS)

            {

//              for(udword i=0;i<nb;i++)    mRanks2[mOffset[InputBytes[i<<2]]++] = i;

                for(udword i=0;i<nb;i++)    *mLink[InputBytes[i<<2]]++ = i;

                VALIDATE_RANKS;

            }

            else

            {

                udword* Indices     = mRanks;

                udword* IndicesEnd  = &mRanks[nb];

                while(Indices!=IndicesEnd)

                {

                    udword id = *Indices++;

//                  mRanks2[mOffset[InputBytes[id<<2]]++] = id;

                    *mLink[InputBytes[id<<2]]++ = id;

                }

            }


            // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.

            udword* Tmp = mRanks;   mRanks = mRanks2; mRanks2 = Tmp;

        }

    }

    return *this;

}


RadixSort& RadixSort::Sort(const float* input2, udword nb)

{

    // Checkings

    if(!input2 || !nb || nb&0x80000000) return *this;


    // Stats

    mTotalCalls++;


    udword* input = (udword*)input2;


    // Resize lists if needed

    CheckResize(nb);


#ifdef RADIX_LOCAL_RAM

    // Allocate histograms & offsets on the stack

    udword mHistogram[256*4];

//  udword mOffset[256];

    udword* mLink[256];

#endif


    // Create histograms (counters). Counters for all passes are created in one run.

    // Pros:    read input buffer once instead of four times

    // Cons:    mHistogram is 4Kb instead of 1Kb

    // Floating-point values are always supposed to be signed values, so there's only one code path there.

    // Please note the floating point comparison needed for temporal coherence! Although the resulting asm code

    // is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first

    // generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just

    // wouldn't work with mixed positive/negative values....

    { CREATE_HISTOGRAMS(float, input2); }


    // Compute #negative values involved if needed

    udword NbNegativeValues = 0;

    // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128

    // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,

    // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.

    udword* h3= &mHistogram[768];

    udword i;

    for(i=128;i<256;i++)    NbNegativeValues += h3[i];  // 768 for last histogram, 128 for negative part


    // Radix sort, j is the pass number (0=LSB, 3=MSB)

    for(udword j=0;j<4;j++)

    {

        // Should we care about negative values?

        if(j!=3)

        {

            // Here we deal with positive values only

            CHECK_PASS_VALIDITY(j);


            if(PerformPass)

            {

                // Create offsets

//              mOffset[0] = 0;

                mLink[0] = mRanks2;

//              for(udword i=1;i<256;i++)       mOffset[i] = mOffset[i-1] + CurCount[i-1];

                for(udword i=1;i<256;i++)       mLink[i] = mLink[i-1] + CurCount[i-1];


                // Perform Radix Sort

                ubyte* InputBytes = (ubyte*)input;

                InputBytes += j;

                if(INVALID_RANKS)

                {

//                  for(i=0;i<nb;i++)   mRanks2[mOffset[InputBytes[i<<2]]++] = i;

                    for(udword i=0;i<nb;i++)    *mLink[InputBytes[i<<2]]++ = i;

                    VALIDATE_RANKS;

                }

                else

                {

                    udword* Indices     = mRanks;

                    udword* IndicesEnd  = &mRanks[nb];

                    while(Indices!=IndicesEnd)

                    {

                        udword id = *Indices++;

//                      mRanks2[mOffset[InputBytes[id<<2]]++] = id;

                        *mLink[InputBytes[id<<2]]++ = id;

                    }

                }


                // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.

                udword* Tmp = mRanks;   mRanks = mRanks2; mRanks2 = Tmp;

            }

        }

        else

        {

            // This is a special case to correctly handle negative values

            CHECK_PASS_VALIDITY(j);


            if(PerformPass)

            {

                // Create biased offsets, in order for negative numbers to be sorted as well

//              mOffset[0] = NbNegativeValues;                                              // First positive number takes place after the negative ones

                mLink[0] = &mRanks2[NbNegativeValues];                                      // First positive number takes place after the negative ones

//              for(udword i=1;i<128;i++)       mOffset[i] = mOffset[i-1] + CurCount[i-1];  // 1 to 128 for positive numbers

                for(i=1;i<128;i++)      mLink[i] = mLink[i-1] + CurCount[i-1];      // 1 to 128 for positive numbers


                // We must reverse the sorting order for negative numbers!

//              mOffset[255] = 0;

                mLink[255] = mRanks2;

//              for(i=0;i<127;i++)      mOffset[254-i] = mOffset[255-i] + CurCount[255-i];  // Fixing the wrong order for negative values

                for(i=0;i<127;i++)  mLink[254-i] = mLink[255-i] + CurCount[255-i];      // Fixing the wrong order for negative values

//              for(i=128;i<256;i++)    mOffset[i] += CurCount[i];                          // Fixing the wrong place for negative values

                for(i=128;i<256;i++)    mLink[i] += CurCount[i];                            // Fixing the wrong place for negative values


                // Perform Radix Sort

                if(INVALID_RANKS)

                {

                    for(udword i=0;i<nb;i++)

                    {

                        udword Radix = input[i]>>24;                            // Radix byte, same as above. AND is useless here (udword).

                        // ### cmp to be killed. Not good. Later.

//                      if(Radix<128)       mRanks2[mOffset[Radix]++] = i;      // Number is positive, same as above

//                      else                mRanks2[--mOffset[Radix]] = i;      // Number is negative, flip the sorting order

                        if(Radix<128)       *mLink[Radix]++ = i;        // Number is positive, same as above

                        else                *(--mLink[Radix]) = i;      // Number is negative, flip the sorting order

                    }

                    VALIDATE_RANKS;

                }

                else

                {

                    for(udword i=0;i<nb;i++)

                    {

                        udword Radix = input[mRanks[i]]>>24;                            // Radix byte, same as above. AND is useless here (udword).

                        // ### cmp to be killed. Not good. Later.

//                      if(Radix<128)       mRanks2[mOffset[Radix]++] = mRanks[i];      // Number is positive, same as above

//                      else                mRanks2[--mOffset[Radix]] = mRanks[i];      // Number is negative, flip the sorting order

                        if(Radix<128)       *mLink[Radix]++ = mRanks[i];        // Number is positive, same as above

                        else                *(--mLink[Radix]) = mRanks[i];      // Number is negative, flip the sorting order

                    }

                }

                // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.

                udword* Tmp = mRanks;   mRanks = mRanks2; mRanks2 = Tmp;

            }

            else

            {

                // The pass is useless, yet we still have to reverse the order of current list if all values are negative.

                if(UniqueVal>=128)

                {

                    if(INVALID_RANKS)

                    {

                        // ###Possible?

                        for(udword i=0;i<nb;i++)    mRanks2[i] = nb-i-1;

                        VALIDATE_RANKS;

                    }

                    else

                    {

                        for(udword i=0;i<nb;i++)    mRanks2[i] = mRanks[nb-i-1];

                    }


                    // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.

                    udword* Tmp = mRanks;   mRanks = mRanks2; mRanks2 = Tmp;

                }

            }

        }

    }

    return *this;

}


udword RadixSort::GetUsedRam() const

{

    udword UsedRam = sizeof(RadixSort);

#ifndef RADIX_LOCAL_RAM

    UsedRam += 256*4*sizeof(udword);            // Histograms

    UsedRam += 256*sizeof(udword);              // Offsets

#endif

    UsedRam += 2*CURRENT_SIZE*sizeof(udword);   // 2 lists of indices

    return UsedRam;

}