Vegastrike 0.5.1 rc1  1.0
Original sources for Vegastrike Evolved
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
IceRevisitedRadix.cpp
Go to the documentation of this file.
1 
8 
11 
39 
41 /*
42 To do:
43  - add an offset parameter between two input values (avoid some data recopy sometimes)
44  - unroll ? asm ?
45  - 11 bits trick & 3 passes as Michael did
46  - prefetch stuff the day I have a P3
47  - make a version with 16-bits indices ?
48 */
49 
51 // Precompiled Header
52 #include "Stdafx.h"
53 
54 
55 using namespace Opcode;
56 
57 #define INVALIDATE_RANKS mCurrentSize|=0x80000000
58 #define VALIDATE_RANKS mCurrentSize&=0x7fffffff
59 #define CURRENT_SIZE (mCurrentSize&0x7fffffff)
60 #define INVALID_RANKS (mCurrentSize&0x80000000)
61 
62 #define CHECK_RESIZE(n) \
63  if(n!=mPreviousSize) \
64  { \
65  if(n>mCurrentSize) Resize(n); \
66  else ResetRanks(); \
67  mPreviousSize = n; \
68  }
69 
70 #define CREATE_HISTOGRAMS(type, buffer) \
71  /* Clear counters/histograms */ \
72  ZeroMemory(mHistogram, 256*4*sizeof(udword)); \
73  \
74  /* Prepare to count */ \
75  ubyte* p = (ubyte*)input; \
76  ubyte* pe = &p[nb*4]; \
77  udword* h0= &mHistogram[0]; /* Histogram for first pass (LSB) */ \
78  udword* h1= &mHistogram[256]; /* Histogram for second pass */ \
79  udword* h2= &mHistogram[512]; /* Histogram for third pass */ \
80  udword* h3= &mHistogram[768]; /* Histogram for last pass (MSB) */ \
81  \
82  bool AlreadySorted = true; /* Optimism... */ \
83  \
84  if(INVALID_RANKS) \
85  { \
86  /* Prepare for temporal coherence */ \
87  type* Running = (type*)buffer; \
88  type PrevVal = *Running; \
89  \
90  while(p!=pe) \
91  { \
92  /* Read input buffer in previous sorted order */ \
93  type Val = *Running++; \
94  /* Check whether already sorted or not */ \
95  if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
96  /* Update for next iteration */ \
97  PrevVal = Val; \
98  \
99  /* Create histograms */ \
100  h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
101  } \
102  \
103  /* If all input values are already sorted, we just have to return and leave the */ \
104  /* previous list unchanged. That way the routine may take advantage of temporal */ \
105  /* coherence, for example when used to sort transparent faces. */ \
106  if(AlreadySorted) \
107  { \
108  mNbHits++; \
109  for(udword i=0;i<nb;i++) mRanks[i] = i; \
110  return *this; \
111  } \
112  } \
113  else \
114  { \
115  /* Prepare for temporal coherence */ \
116  udword* Indices = mRanks; \
117  type PrevVal = (type)buffer[*Indices]; \
118  \
119  while(p!=pe) \
120  { \
121  /* Read input buffer in previous sorted order */ \
122  type Val = (type)buffer[*Indices++]; \
123  /* Check whether already sorted or not */ \
124  if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
125  /* Update for next iteration */ \
126  PrevVal = Val; \
127  \
128  /* Create histograms */ \
129  h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
130  } \
131  \
132  /* If all input values are already sorted, we just have to return and leave the */ \
133  /* previous list unchanged. That way the routine may take advantage of temporal */ \
134  /* coherence, for example when used to sort transparent faces. */ \
135  if(AlreadySorted) { mNbHits++; return *this; } \
136  } \
137  \
138  /* Else there has been an early out and we must finish computing the histograms */ \
139  while(p!=pe) \
140  { \
141  /* Create histograms without the previous overhead */ \
142  h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
143  }
144 
145 #define CHECK_PASS_VALIDITY(pass) \
146  /* Shortcut to current counters */ \
147  udword* CurCount = &mHistogram[pass<<8]; \
148  \
149  /* Reset flag. The sorting pass is supposed to be performed. (default) */ \
150  bool PerformPass = true; \
151  \
152  /* Check pass validity */ \
153  \
154  /* If all values have the same byte, sorting is useless. */ \
155  /* It may happen when sorting bytes or words instead of dwords. */ \
156  /* This routine actually sorts words faster than dwords, and bytes */ \
157  /* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */ \
158  /* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
159  \
160  /* Get first byte */ \
161  ubyte UniqueVal = *(((ubyte*)input)+pass); \
162  \
163  /* Check that byte's counter */ \
164  if(CurCount[UniqueVal]==nb) PerformPass=false;
165 
167 
170 RadixSort::RadixSort() : mCurrentSize(0), mRanks(null), mRanks2(null), mTotalCalls(0), mNbHits(0)
172 {
173 #ifndef RADIX_LOCAL_RAM
174  // Allocate input-independent ram
175  mHistogram = new udword[256*4];
176  mOffset = new udword[256];
177 #endif
178  // Initialize indices
180 }
181 
183 
188 {
189  // Release everything
190 #ifndef RADIX_LOCAL_RAM
191  DELETEARRAY(mOffset);
192  DELETEARRAY(mHistogram);
193 #endif
194  DELETEARRAY(mRanks2);
195  DELETEARRAY(mRanks);
196 }
197 
199 
204 bool RadixSort::Resize(udword nb)
206 {
207  // Free previously used ram
208  DELETEARRAY(mRanks2);
209  DELETEARRAY(mRanks);
210 
211  // Get some fresh one
212  mRanks = new udword[nb]; CHECKALLOC(mRanks);
213  mRanks2 = new udword[nb]; CHECKALLOC(mRanks2);
214 
215  return true;
216 }
217 
218 inline_ void RadixSort::CheckResize(udword nb)
219 {
220  udword CurSize = CURRENT_SIZE;
221  if(nb!=CurSize)
222  {
223  if(nb>CurSize) Resize(nb);
224  mCurrentSize = nb;
226  }
227 }
228 
230 
238 RadixSort& RadixSort::Sort(const udword* input, udword nb, RadixHint hint)
240 {
241  // Checkings
242  if(!input || !nb || nb&0x80000000) return *this;
243 
244  // Stats
245  mTotalCalls++;
246 
247  // Resize lists if needed
248  CheckResize(nb);
249 
250 #ifdef RADIX_LOCAL_RAM
251  // Allocate histograms & offsets on the stack
252  udword mHistogram[256*4];
253 // udword mOffset[256];
254  udword* mLink[256];
255 #endif
256 
257  // Create histograms (counters). Counters for all passes are created in one run.
258  // Pros: read input buffer once instead of four times
259  // Cons: mHistogram is 4Kb instead of 1Kb
260  // We must take care of signed/unsigned values for temporal coherence.... I just
261  // have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
262  if(hint==RADIX_UNSIGNED) { CREATE_HISTOGRAMS(udword, input); }
263  else { CREATE_HISTOGRAMS(sdword, input); }
264 
265  // Compute #negative values involved if needed
266  udword NbNegativeValues = 0;
267  if(hint==RADIX_SIGNED)
268  {
269  // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
270  // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
271  // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
272  udword* h3= &mHistogram[768];
273  for(udword i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
274  }
275 
276  // Radix sort, j is the pass number (0=LSB, 3=MSB)
277  for(udword j=0;j<4;j++)
278  {
280 
281  // Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
282  // not a problem, numbers are correctly sorted anyway.
283  if(PerformPass)
284  {
285  // Should we care about negative values?
286  if(j!=3 || hint==RADIX_UNSIGNED)
287  {
288  // Here we deal with positive values only
289 
290  // Create offsets
291 // mOffset[0] = 0;
292 // for(udword i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
293  mLink[0] = mRanks2;
294  for(udword i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
295  }
296  else
297  {
298  // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
299 
300  udword i;
301  // Create biased offsets, in order for negative numbers to be sorted as well
302 // mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
303  mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
304 // for(udword i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
305  for(i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
306 
307  // Fixing the wrong place for negative values
308 // mOffset[128] = 0;
309  mLink[128] = mRanks2;
310 // for(i=129;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
311  for(i=129;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
312  }
313 
314  // Perform Radix Sort
315  ubyte* InputBytes = (ubyte*)input;
316  InputBytes += j;
317  if(INVALID_RANKS)
318  {
319 // for(udword i=0;i<nb;i++) mRanks2[mOffset[InputBytes[i<<2]]++] = i;
320  for(udword i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
322  }
323  else
324  {
325  udword* Indices = mRanks;
326  udword* IndicesEnd = &mRanks[nb];
327  while(Indices!=IndicesEnd)
328  {
329  udword id = *Indices++;
330 // mRanks2[mOffset[InputBytes[id<<2]]++] = id;
331  *mLink[InputBytes[id<<2]]++ = id;
332  }
333  }
334 
335  // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
336  udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
337  }
338  }
339  return *this;
340 }
341 
343 
351 RadixSort& RadixSort::Sort(const float* input2, udword nb)
353 {
354  // Checkings
355  if(!input2 || !nb || nb&0x80000000) return *this;
356 
357  // Stats
358  mTotalCalls++;
359 
360  udword* input = (udword*)input2;
361 
362  // Resize lists if needed
363  CheckResize(nb);
364 
365 #ifdef RADIX_LOCAL_RAM
366  // Allocate histograms & offsets on the stack
367  udword mHistogram[256*4];
368 // udword mOffset[256];
369  udword* mLink[256];
370 #endif
371 
372  // Create histograms (counters). Counters for all passes are created in one run.
373  // Pros: read input buffer once instead of four times
374  // Cons: mHistogram is 4Kb instead of 1Kb
375  // Floating-point values are always supposed to be signed values, so there's only one code path there.
376  // Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
377  // is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
378  // generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
379  // wouldn't work with mixed positive/negative values....
380  { CREATE_HISTOGRAMS(float, input2); }
381 
382  // Compute #negative values involved if needed
383  udword NbNegativeValues = 0;
384  // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
385  // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
386  // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
387  udword* h3= &mHistogram[768];
388  udword i;
389  for(i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
390 
391  // Radix sort, j is the pass number (0=LSB, 3=MSB)
392  for(udword j=0;j<4;j++)
393  {
394  // Should we care about negative values?
395  if(j!=3)
396  {
397  // Here we deal with positive values only
399 
400  if(PerformPass)
401  {
402  // Create offsets
403 // mOffset[0] = 0;
404  mLink[0] = mRanks2;
405 // for(udword i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
406  for(udword i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
407 
408  // Perform Radix Sort
409  ubyte* InputBytes = (ubyte*)input;
410  InputBytes += j;
411  if(INVALID_RANKS)
412  {
413 // for(i=0;i<nb;i++) mRanks2[mOffset[InputBytes[i<<2]]++] = i;
414  for(udword i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
416  }
417  else
418  {
419  udword* Indices = mRanks;
420  udword* IndicesEnd = &mRanks[nb];
421  while(Indices!=IndicesEnd)
422  {
423  udword id = *Indices++;
424 // mRanks2[mOffset[InputBytes[id<<2]]++] = id;
425  *mLink[InputBytes[id<<2]]++ = id;
426  }
427  }
428 
429  // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
430  udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
431  }
432  }
433  else
434  {
435  // This is a special case to correctly handle negative values
437 
438  if(PerformPass)
439  {
440  // Create biased offsets, in order for negative numbers to be sorted as well
441 // mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
442  mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
443 // for(udword i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
444  for(i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
445 
446  // We must reverse the sorting order for negative numbers!
447 // mOffset[255] = 0;
448  mLink[255] = mRanks2;
449 // for(i=0;i<127;i++) mOffset[254-i] = mOffset[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
450  for(i=0;i<127;i++) mLink[254-i] = mLink[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
451 // for(i=128;i<256;i++) mOffset[i] += CurCount[i]; // Fixing the wrong place for negative values
452  for(i=128;i<256;i++) mLink[i] += CurCount[i]; // Fixing the wrong place for negative values
453 
454  // Perform Radix Sort
455  if(INVALID_RANKS)
456  {
457  for(udword i=0;i<nb;i++)
458  {
459  udword Radix = input[i]>>24; // Radix byte, same as above. AND is useless here (udword).
460  // ### cmp to be killed. Not good. Later.
461 // if(Radix<128) mRanks2[mOffset[Radix]++] = i; // Number is positive, same as above
462 // else mRanks2[--mOffset[Radix]] = i; // Number is negative, flip the sorting order
463  if(Radix<128) *mLink[Radix]++ = i; // Number is positive, same as above
464  else *(--mLink[Radix]) = i; // Number is negative, flip the sorting order
465  }
467  }
468  else
469  {
470  for(udword i=0;i<nb;i++)
471  {
472  udword Radix = input[mRanks[i]]>>24; // Radix byte, same as above. AND is useless here (udword).
473  // ### cmp to be killed. Not good. Later.
474 // if(Radix<128) mRanks2[mOffset[Radix]++] = mRanks[i]; // Number is positive, same as above
475 // else mRanks2[--mOffset[Radix]] = mRanks[i]; // Number is negative, flip the sorting order
476  if(Radix<128) *mLink[Radix]++ = mRanks[i]; // Number is positive, same as above
477  else *(--mLink[Radix]) = mRanks[i]; // Number is negative, flip the sorting order
478  }
479  }
480  // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
481  udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
482  }
483  else
484  {
485  // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
486  if(UniqueVal>=128)
487  {
488  if(INVALID_RANKS)
489  {
490  // ###Possible?
491  for(udword i=0;i<nb;i++) mRanks2[i] = nb-i-1;
493  }
494  else
495  {
496  for(udword i=0;i<nb;i++) mRanks2[i] = mRanks[nb-i-1];
497  }
498 
499  // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
500  udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
501  }
502  }
503  }
504  }
505  return *this;
506 }
507 
509 
515 {
516  udword UsedRam = sizeof(RadixSort);
517 #ifndef RADIX_LOCAL_RAM
518  UsedRam += 256*4*sizeof(udword); // Histograms
519  UsedRam += 256*sizeof(udword); // Offsets
520 #endif
521  UsedRam += 2*CURRENT_SIZE*sizeof(udword); // 2 lists of indices
522  return UsedRam;
523 }