I have benchmarks!
These are the results for my eMachines T6216 (Athlon 64 3200+ at 2.01
GHz) running Windows XP Home (32 bit). I have a few fancy flags in
boot.ini that probably affect scheduling…
The benchmark uses QueryPerformanceCounter() so it’s Windows only for
the moment. Shouldn’t be hard to convert it if anyone wants to try it
on Linux.
I compiled using VC10 compiler in the Windows SDK build environment.
The benchmark was compiled like so:
cl /c /O2tx /arch:SSE2 /EHsc bittest.cpp
link /out:bittest.exe bittest.obj
and like so:
cl /c /EHsc bittest.cpp
link /out:bittest.exe bittest.obj
The revised bittest.cpp and results are attached.
-------------- next part --------------
#include
#include
#include
#include <conio.h>
#include
void printbits(const void* base, unsigned offset, unsigned length) {
const unsigned char* b = (const unsigned char*) base;
b += offset / 8;
offset %= 8;
for (; length--; ++offset) {
if (offset == 8) {
offset = 0;
++b;
}
std::printf("%d", (*b >> offset) & 1);
}
}
int cmpbits(const void* base1, unsigned off1,
const void* base2, unsigned off2,
unsigned length) {
const unsigned char* b1;
const unsigned char* b2;
b1 = (unsigned char*) base1;
b2 = (const unsigned char*) base2;
while (length--) {
b1 += off1 >> 3;
off1 &= 7;
b2 += off2 >> 3;
off2 &= 7;
switch ((((*b1 >> off1++) & 1) << 1) |
(*b2 >> off2++) & 1) {
case 1:
return -1;
case 2:
return 1;
}
}
return 0;
}
//bit by bit
void movebits1( void* dstBase, unsigned dstOff,
const void* srcBase, unsigned srcOff,
unsigned length ) {
unsigned char* d = (unsigned char*) dstBase;
const unsigned char* s = (const unsigned char*) srcBase;
while (length--) {
d += dstOff >> 3;
dstOff &= 7;
s += srcOff >> 3;
srcOff &= 7;
*d = (*d & ~(1 << dstOff)) | (((*s >> srcOff) & 1) << dstOff);
++srcOff;
++dstOff;
}
}
void movebits2( void* dstBase, unsigned dstOff,
const void* srcBase, unsigned srcOff,
unsigned length ) {
unsigned char* d;
const unsigned char* s;
d = (unsigned char*) dstBase;
s = (const unsigned char*) srcBase;
// normalize offset and base pointers
d += dstOff >> 3;
dstOff &= 7;
s += srcOff >> 3;
srcOff &= 7;
// deal with short transfers
switch (((dstOff + length <= 8) << 1) |
(srcOff + length <= 8)) {
case 3:
*d = (*d & ~((~(255 << length)) << dstOff)) | (((*s >> srcOff) & ~(255 << length)) << dstOff);
return;
case 2:
*d = (*d & ~((~(255 << length)) << dstOff)) | (((unsigned char) (*(unsigned short*)s >> srcOff) & ~(255 << length)) << dstOff);
return;
case 1:
*(unsigned short*)d = (*(unsigned short*)d & ~((~(0xFFFF << length)) << dstOff)) | (((*s >> srcOff) & ~(255 << length)) << dstOff);
return;
}
// deal with dstOff
if (dstOff != 0) {
*d = (*d & ~(255 << dstOff)) | ((unsigned char) (*(unsigned short*)s >> srcOff) << dstOff);
++d;
// adjust length and srcOff
length -= 8 - dstOff;
srcOff += 8 - dstOff;
dstOff = 0;
s += srcOff >> 3;
srcOff &= 7;
}
// copy remaining bits
switch (((unsigned)((length & 7) == 0) << 1) |
(unsigned)(srcOff == 0)) {
case 0: { /* default */
unsigned l2 = 255 << (length & 7);
for (length = length >> 3; length--; ++d, ++s) {
*d = (unsigned char)(*(unsigned short*)s >> srcOff);
}
*d = (*d & l2) | ((unsigned char)(*(unsigned short*)s >> srcOff) & ~l2);
break;
}
case 1: { /* srcOff == 0 */
unsigned l2 = 255 << (length & 7);
memcpy(d, s, length = length >> 3); d += length; s += length;
*d = (*d & l2) | (*s & ~l2);
break;
}
case 2: /* (length & 7) == 0 */
for (length = length >> 3; length--; ++d, ++s) {
*d = (unsigned char)(*(unsigned short*)s >> srcOff);
}
break;
case 3: /* moving bytes */
memcpy(d, s, length >> 3);
}
}
int test() {
unsigned char buffer[16];
unsigned i, A, B, L;
srand(time(0));
while (!_kbhit()) {
for (i = 0; i < sizeof(buffer); ++i) {
buffer[i] = (unsigned char) rand();
}
L = rand() % (8 * sizeof(buffer) / 3);
A = rand() % (8 * sizeof(buffer) - L);
do {
B = rand() % (8 * sizeof(buffer) - L);
} while ((A <= B && A+L > B) || (B <= A && B+L > A));
printf("\nA = %d, B = %d, L = %d\n", A, B, L);
for (i = 0; i < 64; ++i) {
printf((i&7)?"%c":(i&63)?" %c":"\n%c",
(A <= i && A+L > i)? 'A':
(B <= i && B+L > i)? 'B': ' '
);
}
for (i = 0; i < sizeof(buffer); ++i) {
printf((i&7)?" ":"\n");
printbits(buffer, i*8, 8);
}
for (i = sizeof(buffer)*8 - 64; i < sizeof(buffer) * 8; ++i) {
printf((i&7)?"%c":(i&63)?" %c":"\n%c",
(A <= i && A+L > i)? 'A':
(B <= i && B+L > i)? 'B': ' '
);
}
printf("\nA -> ");
printbits(buffer, A, L);
printf("\nB -> ");
printbits(buffer, B, L);
printf("\n\nMoving A to B...\n");
movebits2(buffer, B, buffer, A, L);
for (i = 0; i < 64; ++i) {
printf((i&7)?"%c":(i&63)?" %c":"\n%c",
(A <= i && A+L > i)? 'A':
(B <= i && B+L > i)? 'B': ' '
);
}
for (i = 0; i < sizeof(buffer); ++i) {
printf((i&7)?" ":"\n");
printbits(buffer, i*8, 8);
}
for (i = sizeof(buffer)*8 - 64; i < sizeof(buffer) * 8; ++i) {
printf((i&7)?"%c":(i&63)?" %c":"\n%c",
(A <= i && A+L > i)? 'A':
(B <= i && B+L > i)? 'B': ' '
);
}
printf("\nA -> ");
printbits(buffer, A, L);
printf("\nB -> ");
printbits(buffer, B, L);
printf("\n");
if (cmpbits(buffer, A, buffer, B, L) != 0) {
printf ("\nA and B don't match\n");
return 1;
}
}
return 0;
}
#include <windows.h>
int bench(unsigned count, size_t size) {
unsigned i;
srand(time(0));
// printf ("\nCreating dataset… “);
unsigned char* buffer = new unsigned char[size];
for (i = 0; i < size; ++i) {
buffer[i] = rand();
// printf((i&7)?” “:”\n");
// printbits(buffer, i*8, 8);
}
// printf ("\n\nInitializing variables…\n");
unsigned* vars = new unsigned[count*3];
for (i = count; i–; ) {
unsigned& A = vars[i*3+0];
unsigned& B = vars[i*3+1];
unsigned& L = vars[i*3+2];
L = rand() % (8 * size / 3);
A = rand() % (8 * size - L);
do {
B = rand() % (8 * size - L);
} while ((A <= B && A+L > B) || (B <= A && B+L > A));
// printf("%3d %3d %3d\t", A, B, L);
}
printf ("\n\nRunning benchmark (%u iterations, %u byte buffer)...", count, size);
long long freq = 0, start = 0, end = 0;
QueryPerformanceFrequency((LARGE_INTEGER*) &freq);
printf("\nmovebits1: ");
QueryPerformanceCounter((LARGE_INTEGER*) &start);
for (i = count; i--; ) {
movebits1(buffer, vars[i*3+0], buffer, vars[i*3+1], vars[i*3+2]);
}
QueryPerformanceCounter((LARGE_INTEGER*) &end);
printf ("%lf seconds", (double)(end-start)/freq);
printf("\nmovebits2: ");
QueryPerformanceCounter((LARGE_INTEGER*) &start);
for (i = count; i--; ) {
movebits2(buffer, vars[i*3+0], buffer, vars[i*3+1], vars[i*3+2]);
}
QueryPerformanceCounter((LARGE_INTEGER*) &end);
printf ("%lf seconds", (double)(end-start)/freq);
printf("\nempty loop:");
unsigned junk = 0;
QueryPerformanceCounter((LARGE_INTEGER*) &start);
for (i = count; i--; ) {
junk |= vars[i*3+0]|vars[i*3+1]|vars[i*3+2];
}
QueryPerformanceCounter((LARGE_INTEGER*) &end);
printf ("%lf seconds\n", (double)(end-start)/freq, junk);
delete vars;
delete buffer;
return 0;
}
int main(int, char**) {
bench(1000000, 256);
bench(1000000, 128);
bench(1000000, 64);
bench(1000000, 32);
bench(1000000, 16);
bench(1000000, 8);
bench(1000000, 4);
bench(1000000, 2);
bench(1000000, 1);
return 0;
}
-------------- next part --------------
Running benchmark (1000000 iterations, 256 byte buffer)…
movebits1: 1.810916 seconds
movebits2: 0.101904 seconds
empty loop:0.006286 seconds
Running benchmark (1000000 iterations, 128 byte buffer)…
movebits1: 0.905211 seconds
movebits2: 0.072261 seconds
empty loop:0.006125 seconds
Running benchmark (1000000 iterations, 64 byte buffer)…
movebits1: 0.456868 seconds
movebits2: 0.058807 seconds
empty loop:0.006458 seconds
Running benchmark (1000000 iterations, 32 byte buffer)…
movebits1: 0.237964 seconds
movebits2: 0.049728 seconds
empty loop:0.006458 seconds
Running benchmark (1000000 iterations, 16 byte buffer)…
movebits1: 0.124081 seconds
movebits2: 0.042296 seconds
empty loop:0.006100 seconds
Running benchmark (1000000 iterations, 8 byte buffer)…
movebits1: 0.069005 seconds
movebits2: 0.036983 seconds
empty loop:0.006107 seconds
Running benchmark (1000000 iterations, 4 byte buffer)…
movebits1: 0.040075 seconds
movebits2: 0.030821 seconds
empty loop:0.005774 seconds
Running benchmark (1000000 iterations, 2 byte buffer)…
movebits1: 0.026830 seconds
movebits2: 0.022643 seconds
empty loop:0.006423 seconds
Running benchmark (1000000 iterations, 1 byte buffer)…
movebits1: 0.016274 seconds
movebits2: 0.022905 seconds
empty loop:0.006127 seconds
-------------- next part --------------
Running benchmark (1000000 iterations, 256 byte buffer)…
movebits1: 4.619098 seconds
movebits2: 0.389204 seconds
empty loop:0.012779 seconds
Running benchmark (1000000 iterations, 128 byte buffer)…
movebits1: 2.296133 seconds
movebits2: 0.228250 seconds
empty loop:0.012435 seconds
Running benchmark (1000000 iterations, 64 byte buffer)…
movebits1: 1.168352 seconds
movebits2: 0.146105 seconds
empty loop:0.012440 seconds
Running benchmark (1000000 iterations, 32 byte buffer)…
movebits1: 0.586166 seconds
movebits2: 0.104490 seconds
empty loop:0.012716 seconds
Running benchmark (1000000 iterations, 16 byte buffer)…
movebits1: 0.307077 seconds
movebits2: 0.080317 seconds
empty loop:0.012499 seconds
Running benchmark (1000000 iterations, 8 byte buffer)…
movebits1: 0.170761 seconds
movebits2: 0.067020 seconds
empty loop:0.012712 seconds
Running benchmark (1000000 iterations, 4 byte buffer)…
movebits1: 0.091559 seconds
movebits2: 0.053966 seconds
empty loop:0.012461 seconds
Running benchmark (1000000 iterations, 2 byte buffer)…
movebits1: 0.055960 seconds
movebits2: 0.038912 seconds
empty loop:0.012841 seconds
Running benchmark (1000000 iterations, 1 byte buffer)…
movebits1: 0.032166 seconds
movebits2: 0.036359 seconds
empty loop:0.012471 secondsOn 28/06/2010, Kenneth Bull <@Kenneth_Bull> wrote:
Here’s movebits2, which should be a bit faster. I still need to do
benchmarks, and I haven’t touched Bob’s Duff’s Device suggestion yet.
I’m not sure if what I’ve done with the switch statements would be an
improvement over ifs or not. I guess it depends on how the compiler
optimizes conditional expressions.