Friday, February 08, 2008

Cache Thrash

A program designed to test 64-bitness and to exhibit poor cache performance on 5 GB of RAM:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <assert.h>

/*
 * Print a progress line showing how many minutes have elapsed since
 * `start`, followed by `message`.
 *
 * start   - reference time previously obtained from gettimeofday()
 * message - text appended after the elapsed time
 */
void time_point(const struct timeval start, const char *message)
{
  struct timeval current;
  time_t whole_seconds;
  long micro_part;
  double minutes;

  gettimeofday(&current, 0);

  /* The microsecond difference may be negative; adding it to the
     seconds difference below still yields the right total. */
  whole_seconds = current.tv_sec - start.tv_sec;
  micro_part = (long)(current.tv_usec - start.tv_usec);

  minutes = (((double)micro_part) / 1000000 + (double)whole_seconds) / 60;

  printf("t=%9.4f minutes: %s\n", minutes, message);
}

/*
 * Cache-thrash benchmark.
 *
 * Allocates a buffer of `modulo` bytes (just under 5 GiB), zeroes it,
 * then sets every byte to 1 by visiting indices in the order
 * 1, 2, 4, 8, ... (repeated doubling modulo `modulo`), which jumps all
 * over the buffer instead of walking it sequentially.  Finally the
 * buffer is summed linearly and the total is checked against `modulo`.
 * A timestamp is printed after each phase.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 * The asserts abort if size_t is not wider than 4 bytes or if the final
 * sum does not match.
 */
int main(void){

  size_t one=1;
  size_t gigabyte=one << 30;
  /* Original note: znprimroot(precprime(5*2^30)).
     Presumably PARI/GP notation: 5*2^30-3 would be the largest prime
     below 5*2^30, with 2 as a primitive root -- TODO confirm.  The
     final assert only passes if doubling reaches every nonzero index. */
  size_t modulo=5*gigabyte-3;

  char*m;
  size_t p=1;
  size_t count;

  struct timeval start;
  gettimeofday(&start,0);

  assert(sizeof(size_t)>4);
  /* should be 8 */
  /* %u requires an unsigned argument, so cast to unsigned (not int). */
  printf("sizeof(size_t)=%u   %f MB\n",(unsigned)(sizeof(size_t)),
         ((double)modulo)/1000000);

  m=malloc(modulo);
  if(m==NULL){
    /* Without this check memset below would write through NULL. */
    perror("malloc");
    return EXIT_FAILURE;
  }
  time_point(start,"malloc done");
  memset(m,0,modulo);
  time_point(start,"memset done");

  /* Index 0 is never produced by the doubling sequence; set it by hand. */
  m[0]=1;
  for(count=1;count<modulo;++count){

    m[p]=1;
    p*=2;
    p%=modulo;

  }
  time_point(start,"set1 done");

  /* Sequential pass: every byte should now be 1, so the sum is modulo. */
  p=0;
  for(count=0;count<modulo;++count)p+=m[count];
  time_point(start,"total done");
  assert(modulo==p);

  free(m);
  return 0;
}

An AMD machine:

 time ./a.out
sizeof(size_t)=8   5368.709117 MB
t=   0.0000 minutes: malloc done
t=   0.2224 minutes: memset done
t=  20.8318 minutes: set1 done
t=  21.5037 minutes: total done

real    21m31.113s
user    20m53.114s
sys     0m9.665s

UltraSPARC (Sun Fire V440)

 time ./a.out
sizeof(size_t)=8 5368.709117 MB
t=    0.000 minutes: malloc done
t=    0.166 minutes: memset done
t=   23.356 minutes: set1 done
t=   23.944 minutes: total done

real    23m59.392s
user    23m49.118s
sys     0m6.925s

Update 11-2010, Intel Core i7

$ time ./cache-thrash 
sizeof(size_t)=8   5368.709117 MB
t=   0.0000 minutes: malloc done
t=   0.0336 minutes: memset done
t=   4.6457 minutes: set1 done
t=   4.7146 minutes: total done

real 4m43.056s
user 4m41.626s
sys 0m1.448s

2 comments :

Anonymous said...

Could you plz briefly explain the source code? Especially, why it measures the CPU cache performance rather than the VM(swap) ?

Ken said...

You need at least 5 GB of RAM to run this test, so it fits entirely into memory, not needing VM (swap).