/* matrix multiply tests -- C language, version 1.0, May 1993 * * compile with -DN= * * I usually run a script file * time.script 500 >500.times * where the script file contains * cc -O -DN=$1 mm.c * a.out -n (I suggest at least two runs per method to * a.out -n alert you to variations. Five or ten runs * a.out -t each, giving avg. and std dev. of times is * a.out -t best.) * ... * * 500x500 mm - normal algorithm utime 230.81 secs * 500x500 mm - normal algorithm utime 230.72 secs * 500x500 mm - temporary variable in loop utime 231.00 secs * 500x500 mm - temporary variable in loop utime 230.79 secs * 500x500 mm - unrolled inner loop, factor of 8 utime 232.09 secs * 500x500 mm - unrolled inner loop, factor of 8 utime 231.84 secs * 500x500 mm - pointers used to access matrices utime 230.74 secs * 500x500 mm - pointers used to access matrices utime 230.45 secs * 500x500 mm - blocking, factor of 32 utime 60.40 secs * 500x500 mm - blocking, factor of 32 utime 60.57 secs * 500x500 mm - interchanged inner loops utime 27.36 secs * 500x500 mm - interchanged inner loops utime 27.40 secs * 500x500 mm - 20x20 subarray (from D. Warner) utime 9.49 secs * 500x500 mm - 20x20 subarray (from D. Warner) utime 9.50 secs * 500x500 mm - 20x20 subarray (from T. Maeno) utime 9.10 secs * 500x500 mm - 20x20 subarray (from T. Maeno) utime 9.05 secs * * The algorithms can also be sensitive to TLB thrashing. On a 600x600 * test an IBM RS/6000 Model 30 showed variations depending on relative * location of the matrices. (The model 30 has 64 TLB entries organized * as 2-way set associative.) * * 600x600 mm - 20x20 subarray (from T. Maeno) utime 19.12 secs * 600x600 mm - 20x20 subarray (from T. Maeno) utime 19.23 secs * 600x600 mm - 20x20 subarray (from D. Warner) utime 18.87 secs * 600x600 mm - 20x20 subarray (from D. Warner) utime 18.64 secs * 600x600 mm - 20x20 btranspose (Warner/Smotherman) utime 17.70 secs * 600x600 mm - 20x20 btranspose (Warner/Smotherman) utime 17.76 secs * * Changing the declaration to include 10000 dummy entries between the * b and c matrices (suggested by T. Maeno), i.e., * * double a[N][N],b[N][N],dummy[10000],c[N][N],d[N][N],bt[N][N]; * * 600x600 mm - 20x20 subarray (from T. Maeno) utime 16.41 secs * 600x600 mm - 20x20 subarray (from T. Maeno) utime 16.40 secs * 600x600 mm - 20x20 subarray (from D. Warner) utime 16.68 secs * 600x600 mm - 20x20 subarray (from D. Warner) utime 16.67 secs * 600x600 mm - 20x20 btranspose (Warner/Smotherman) utime 16.97 secs * 600x600 mm - 20x20 btranspose (Warner/Smotherman) utime 16.98 secs * */ /***************************************************************/ /* Timer options. You MUST uncomment one of the options below */ /* or compile, for example, with the '-DUNIX' option. */ /***************************************************************/ /* #define Amiga */ /* #define UNIX */ /* #define UNIX_Old */ /* #define VMS */ /* #define BORLAND_C */ /* #define MSC */ /* #define MAC */ /* #define IPSC */ /* #define FORTRAN_SEC */ /* #define GTODay */ /* #define CTimer */ /* #define UXPM */ /* #define MAC_TMgr */ /* #define PARIX */ /* #define POSIX */ /* #define WIN32 */ /* #define POSIX1 */ /***********************/ #include #include /* #include */ /* #include */ void msg(); void normal(); void tnsq(); void unroll4(); void unroll8(); void unroll16(); void pnsq(); void transpose(); void reg_loops(); void tiling(); void maeno(); void warner(); void robert(); void exit(); double a[N][N],b[N][N],c[N][N],bt[N][N],d[N][N]; main(argc,argv) int argc; char *argv[]; { char alg_choice; int parm; int i,j; double sum,row_sum; double t_start=0.0,t_end=0.0,dtime(); /* defaults */ alg_choice = 'x'; parm = 4; i = 1; while(argc>1){ switch(argv[i][0]){ case '-': alg_choice = argv[i][1]; break; default: sscanf(argv[i],"%d",&parm); } argc--; i++; } switch(alg_choice){ case 'n': printf("%3dx%3d mm - normal algorithm ",N,N); break; case 'v': printf("%3dx%3d mm - temporary variable in loop ",N,N); break; case 'p': printf("%3dx%3d mm - pointers used to access matrices ",N,N); break; case 't': printf("%3dx%3d mm - transposed b matrix ",N,N); break; case 'i': printf("%3dx%3d mm - interchanged inner loops ",N,N); break; case 'u': if((parm!=4)&&(parm!=8)&&(parm!=16)){ printf("%s: unrolling factor of %d not implemented\n",argv[0],parm); printf(" usage: %s -