Compiled with VC++ 4.0, every trick in the book.
Machine: Pentium P55-166, 48 MBytes RAM, Windows NT 4.0

500x500 mm - normal algorithm                     utime      41.95 secs
500x500 mm - unrolled inner loop, factor of  4    utime      42.38 secs
500x500 mm - blocking, factor of  20              utime      10.05 secs
500x500 mm - transposed b matrix                  utime      12.84 secs
500x500 mm - Robert's algorithm                   utime      12.34 secs
500x500 mm -  20x 20 subarray (from T. Maeno)     utime       6.06 secs
500x500 mm -  20x 20 subarray (from D. Warner)    utime       5.81 secs

120x120 mm - normal algorithm                     utime       0.13 secs
120x120 mm - unrolled inner loop, factor of  4    utime       0.12 secs
120x120 mm - blocking, factor of  20              utime       0.13 secs
120x120 mm - transposed b matrix                  utime       0.11 secs
120x120 mm - Robert's algorithm                   utime       0.11 secs
120x120 mm -  20x 20 subarray (from T. Maeno)     utime       0.08 secs
120x120 mm -  20x 20 subarray (from D. Warner)    utime       0.07 secs

 60x 60 mm - normal algorithm                     utime       0.01 secs
 60x 60 mm - unrolled inner loop, factor of  4    utime       0.01 secs
 60x 60 mm - blocking, factor of  20              utime       0.02 secs
 60x 60 mm - transposed b matrix                  utime       0.01 secs
 60x 60 mm - Robert's algorithm                   utime       0.01 secs
 60x 60 mm -  20x 20 subarray (from T. Maeno)     utime       0.01 secs
 60x 60 mm -  20x 20 subarray (from D. Warner)    utime       0.01 secs

500x500 mm - normal algorithm                     utime 44.69 secs, 5.59 Mflops
500x500 mm - unrolled inner loop, factor of  4    utime 42.32 secs, 5.91 Mflops
500x500 mm - blocking, factor of  20              utime 12.14 secs, 20.60 Mflops
500x500 mm - transposed b matrix                  utime 12.99 secs, 19.25 Mflops
500x500 mm - Robert's algorithm                   utime 12.74 secs, 19.63 Mflops
500x500 mm -  20x 20 subarray (from T. Maeno)     utime  8.17 secs, 30.60 Mflops
500x500 mm -  20x 20 subarray (from D. Warner)    utime  5.67 secs, 44.11 Mflops

120x120 mm - normal algorithm                     utime  0.18 secs, 19.12 Mflops
120x120 mm - unrolled inner loop, factor of  4    utime  0.12 secs, 29.29 Mflops
120x120 mm - blocking, factor of  20              utime  0.16 secs, 21.74 Mflops
120x120 mm - transposed b matrix                  utime  0.11 secs, 31.96 Mflops
120x120 mm - Robert's algorithm                   utime  0.11 secs, 31.92 Mflops
120x120 mm -  20x 20 subarray (from T. Maeno)     utime  0.10 secs, 32.97 Mflops
120x120 mm -  20x 20 subarray (from D. Warner)    utime  0.07 secs, 51.97 Mflops

-- Athlon 900 MHz
500x500 mm - normal algorithm                     24.73 MFlops, utime 10.108 secs
500x500 mm - blocking, factor of  20              124.61 MFlops, utime 2.006 secs
500x500 mm - transposed b matrix                  78.83 MFlops, utime 3.171 secs
500x500 mm - Robert's algorithm                   130.25 MFlops, utime 1.919 secs
500x500 mm -  20x 20 subarray (from T. Maeno)     292.51 MFlops, utime 0.855 secs
500x500 mm -  20x 20 subarray (from D. Warner)    218.34 MFlops, utime 1.145 secs

120x120 mm - normal algorithm                     130.85 MFlops, utime 0.026 secs
120x120 mm - blocking, factor of  20              145.29 MFlops, utime 0.024 secs
120x120 mm - transposed b matrix                  354.01 MFlops, utime 0.010 secs
120x120 mm - Robert's algorithm                   378.22 MFlops, utime 0.009 secs
120x120 mm -  20x 20 subarray (from T. Maeno)     418.28 MFlops, utime 0.008 secs
120x120 mm -  20x 20 subarray (from D. Warner)    300.20 MFlops, utime 0.012 secs

 60x 60 mm - normal algorithm                     141.92 MFlops, utime 0.003 secs
 60x 60 mm - blocking, factor of  20              148.76 MFlops, utime 0.003 secs
 60x 60 mm - transposed b matrix                  392.73 MFlops, utime 0.001 secs
 60x 60 mm - Robert's algorithm                   406.78 MFlops, utime 0.001 secs
 60x 60 mm -  20x 20 subarray (from T. Maeno)     431.14 MFlops, utime 0.001 secs
 60x 60 mm -  20x 20 subarray (from D. Warner)    312.59 MFlops, utime 0.001 secs

-- Pentium 166 MHz
500x500 mm - normal algorithm                     5.00 MFlops, utime 49.952 secs
500x500 mm - blocking, factor of  20              14.00 MFlops, utime 17.856 secs
500x500 mm - transposed b matrix                  20.26 MFlops, utime 12.338 secs
500x500 mm - Robert's algorithm                   20.25 MFlops, utime 12.347 secs
500x500 mm -  20x 20 subarray (from T. Maeno)     30.19 MFlops, utime 8.282 secs
500x500 mm -  20x 20 subarray (from D. Warner)    39.00 MFlops, utime 6.410 secs

120x120 mm - normal algorithm                     14.53 MFlops, utime 0.238 secs
120x120 mm - blocking, factor of  20              14.38 MFlops, utime 0.240 secs
120x120 mm - transposed b matrix                  32.49 MFlops, utime 0.106 secs
120x120 mm - Robert's algorithm                   32.11 MFlops, utime 0.108 secs
120x120 mm -  20x 20 subarray (from T. Maeno)     32.11 MFlops, utime 0.108 secs
120x120 mm -  20x 20 subarray (from D. Warner)    43.13 MFlops, utime 0.080 secs

 60x 60 mm - normal algorithm                     17.97 MFlops, utime 0.024 secs
 60x 60 mm - blocking, factor of  20              14.48 MFlops, utime 0.030 secs
 60x 60 mm - transposed b matrix                  32.19 MFlops, utime 0.013 secs
 60x 60 mm - Robert's algorithm                   31.72 MFlops, utime 0.014 secs
 60x 60 mm -  20x 20 subarray (from T. Maeno)     32.19 MFlops, utime 0.013 secs
 60x 60 mm -  20x 20 subarray (from D. Warner)    43.20 MFlops, utime 0.010 secs

500x500 mm - normal algorithm                     5.02 MFlops, utime 49.781 secs
500x500 mm - blocking, factor of  20              13.94 MFlops, utime 17.936 secs
500x500 mm - transposed b matrix                  20.16 MFlops, utime 12.398 secs
500x500 mm - Robert's algorithm                   20.26 MFlops, utime 12.338 secs
500x500 mm -  20x 20 subarray (from T. Maeno)     30.22 MFlops, utime 8.272 secs
500x500 mm -  20x 20 subarray (from D. Warner)    38.35 MFlops, utime 6.519 secs

120x120 mm - normal algorithm                     14.45 MFlops, utime 0.239 secs
120x120 mm - blocking, factor of  20              14.39 MFlops, utime 0.240 secs
120x120 mm - transposed b matrix                  32.88 MFlops, utime 0.105 secs
120x120 mm - Robert's algorithm                   32.49 MFlops, utime 0.106 secs
120x120 mm -  20x 20 subarray (from T. Maeno)     32.07 MFlops, utime 0.108 secs
120x120 mm -  20x 20 subarray (from D. Warner)    43.13 MFlops, utime 0.080 secs

 60x 60 mm - normal algorithm                     17.99 MFlops, utime 0.024 secs
 60x 60 mm - blocking, factor of  20              14.48 MFlops, utime 0.030 secs
 60x 60 mm - transposed b matrix                  32.19 MFlops, utime 0.013 secs
 60x 60 mm - Robert's algorithm                   32.19 MFlops, utime 0.013 secs
 60x 60 mm -  20x 20 subarray (from T. Maeno)     31.72 MFlops, utime 0.014 secs
 60x 60 mm -  20x 20 subarray (from D. Warner)    43.11 MFlops, utime 0.010 secs
