On the numerical evaluation of factorials IV

Algorithms for Faster Multiplication

And finally I proudly present…uh, whatever… here are the Toom-Cook 2,3,(4) and FFT functions to speed up multiplication and squaring.

This function to divide a Bigdecimal by a Number is used in the Toom-Cook multiplication algorithms, so, as much as I am disapointed about forgetting it, it is listed here.

Bigdecimal.prototype.divD = function(n){
  var t    = this.digits;
  var tlen = this.digits.length;
  var tsgn = this.sign;
  var bsgn = (n<0)?-1:1;
  var ret  = new Bigdecimal();
  
  var carry = 0;

  ret.sign = (tsgn == bsgn)?1:-1;
  ret.digits = new Array(tlen);
  ret.used = tlen;
  for(var i=tlen-1;i>=0;i--){
    var temp  = (t[i]+carry)/n;
    var ftemp = Math.floor(temp);
    carry = 0;
    if(temp != ftemp){
      //carry += Math.BIGDECIMAL_BASE*Math.round((temp-ftemp)*n);
      carry += (Math.BIGDECIMAL_BASE*carry +t[i]) %n
    }
    //ret.digits.unshift(ftemp);
    ret.digits[i] = ftemp;
    //ret.used++;
  }
  ret.clamp();
  return ret;
};

The Toom-Cook 2 algorithm, better known as the Karatsuba algorithm.

Bigdecimal.prototype.karatsuba = function(bdec){

  var x0,x1,y0,y1,z0,z1,z2,temp,xy;

  var tlen = this.used;
  var blen = bdec.used;

  x0 = new Bigdecimal();
  x1 = new Bigdecimal();
  y0 = new Bigdecimal();
  y1 = new Bigdecimal();


  var m = Math.min(tlen >> 1, blen >> 1);
  if(m <= 20)return this.lowlevel_mul(bdec)
  x1.digits = this.digits.slice(m,tlen);
  x1.used = x1.digits.length;
  x1.sign = this.sign;

  x0.digits = this.digits.slice(0,m);
  x0.used = x0.digits.length;
  x0.sign = this.sign;

  y1.digits = bdec.digits.slice(m,blen);
  y1.used = y1.digits.length;
  y1.sign = bdec.sign;

  y0.digits = bdec.digits.slice(0,m);
  y0.used = y0.digits.length;
  y0.sign = bdec.sign;

  z2 = x1.karatsuba(y1);
  z0 = x0.karatsuba(y0);

  // z1 = (x1 + x0)(y1 + y0) - z2 - z0
  z1 = ( (x1.add(x0)).karatsuba(y1.add(y0)) ).sub(z2).sub(z0)
  // z2 * B^(2*m) + z1 * B^m + z0
  xy = (z2.dShiftLeft(2*m)).add(z1.dShiftLeft(m)).add(z0)

  xy.sign = this.sign * bdec.sign;
  return xy;
};

The Toom-Cook 3 algorithm.

Bigdecimal.prototype.toomCookMul = function(bdec){
  var a0,a1,a2,b0,b1,b2,b3,tmp1,tmp2,w0,w1,w2,w3,w4,c;
  var tmp
  var tlen = this.used;
  var blen = bdec.used;

  var tsign = (this.s<0)?-1:1;
  var bsign = (this.s<0)?-1:1;
  sign = tsign * bsign;

  a0 = new Bigdecimal();
  a1 = new Bigdecimal();
  a2 = new Bigdecimal();
 
  b0 = new Bigdecimal();
  b1 = new Bigdecimal();
  b2 = new Bigdecimal();


  var m = Math.floor(Math.min(tlen,blen)/3);
  if(m <= 40)return this.karatsuba(bdec)
  a0.digits = this.digits.slice(0,m);        // a(mod l^B)
  a0.used   = a0.digits.length;
  a0.sign   = this.sign;
    
  a1.digits = this.digits.slice(m,2*m);      // a/(l^B),   a1(mod l^B)
  a1.used   = a1.digits.length;
  a1.sign   = this.sign;  
  
  a2.digits = this.digits.slice(2*m,tlen);   // a/(l^(2B)),a2(mod l^B)
  a2.used   = a2.digits.length;
  a2.sign   = this.sign;   
  // same with second factor
  
  b0.digits = bdec.digits.slice(0,m);        // b(mod l^B)
  b0.used   = b0.digits.length;
  b0.sign   = this.sign;  

  b1.digits = bdec.digits.slice(m,2*m);      // b/(l^B),   b1(mod l^B)
  b1.used   = b1.digits.length;
  b1.sign   = this.sign;  
  
  b2.digits = bdec.digits.slice(2*m,blen);   // b/(l^(2B)),b2(mod l^B)
  b2.used   = b2.digits.length;
  b2.sign   = this.sign;

  w0 = a0.toomCookMul(b0);
  w4 = a2.toomCookMul(b2);

  //w1 = (a2 + 2(a1 + 2a0))        (b2 + 2(b1 + 2b0))
  w1 = (a2.add((a1.add(a0.mulD(2)) ).mulD(2)))   .toomCookMul
       (b2.add((b1.add(b0.mulD(2)) ).mulD(2)))

  //w3 = (a0 + 2(a1 + 2a2))       (b0 + 2(b1 + 2b2))
  w3 = (a0.add( (a1.add(a2.mulD(2)) ).mulD(2) ) ) .toomCookMul
       (b0.add( (b1.add(b2.mulD(2)) ).mulD(2) ) )

  //w2 = (a2 + a1 + a0)           (b2 + b1 + b0) 
  w2 = (a2.add(a1).add(a0)) . toomCookMul(b2.add(b1).add(b0))

  // solve
  // 18a
  w1 = w1.sub(w4) // c
  // 18b
  w3 = w3.sub(w0) // c
  // 19a
  w1 = w1.divD(2); // c
  // 19b
  w3 = w3.divD(2); // 
  // 20a,b
  w2 = w2.sub(w0).sub(w4);   // c
  // 21a 
  w1 = w1.sub(w2);          // c
  // 21b
  w3 = w3.sub(w2);          // c
  // 22a,b
  w1 = w1.sub(w0.mulD(8));        // c
  // 22c,d
  w3 = w3.sub(w4.mulD(8));        // c
  // 23a,b,c
  w2 = w2.mulD(3).sub(w1).sub(w3);          // c
  // 24a
  w1 = w1.sub(w2);          // c
  // 24b

  w3 = w3.sub(w2);          // c
//alert(tmp +" - "+ w2 +"-"+ w3)
  // 25a
  w1 = w1.divD(3);      // c
  // 25b
  w3 = w3.divD(3);      // c
    // w0 = w0.dlShift(0*m)
  w1 = w1.dShiftLeft(1*m);     // c
  w2 = w2.dShiftLeft(2*m);     // c
  w3 = w3.dShiftLeft(3*m);     // c
  w4 = w4.dShiftLeft(4*m);     // c
  
  c = w0.add(w1).add(w2).add(w3).add(w4);
  
  c.sign = (sign<0)?-1:1;
  return c;
};

The Toom-Cook 4 algorithm (not tested, therefore not used).

Bigdecimal.prototype.toomCookMul4 = function(bdec){
  var a0,a1,a2,a3,b0,b1,b2,b3,b4,c;
  var S1,S2,S3,S4,S5,S6,S7;

  var tlen = this.used;
  var blen = bdec.used;

  var tsign = (this.s<0)?-1:1;
  var bsign = (this.s<0)?-1:1;
  sign = tsign * bsign;

  a0 = new Bigdecimal();
  a1 = new Bigdecimal();
  a2 = new Bigdecimal();
  a3 = new Bigdecimal();
 
  b0 = new Bigdecimal();
  b1 = new Bigdecimal();
  b2 = new Bigdecimal();
  b3 = new Bigdecimal();
  
  var m = Math.floor(Math.min(tlen,blen)/4);
  if(m <= 40)return this.lowlevel_mul(bdec);
  a0.digits = this.digits.slice(0,m);
  a0.used   = a0.digits.length;
  a0.sign   = this.sign;
    
  a1.digits = this.digits.slice(m,2*m);
  a1.used   = a1.digits.length;
  a1.sign   = this.sign;  
  
  a2.digits = this.digits.slice(2*m,3*m);
  a2.used   = a2.digits.length;
  a2.sign   = this.sign; 

  a3.digits = this.digits.slice(3*m,tlen);
  a3.used   = a3.digits.length;
  a3.sign   = this.sign; 

  b0.digits = bdec.digits.slice(0,m);
  b0.used   = b0.digits.length;
  b0.sign   = bdec.sign;
    
  b1.digits = bdec.digits.slice(m,2*m);
  b1.used   = b1.digits.length;
  b1.sign   = bdec.sign;  
  
  b2.digits = bdec.digits.slice(2*m,3*m);
  b2.used   = b2.digits.length;
  b2.sign   = bdec.sign; 

  b3.digits = bdec.digits.slice(3*m,blen);
  b3.used   = b3.digits.length;
  b3.sign   = bdec.sign; 

  S1 = a3.lowlevel_mul(b3);
  //(8*a3+4*a2+2*a1+a0)*(8*b3+4*b2+2*b1+b0)
  S2 = (a3.mulD(8).add(a2.mulD(4)).add(a1.add(a1).add(a0))).lowlevel_mul
       (b3.mulD(8).add(b2.mulD(4)).add(b1.add(b1).add(b0)));
  //PRINT("(8*"+a3+"+ 4*"+a2+"+ 2*"+a1+" +"+a0+" )*(8*"+b3+" +4*"+b2+" +2*"+b1+" +"+b0+") - " +S2 + "<br>")
  //(+a3+a2+a1+a0)*(+b3+b2+b1+b0)
  S3 = (a3.add(a2).add(a1).add(a0)).lowlevel_mul
       (b3.add(b2).add(b1).add(b0))
  // (-a3+a2-a1+a0)*(-b3+b2-b1+b0) 
  S4 = ((a2).sub(a1).add(a0).sub(a3)).lowlevel_mul
       ((b2).sub(b1).add(b0).sub(b3))   
  // (+8*a0+4*a1+2*a2+a3)*(+8*b0+4*b1+2*b2+b3)
  S5 = (a0.mulD(8).add(a1.mulD(4)).add(a2.add(a2)).add(a3) ).lowlevel_mul
       (b0.mulD(8).add(b1.mulD(4)).add(b2.add(b2)).add(b3) )
  //(-8*a0+4*a1-2*a2+a3)*(-8*b0+4*b1-2*b2+b3)
  S6 = (a0.mulD(-8).add(a1.mulD(4)).sub(a2.add(a2)).add(a3) ).lowlevel_mul
       (b0.mulD(-8).add(b1.mulD(4)).sub(b2.add(b2)).add(b3) )
       
  S7 = a0.lowlevel_mul(b0);

                  //PRINT(S2+" + "+S5+" - "+S2.add(S5)+"<br>")
  S2 = S2.add(S5);//PRINT(S4+" - "+S3+" - "+S4.sub(S3)+"<br>")
  S4 = S4.sub(S3);//PRINT(S6+" - "+S5+" - "+S6.sub(S5)+"<br>")
  S6 = S6.sub(S5);//PRINT(S4+"/2 - "+S4.divD(2)+"<br>")
  S4 = S4.divD(2);//PRINT(S5+" - "+S1+" - "+S5.sub(S1)+"<br>")
  S5 = S5.sub(S1);//PRINT(S5+" - ("+S7+" * 64) - "+S5.sub((S7.mulD(64)))+"<br>")
  S5 = S5.sub((S7.mulD(64)));//PRINT(S3+" + "+S4+" - "+S3.add(S4)+"<br>")
  S3 = S3.add(S4);//PRINT(S5+" + "+S5+" - "+S5.add(S5)+"<br>")
  S5 = S5.add(S5);//PRINT(S5+" + "+S6+" - "+S5.add(S6)+"<br>")
  S5 = S5.add(S6);//PRINT(S2+" - ("+S3+" * 65) - "+S2.sub((S3.mulD(65)))+"<br>")
  S2 = S2.sub((S3.mulD(65)));//PRINT(S3+" - "+S1+" - "+S3.sub(S1)+"<br>")
  S3 = S3.sub(S1);//PRINT(S3+" - "+S7+" - "+S3.sub(S7)+"<br>")
  S3 = S3.sub(S7);//PRINT(S4+"<br>"+S4.neg()+"<br>")
  S4 = S4.neg();//PRINT(S6+"<br>"+S6.neg()+"<br>")
  S6 = S6.neg();//PRINT(S2+" + ("+S3+" * 45) - "+S2.add((S3.mulD(45)))+"<br>")
  S2 = S2.add((S3.mulD(45)));//PRINT(S5+" - ("+S3+" * 8) - "+S5.sub((S3.mulD(8)))+"<br>")
  S5 = S5.sub((S3.mulD(8)));//PRINT(S5+"/24 - "+S5.divD(24)+"<br>")
  S5 = S5.divD(24);
  S6 = S6.sub(S2);
  S2 = S2.sub((S4.mulD(16)));
  S2 = S2.divD(18);
  S3 = S3.sub(S5);
  S4 = S4.sub(S2);
  S6 = S6.add((S2.mulD(30)));//PRINT("divrem("+S6+",60) -"+S6.divremD(60)+"<br>")
  S6 = S6.divD(60);
  S2 = S2.sub(S6);

 
  S1 = S1.dShiftLeft(6*m);
  S2 = S2.dShiftLeft(5*m);
  S3 = S3.dShiftLeft(4*m);
  S4 = S4.dShiftLeft(3*m);
  S5 = S5.dShiftLeft(2*m);
  S6 = S6.dShiftLeft(1*m);
  //S7 = S7.dShiftLeft(0*m); 
  //PRINT(S1 +" + "+S2 +" + "+S3 +" + "+S4 +" + "+S5 +" + "+S6 +" + "+S7 +" - "+
  //S1.add(S2).add(S3).add(S4).add(S5).add(S6).add(S7)+"<br><br>")
  c = S1.add(S2).add(S3).add(S4).add(S5).add(S6).add(S7);
  c.sign = (sign<0)?-1:1;
  return c
};

The long awaited FFT-multiplication. If that is not the most crude
implementation I’d eat my hat a banana.
Please turn your attention to the digit-conversion starting at line 145.

Bigdecimal.prototype.mulFFT  = function(bdec,base){
  var nextPow2 = function(n){
      var lg2 = Math.ceil(Math.log(n)*Math.LOG2E);
      return Math.pow(2,lg2);
  };
  var ispow2 = function(x){
    if(x<2)return false;
    if(x&(x-1))return false;
    return true;
  };
  var bitlength = function(x){
    if(x<2) {return undefined;}
    for(var i=0;i<52;i++){
      if(x&(1<<i))return i;
    }
  };
  var esrever = function(k,n){
    var rev = 0;
    for(var i=0;i<n;i++){
      rev = (rev<<1)|(k&1);
      k>>=1;
    }
    return rev;
  };

  var normalize = function(a){
        var t     = a;
        var tlen  = a.length;

        var ret   = new Array(tlen);
        var carry = 0;
        for(var i=0;i<tlen;i++){
          var temp = carry;
          carry = 0;
          temp += Math.round(t[i]);
          if(temp >= Math.BIGDECIMAL_BASE){
            carry = Math.floor(temp/Math.BIGDECIMAL_BASE);
            temp  = temp % Math.BIGDECIMAL_BASE;
          }   
          ret[i] = (temp);
        }
        if(carry)ret.push(carry);
	//while(ret[tlen-1] == 0){ret.pop();tlen--;}
        return ret;
      };

  var fftsimple = function(Ri, Ii,n,dir){
    var nbits;
    var bsize; var bend = 1;
  
    var angle = 2.0 * Math.PI;
    var tempreal, tempimaginary;
    var Ro = new Array(Ri.length);
    var Io = new Array(Ri.length);
  
    var ar  = new Array(3);
    var ai  = new Array(3);  
  
    if(!ispow2(n))return undefined;
    if(dir)angle = -angle;
  
    nbits = bitlength(n);
    for(var i=0;i<n;i++){
      j = esrever(i,nbits);
      Ro[j] = Ri[i];
      Io[j] = Ii[i];
    }
    for(bsize=2;bsize<=n;bsize<<=1){
      var quotangle = angle/bsize;
      var onesinangle = Math.sin ( -quotangle );
      var twosinangle = Math.sin ( -2 * quotangle );
      var onecosangle = Math.cos ( -quotangle );
      var twocosangle = Math.cos ( -2 * quotangle );
      var twiceonecosangle   = 2 * onecosangle;
  
      for(var i=0;i<n;i+=bsize){
        ar[2] = twocosangle;ar[1] = onecosangle;
        ai[2] = twosinangle;ai[1] = onesinangle;
        for(var j=i,l=0;l<bend;j++,l++ ){
              ar[0] = twiceonecosangle*ar[1] - ar[2];
              ar[2] = ar[1];ar[1] = ar[0];
              ai[0] = twiceonecosangle*ai[1] - ai[2];
              ai[2] = ai[1];ai[1] = ai[0];
              var k = j + bend;
              tempreal      = ar[0]*Ro[k] - ai[0]*Io[k];
              tempimaginary = ar[0]*Io[k] + ai[0]*Ro[k];
              Ro[k] = Ro[j] - tempreal;
              Io[k] = Io[j] - tempimaginary;
              Ro[j] += tempreal;
              Io[j] += tempimaginary;
        }
      }
      bend = bsize;
    }
    if(dir){
      for(i=0;i<n;i++){
         Ro[i] /= n;Io[i] /= n;
      }
    }
    return [Ro,Io]
  };
  var fftconv = function(ARi,AIi,BRi,BIi,n){
    var CRo = new Array(n);
    var CIo = new Array(n);
    for(var i=0;i<n;i++){
      CRo[i] = ARi[i]*BRi[i]-AIi[i]*BIi[i];
      CIo[i] = ARi[i]*BIi[i]+AIi[i]*BRi[i];
    }
    return [CRo,CIo];
  };
  
  var mulFFT = function(ARi,AIi,BRi,BIi,n){
    var retlen = 2;
    var ret;
    var fft1;var fft2;var conv;
  
    while(retlen<2*n)retlen*=2;
    var r1 = new Array(retlen).memset(0);
    var r2 = new Array(retlen).memset(0);
    var r3 = new Array(retlen).memset(0);
    var r4 = new Array(retlen).memset(0);

    ARi = ARi.concat(r1);
    AIi = AIi.concat(r2);
    BRi = BRi.concat(r3);
    BIi = BIi.concat(r4);


    //for(i=n;i<retlen;i++){
      //ARi.push(0);AIi.push(0);
      //BRi.push(0);BIi.push(0);
    //}
    fft1   = fftsimple(ARi,AIi,retlen,false);
    fft2   = fftsimple(BRi,BIi,retlen,false);
    conv   = fftconv(fft1[0],fft1[1],fft2[0],fft2[1],retlen);
    return   fftsimple(conv[0],conv[1],2*n,true);
  };

  var ret = new Bigdecimal();
  var ti,bi;

  var td   = this.dup();
  var bd   = bdec.dup();

  bd = bd.toString()
  td = td.toString()

  if(arguments.length == 2){
    if(base > 5) return Number.NaN;
    if(base == 5){
      Math.BIGDECIMAL_BASE       = 100000;
      Math.BIGDECIMAL_LOG10_BASE =      5;
    }
    else if(base == 4){
      Math.BIGDECIMAL_BASE       = 10000;
      Math.BIGDECIMAL_LOG10_BASE =     4;
    }
    else if(base == 3){
      Math.BIGDECIMAL_BASE       = 1000;
      Math.BIGDECIMAL_LOG10_BASE =    3;
    }
    else if(base == 2){
      Math.BIGDECIMAL_BASE       = 100;
      Math.BIGDECIMAL_LOG10_BASE =   2;
    }
    else if(base == 1){
      Math.BIGDECIMAL_BASE       = 10;
      Math.BIGDECIMAL_LOG10_BASE =  1;
    }
    else
      return Number.NaN;
  }
  else{
    Math.BIGDECIMAL_BASE       = 1000;
    Math.BIGDECIMAL_LOG10_BASE =    3;
  }

  td = td.toBigdecimal()
  bd = bd.toBigdecimal()

  var t    = td.digits;
  var tlen = td.used;

  var b    = bd.digits;
  var blen = bd.used;
  if(tlen<blen){
    var temp = new Array(blen-tlen).memset(0)
    t = t.concat(temp);
    tlen = blen;
  }
  if(blen<tlen){
    var temp = new Array(tlen-blen).memset(0)
     b = b.concat(temp);
    blen = tlen;
  }
  var pwr2 = nextPow2(tlen);
  var temp1 = new Array(pwr2-tlen).memset(0);
  var temp2 = new Array(pwr2-blen).memset(0);
  t = t.concat(temp1);
  tlen = pwr2;
  b = b.concat(temp2);
  blen = tlen;

  ti = new Array(tlen).memset(0);
  bi = new Array(blen).memset(0);

  ret.digits = normalize(mulFFT(t,ti,b,bi,tlen)[0]);
  ret.used = ret.digits.length;
  ret.clamp();
  ret = ret.toString()
  
  Math.BIGDECIMAL_BASE       = 10000000;
  Math.BIGDECIMAL_LOG10_BASE =        7;

  return ret.toBigdecimal();
};

The FFT-function above is only good for up to Bigdecimal.FFT_CUTOFF long Bigdecimals, so I used the Karatsuba algorithm to hack around in a simple way.

Bigdecimal.prototype.mulWithPoorFFT = function(bdec,limit,base){
    var x0,x1,y0,y1,z0,z1,z2,temp,xy;

  var tlen = this.used;
  var blen = bdec.used;

  x0 = new Bigdecimal();
  x1 = new Bigdecimal();
  y0 = new Bigdecimal();
  y1 = new Bigdecimal();


  var m = Math.min(tlen >> 1, blen >> 1);
  var M = Math.max(tlen, blen);
  if(M < 5*Bigdecimal.FFT_CUTOFF)return this.mulFFT(bdec,base)
  x1.digits = this.digits.slice(m,tlen);
  x1.used = x1.digits.length;
  x1.sign = this.sign;

  x0.digits = this.digits.slice(0,m);
  x0.used = x0.digits.length;
  x0.sign = this.sign;
  
  y1.digits = bdec.digits.slice(m,blen);
  y1.used = y1.digits.length;
  y1.sign = bdec.sign;

  y0.digits = bdec.digits.slice(0,m);
  y0.used = y0.digits.length;
  y0.sign = bdec.sign;


  z2 = x1.mulWithPoorFFT(y1,limit,base);
  z0 = x0.mulWithPoorFFT(y0,limit,base);

  // z1 = (x1 + x0)(y1 + y0) - z2 - z0
  z1 = ( (x1.add(x0)).mulWithPoorFFT(y1.add(y0),limit,base) ).sub(z2).sub(z0)

  xy = (z2.dShiftLeft(2*m)).add(z1.dShiftLeft(m)).add(z0)

  return xy;
}

We had squaring already, did we not?

Bigdecimal.prototype.squareNormal = function(){
  var t    = this.digits;
  var tlen = this.used;
  var ret  = new Bigdecimal();
  var retv;
  var uv = 0;
  var carry = 0;
  var i,j
  ret.digits = new Array(2*tlen+1).memset(0);
  ret.used = 2*tlen+1;
  retv = ret.digits;
  
  for(i=0;i<tlen;i++){
    uv       = retv[2*i]+ t[i]*t[i];
    retv[2*i] = uv%Math.BIGDECIMAL_BASE;             // v
    carry    = Math.floor(uv/Math.BIGDECIMAL_BASE);  // u
    for(j=i+1;j<tlen;j++){
      uv       = retv[i+j]+2*t[j]*t[i]+carry;
      retv[i+j] = uv%Math.BIGDECIMAL_BASE;            // v
      carry    = Math.floor(uv/Math.BIGDECIMAL_BASE); // u
    }
    retv[i+j] = carry;                         // u
  }  
//  if(carry)ret.digits.push(carry);
  ret.clamp();
  return ret;
};

Karatsuba squaring. Developing stopped here for now, I found no time yet to port the other Toom-Cook algorithms to Javascript.

Bigdecimal.prototype.karatsubaSquare = function(){

  var x0,x1,z0,z2,xy;

  var tlen = this.used;

  x0 = new Bigdecimal();
  x1 = new Bigdecimal();


  var m = tlen >> 1;
  if(m <= 20)return this.squareNormal();
  x1.digits = this.digits.slice(m,tlen);
  x1.used = x1.digits.length;

  x0.digits = this.digits.slice(0,m);
  x0.used = x0.digits.length;

  z2 = x1.karatsubaSquare();
  z0 = x0.karatsubaSquare();
  
  z1 = (x1.add(x0)).karatsubaSquare().sub(z2).sub(z0);
  xy = (z2.dShiftLeft(2*m)).add(z1.dShiftLeft(m)).add(z0);

  xy.sign = 1;

  return xy;
};

The Toom-Cook 3 algorithm for squaring.

Bigdecimal.prototype.toomCookSquare = function(){
  var a0,a1,a2,tmp1,w0,w1,w2,w3,w4,c;
  var tmp
  var tlen = this.used;

  sign = 1;

  a0 = new Bigdecimal();
  a1 = new Bigdecimal();
  a2 = new Bigdecimal();


  var m = Math.floor(tlen/3);
  if(m <= 40)return this.karatsubaSquare(bdec)
  a0.digits = this.digits.slice(0,m);        // a(mod l^B)
  a0.used   = a0.digits.length;
  a0.sign   = this.sign;
    
  a1.digits = this.digits.slice(m,2*m);      // a/(l^B),   a1(mod l^B)
  a1.used   = a1.digits.length;
  a1.sign   = this.sign;  
  
  a2.digits = this.digits.slice(2*m,tlen);   // a/(l^(2B)),a2(mod l^B)
  a2.used   = a2.digits.length;
  a2.sign   = this.sign;   


  w0 = a0.toomCookMulSquare();
  w1 = (a2.add(a1).add(a0)).toomCookSquare();
  w2 = (a2.sub(a1).add(a0)).toomCookSquare();
  w3 = (a1.add(a1)).mul(a2);
  w4 = a2.toomCookSquare();

  tmp1 = w1.add(w2);
  tmp1 = tmp1.divD(2);

  w1 = w1.sub(tmp1).sub(w3);
  
  w2 = tmp1.sub(w4).sub(w0);
  
  w1 = w1.dShiftLeft(1*m);     // c
  w2 = w2.dShiftLeft(2*m);     // c
  w3 = w3.dShiftLeft(3*m);     // c
  w4 = w4.dShiftLeft(4*m);     // c
  
  c = w0.add(w1).add(w2).add(w3).add(w4);
  
  c.sign = 1;
  return c;
};

The Toom-Cook 4 algorithm for squaring (not tested, therefore not used).

Bigdecimal.prototype.toomCook4Square = function(){
  var a0,a1,a2,a4,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6,tmp7,tmp8,
  var w0,w1,w2,w3,w4,w5,w6,w7,c;

  var tlen = this.used;

  sign = 1;

  a0 = new Bigdecimal();
  a1 = new Bigdecimal();
  a2 = new Bigdecimal();
  a3 = new Bigdecimal();

  var m = Math.floor(tlen/4);
  if(m <= 40)return this.karatsubaSquare(bdec)
  a0.digits = this.digits.slice(0,m);        // a(mod l^B)
  a0.used   = a0.digits.length;
  a0.sign   = this.sign;
    
  a1.digits = this.digits.slice(m,2*m);      // a/(l^B),   a1(mod l^B)
  a1.used   = a1.digits.length;
  a1.sign   = this.sign;  
  
  a2.digits = this.digits.slice(2*m,3*m);   // a/(l^(2B)),a2(mod l^B)
  a2.used   = a2.digits.length;
  a2.sign   = this.sign;   

  a3.digits = this.digits.slice(3*m,tlen);
  a3.used   = a3.digits.length;
  a3.sign   = this.sign; 


  w1 = a0.toomCook4Square();
  w2 = a0.add(a0).mul(a1);
  
  w3 = a0.add(a1).sub(a2).sub(a3);
  w3 = w3.mul(a0.sub(a1).sub(a2).add(a3));
  
  w4 = (a0.add(a1).add(a2).add(a3)).toomCook4Square();
  
  w5 = (a0.sub(a2)).mulD(2).mul(a1.sub(a3));
  
  w6 = a3.mul(a2).mulD(2);
  
  w7 = a3.toomCook4Square();

  tmp1 = w3.add(w4);             // T1 = S3 + S4
  tmp2 = (tmp1.add(w5)).divD(2); // T2 = (T1 + S5)/2
  tmp3 = w2.add(w6);             // T3 = S2 + S6
  tmp4 = tmp2.sub(tmp3);         // T4 = T2 - T3
  tmp5 = tmp3.sub(w5);           // T5 = T3 - S5
  tmp6 = tmp4.sub(w1);           // T6 = T4 - S3
  tmp7 = tmp4.sub(w1);           // T7 = T4 - S1
  tmp8 = tmp6.sub(w7);           // T8 = T6 - S7

  w1 = w7.dShiftLeft(6*m);
  w2 = w6.dShiftLeft(5*m);
  w3 = tmp7.dShiftLeft(4*m);
  w4 = tmp5.dShiftLeft(3*m);
  w5 = tmp8.dShiftLeft(2*m);
  w6 = w2.dShiftLeft(1*m);
  w7 = w1;

  c = w1.add(w2).add(w3).add(w4).add(w5).add(w6).add(w7);

  c.sign = 1;
  return c;
};

The FFT algorithm for squaring.

Bigdecimal.prototype.squareFFT  = function(base){

  var nextPow2 = function(n){
      var lg2 = Math.ceil(Math.log(n)*Math.LOG2E);
      return Math.pow(2,lg2);
  };
  var ispow2 = function(x){
    if(x<2)return false;
    if(x&(x-1))return false;
    return true;
  };
  var bitlength = function(x){
    if(x<2) {return undefined;}
    for(var i=0;;i++){
      if(x&(1<<i))return i;
    }
  };
  var esrever = function(k,n){
    var rev = 0;
    for(var i=0;i<n;i++){
      rev = (rev<<1)|(k&1);
      k>>=1;
    }
    return rev;
  };

  var normalize = function(a){
        var t     = a;
        var tlen  = a.length;

        var ret   = new Array();
        var carry = 0;
        for(var i=0;i<tlen;i++){
          var temp = carry;
          carry = 0;
          temp += Math.round(t[i]);
          if(temp >= Math.BIGDECIMAL_BASE){
            carry = Math.floor(temp/Math.BIGDECIMAL_BASE);
            temp  = temp % Math.BIGDECIMAL_BASE;
          }   
          ret.push(temp);
        }
        if(carry)ret.push(carry);
	while(ret[tlen-1] == 0){ret.pop();tlen--;}
        return ret;
      };

  var fftsimple = function(Ri, Ii,n,dir){
    var nbits;
    var bsize; var bend = 1;
  
    var angle = 2.0 * Math.PI;
    var tempreal, tempimaginary;
    var Ro = new Array(Ri.length);
    var Io = new Array(Ri.length);
  
    var ar  = new Array(3);
    var ai  = new Array(3);  
  
    if(!ispow2(n))return undefined;
    if(dir)angle = -angle;
  
    nbits = bitlength(n);
    for(var i=0;i<n;i++){
      j = esrever(i,nbits);
      Ro[j] = Ri[i];
      Io[j] = Ii[i];
    }
    for(bsize=2;bsize<=n;bsize<<=1){
      var quotangle = angle/bsize;
      var onesinangle = Math.sin ( -quotangle );
      var twosinangle = Math.sin ( -2 * quotangle );
      var onecosangle = Math.cos ( -quotangle );
      var twocosangle = Math.cos ( -2 * quotangle );
      var twiceonecosangle   = 2 * onecosangle;
  
      for(var i=0;i<n;i+=bsize){
        ar[2] = twocosangle;ar[1] = onecosangle;
        ai[2] = twosinangle;ai[1] = onesinangle;
        for(var j=i,l=0;l<bend;j++,l++ ){
              ar[0] = twiceonecosangle*ar[1] - ar[2];
              ar[2] = ar[1];ar[1] = ar[0];
              ai[0] = twiceonecosangle*ai[1] - ai[2];
              ai[2] = ai[1];ai[1] = ai[0];
              var k = j + bend;
              tempreal      = ar[0]*Ro[k] - ai[0]*Io[k];
              tempimaginary = ar[0]*Io[k] + ai[0]*Ro[k];
              Ro[k] = Ro[j] - tempreal;
              Io[k] = Io[j] - tempimaginary;
              Ro[j] += tempreal;
              Io[j] += tempimaginary;
        }
      }
      bend = bsize;
    }
    if(dir){
      for(i=0;i<n;i++){
         Ro[i] /= n;Io[i] /= n;
      }
    }
    return [Ro,Io]
  };
  var fftconv = function(ARi,AIi,BRi,BIi,n){
    var CRo = new Array(n);
    var CIo = new Array(n);
    for(var i=0;i<n;i++){
      CRo[i] = ARi[i]*BRi[i]-AIi[i]*BIi[i];
      CIo[i] = ARi[i]*BIi[i]+AIi[i]*BRi[i];
    }
    return [CRo,CIo];
  };
  
  var mulFFT = function(ARi,AIi,n){
    var retlen = 2;
    var ret;
    var fft1;var conv;
  
    while(retlen<2*n)retlen*=2;
    for(i=n;i<retlen;i++){
      ARi.push(0);AIi.push(0);
    }
    fft1   = fftsimple(ARi,AIi,retlen,false);
    conv   = fftconv(fft1[0],fft1[1],fft1[0],fft1[1],retlen);
    return   fftsimple(conv[0],conv[1],2*n,true);
  };

  var ret = new Bigdecimal();
  var ti,bi;

  var td   = this.dup();

  td = td.toString();

  if(arguments.length == 2){
    if(base > 5) return Number.NaN;
    if(base == 5){
      Math.BIGDECIMAL_BASE       = 100000;
      Math.BIGDECIMAL_LOG10_BASE =      5;
    }
    else if(base == 4){
      Math.BIGDECIMAL_BASE       = 10000;
      Math.BIGDECIMAL_LOG10_BASE =     4;
    }
    else if(base == 3){
      Math.BIGDECIMAL_BASE       = 1000;
      Math.BIGDECIMAL_LOG10_BASE =    3;
    }
    else if(base == 2){
      Math.BIGDECIMAL_BASE       = 100;
      Math.BIGDECIMAL_LOG10_BASE =   2;
    }
    else if(base == 1){
      Math.BIGDECIMAL_BASE       = 10;
      Math.BIGDECIMAL_LOG10_BASE =  1;
    }
    else
      return Number.NaN;
  }
  else{
    Math.BIGDECIMAL_BASE       = 1000;
    Math.BIGDECIMAL_LOG10_BASE =    3;
  }

  td = td.toBigdecimal();

  var t    = td.digits;
  var tlen = td.used;

  var pwr2 = nextPow2(tlen);
  var temp1 = new Array(pwr2-tlen).memset(0);
  t = t.concat(temp1);
  tlen = pwr2;

  ti = new Array(tlen).memset(0);

  ret.digits = normalize(mulFFT(t,ti,tlen)[0]);
  ret.used = ret.digits.length;
  ret = ret.toString()
  
  Math.BIGDECIMAL_BASE       = 10000000;
  Math.BIGDECIMAL_LOG10_BASE =        7;
  
  return ret.toBigdecimal();
};

This is the same hack as with the normal FFT.

Bigdecimal.prototype.squareWithPoorFFT = function(limit,base){
  var x0,x1,x1x1,x0x0,t1,t2,b;

  var tlen = this.used;

  x0 = new Bigdecimal();
  x1 = new Bigdecimal();

  var m = (tlen >> 1);
  var M = (tlen);
  if(M <= limit)return this.squareFFT(base)
  x1.digits = this.digits.slice(m,tlen);
  x1.used = x1.digits.length;
  x1.sign = this.sign;

  x0.digits = this.digits.slice(0,m);
  x0.used = x0.digits.length;
  x0.sign = this.sign;


  x1x1 = x1.squareWithPoorFFT(limit,base);
  x0x0 = x0.squareWithPoorFFT(limit,base);

  t1 = x1.add(x0);
  t1 = t1.squareWithPoorFFT(limit,base);

  t2 = x0x0.add(x1x1);
  t1 = t1.sub(t2);

  t1 = t1.dShiftLeft(m);
  x1x1 = x1x1.dShiftLeft(2*m);

  t1 = t1.add(x0x0);
  b = t1.add(x1x1);

  b.sign = 1;
  
  return b;
};

The high-level squaring.

Bigdecimal.prototype.square = function(){
  if(this.used < 2*Bigdecimal.KARATSUBA_CUTOFF){
    return this.squareNormal();
  }
  else if(this.used < 3*Bigdecimal.TOOM_COOK_CUTOFF){
    return this.karatsubaSquare();
  }
  else if(this.used < 4*Bigdecimal.TOOM_COOK4_CUTOFF){
    return this.toomCookSquare();
  //}
  //else if(this.used < 5*Bigdecimal.TOOM_COOK5_CUTOFF){
    //return this.toomCook4Square();
  //}
  //else if(this.used < 5*Bigdecimal.FFT_CUTOFF){
    //return this.toomCook5Square();
  else
    return this.squareFFT();
  } 
  
  return this.squareFFT();
};

All that is poorly tested and may suffer heavily from C&P errors.
But it is a start.

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s