/*
 * A speeded-up two-dimensional Fast Fourier Transform algorithm.
 * Taken from "Numerical Recipes", then modified with a big hammer.
 *
 * $Date: 1995/09/01 17:42:15 $
 */

#include "complex.h"
#include "fft.h"

/*
 * Debug tracing hook.  Disabled by default: the call expands to a no-op
 * expression and none of its arguments are evaluated.
 *
 * To enable tracing, replace the definition below with
 *   #define DEBUGME(fd, ...) fprintf(fd, __VA_ARGS__)
 * (needs <stdio.h>).
 *
 * Uses the standard C99 variadic-macro form rather than the GNU-only
 * named form ("foo..."), so it also builds with non-GNU compilers.
 */
#define DEBUGME(fd, ...) ((void)0)

/*
 * Exchange the values of two lvalues.
 *
 * Uses a scratch variable named `tempr`, which must be declared in the
 * scope where the macro is expanded and be assignable from (a).
 * Each argument is evaluated twice, so do not pass expressions with side
 * effects.  Wrapped in do/while(0) so the expansion is a single statement
 * and stays correct under an unbraced if/else or loop.
 */
#define SWAP(a,b) do { tempr=(a); (a)=(b); (b)=tempr; } while (0)

/*
 * fft2 -- in-place two-dimensional complex FFT of an nn x nn array.
 *
 * data   2*nn*nn values holding nn*nn complex elements, 0-based.  Element i
 *        has its real part in data[2*i] and its imaginary part in
 *        data[2*i+1].  Overwritten with the transform.
 * nn     side length of the array.  Must be a power of two; for any other
 *        value (and for nn < 2, where there is nothing to do) the function
 *        returns without touching data.  Without this check a
 *        non-power-of-two nn would index past the end of data.
 * isign  multiplies the twiddle angle (isign * 2*pi / length), i.e. it
 *        selects the sign of the exponent.  No scaling is applied to the
 *        result for either sign.
 *
 * The work is done in two passes, each a bit-reversal permutation followed
 * by butterfly stages:
 *   1. along the fastest-varying index (runs of nn consecutive elements);
 *   2. along the slowest-varying index (elements nn apart).
 *
 * NOTE(review): the twiddle recurrence (theta, wr, wi, wpr, wpi, wtemp) is
 * carried in `real`.  If `real` is single precision, rounding error builds
 * up over the recurrence; consider declaring these as double.  The (real)
 * casts in the butterflies are no-ops today and would keep the butterfly
 * arithmetic in `real` if the twiddles were widened.
 */
void fft2(real data[], unsigned long nn, int isign)
{
  const double two_pi = 6.28318530717959;
  unsigned long i1,i2,i3,i2rev,i3rev,ifp1;
  unsigned long ibit,k1,k2;
  unsigned long ntot;              /* total number of complex elements */
  real tempi,tempr;                /* tempr is also SWAP's scratch slot */
  real theta,wi,wpi,wpr,wr,wtemp;

  /* Reject sizes the algorithm cannot handle (see header comment). */
  if (nn < 2 || (nn & (nn-1)) != 0)
    return;

  ntot = nn*nn;

  /* FIRST UNROLLED ITERATION: fastest-varying index. */

  /* Bit-reversal permutation of the position within each run of nn. */
  i2rev=0;
  for (i2=0; i2<nn; i2++) {
    DEBUGME(stderr,"i2 = %lu     i2rev = %lu\n", i2, i2rev);
    if (i2 < i2rev) {
      /* Apply the same exchange in every run of nn elements. */
      for (i3=i2; i3<ntot; i3+=nn) {
        i3rev=i2rev+i3-i2;
        SWAP(data[2*i3],data[2*i3rev]);
        SWAP(data[2*i3+1],data[2*i3rev+1]);
        DEBUGME(stderr,"  SWAP.1: %lu <-> %lu\n", 2*i3, 2*i3rev);
      }
    }
    /* Advance i2rev to the bit-reversed counterpart of i2+1. */
    ibit= nn >> 1;
    while (ibit >= 1 && i2rev >= ibit) {
      i2rev -= ibit;
      ibit >>= 1;
    }
    i2rev += ibit;
  }

  /* Butterfly stages; ifp1 is the distance, in data[] slots, between the
     two inputs of a butterfly and doubles after every stage. */
  ifp1=2;
  while (ifp1 < 2*nn) {
    DEBUGME(stderr,"ifp1 = %lu\n", ifp1);
    theta=isign*two_pi/ifp1;

    /* wpr + i*wpi = exp(i*theta) - 1, used to step the twiddle factor. */
    wtemp=sin(0.5*theta);
    wpr = -2.0*wtemp*wtemp;
    wpi=sin(theta);
    wr=1.0;
    wi=0.0;
    for (i3=0; i3<ifp1/2; i3++) {
      DEBUGME(stderr,"  i3 = %lu\n", i3);
      for (i2=i3; i2<ntot; i2+=ifp1) {
        DEBUGME(stderr,"    i2 = %lu\n", i2);
        k1=2*i2+1;                 /* imaginary slot of the first input */
        k2=k1+ifp1;                /* imaginary slot of the second input */
        DEBUGME(stderr,"        use.a2: R(%lu)\n", k2-1);
        /* temp = w * second input (complex multiply) */
        tempr=(real)wr*data[k2-1]-(real)wi*data[k2];
        tempi=(real)wr*data[k2]+(real)wi*data[k2-1];
        DEBUGME(stderr,"        use.a1: R(%lu)\n", k1-1);
        data[k2-1]=data[k1-1]-tempr;
        data[k2]=data[k1]-tempi;
        data[k1-1] += tempr;
        data[k1] += tempi;
      }
      /* w <- w + w*(wpr + i*wpi); wtemp keeps the old wr for the wi update. */
      wr=(wtemp=wr)*wpr-wi*wpi+wr;
      wi=wi*wpr+wtemp*wpi+wi;
    }
    ifp1 <<= 1;
  }

  /* LAST UNROLLED ITERATION: slowest-varying index. */

  /* Bit-reversal permutation of whole runs of nn elements; i2 and i2rev
     are always multiples of nn here. */
  i2rev=0;
  for (i2=0; i2<ntot; i2+=nn) {
    DEBUGME(stderr,"i2 = %lu     i2rev = %lu\n", i2, i2rev);
    if (i2 < i2rev) {
      /* Exchange the run starting at i2 with the run starting at i2rev. */
      for (i1=i2; i1<i2+nn; i1++) {
        i3rev=i2rev+i1-i2;
        SWAP(data[2*i1],data[2*i3rev]);
        SWAP(data[2*i1+1],data[2*i3rev+1]);
        DEBUGME(stderr,"  SWAP.2: %lu <-> %lu\n", 2*i1, 2*i3rev);
      }
    }
    ibit= ntot >> 1;
    while (ibit >= nn && i2rev >= ibit) {
      i2rev -= ibit;
      ibit >>= 1;
    }
    i2rev += ibit;
  }

  /* Butterfly stages across runs; the same twiddle factor is shared by all
     nn elements of a run. */
  ifp1=2*nn;
  while (ifp1 < 2*ntot) {
    DEBUGME(stderr,"ifp1 = %lu\n", ifp1);
    theta=isign*two_pi/(ifp1/nn);
    wtemp=sin(0.5*theta);
    wpr = -2.0*wtemp*wtemp;
    wpi=sin(theta);
    wr=1.0;
    wi=0.0;
    for (i3=0; i3<ifp1/2; i3+=nn) {
      DEBUGME(stderr,"  i3 = %lu\n", i3);
      for (i1=i3; i1<i3+nn; i1++) {
        DEBUGME(stderr,"    i1 = %lu\n", i1);
        for (i2=i1; i2<ntot; i2+=ifp1) {
          DEBUGME(stderr,"      i2 = %lu\n", i2);
          k1=2*i2+1;
          k2=k1+ifp1;
          DEBUGME(stderr,"        use.b2: R(%lu)\n", k2-1);
          tempr=(real)wr*data[k2-1]-(real)wi*data[k2];
          tempi=(real)wr*data[k2]+(real)wi*data[k2-1];
          DEBUGME(stderr,"        use.b1: R(%lu)\n", k1-1);
          data[k2-1]=data[k1-1]-tempr;
          data[k2]=data[k1]-tempi;
          data[k1-1] += tempr;
          data[k1] += tempi;
        }
      }
      wr=(wtemp=wr)*wpr-wi*wpi+wr;
      wi=wi*wpr+wtemp*wpi+wi;
    }
    ifp1 <<= 1;
  }
}
#undef SWAP
