/*
 * A speeded-up Fast Fourier Transform algorithm.
 * Very loosely based on NR, but essentially rewritten from scratch.
 *
 * $Date: 1997/06/09 19:24:39 $
 */

#include "complex.h"
#include "fft.h"
#include "util.h"

/*
 * Debug-trace switches (GNU named-variadic macro syntax, "foo...").
 * DO_DEBUG is compiled out while the condition below is 0; change it to 1
 * to forward the arguments to fprintf(stderr, ...).
 */
#if 0
#include <stdio.h>
#define DO_DEBUG(foo...) fprintf(stderr, ## foo)
#else
#define DO_DEBUG(foo...)
#endif
/* NO_DEBUG always expands to nothing: a trace call that has been parked. */
#define NO_DEBUG(foo...)

/*
 * DISP(t) prints "<expression text> = <value>" to stderr; it is currently
 * enabled.  LFMT is not defined in this file (presumably a printf length
 * modifier supplied by a project header -- confirm).
 * NOTE(review): <stdio.h> is only included by the disabled branch above, so
 * any use of DISP relies on another header declaring fprintf -- confirm.
 */
#if 1
#define DISP(t) fprintf(stderr, #t " = %" LFMT "f\n", t)
#else
#define DISP(t)
#endif

/*
 * Mirror the low-order bits of num.
 *
 * The number of bits mirrored is the number of times (max >> 1) can be
 * halved before reaching zero, i.e. 3 bits for max == 8, 4 for max == 16.
 * Bits of num above that width are ignored.  Returns the mirrored value;
 * the result is 0 when max is 0 or 1.
 */
unsigned long bit_reverse(unsigned long num, unsigned long max)
{
  unsigned long out = 0;
  unsigned long width = max >> 1;

  while (width != 0) {
    /* Shift the result up and drop in the current lowest bit of num. */
    out = (out << 1) | (num & 1);
    num >>= 1;
    width >>= 1;
  }
  return out;
}
/*
 * Small helpers for the trace calls and the bit-reversed loops below.
 *   I(x)     the argument itself, parenthesized
 *   IS(x,y)  three comma-separated arguments: x, y, x+y
 *   R(x)     bit_reverse(x, nn); a variable named `nn` must be in scope
 *            wherever R is used
 *   RS(x,y)  three comma-separated arguments: R(y), R(x), R(y+x)
 *            (note that y comes first)
 */
#define I(x)		(x)
#define IS(x,y)		I(x), I(y), I((x)+(y))
#define R(x)		bit_reverse(x, nn)
#define RS(x,y)		R(y), R(x), R((y)+(x))

/*
 * Exchange two lvalues through a scratch variable named `tempz`, which the
 * caller must declare with a compatible type.  The do { } while (0) wrapper
 * makes "SWAP(a,b);" a single statement, so it stays correct under an
 * unbraced if/else or loop.  Each argument is evaluated twice.
 */
#define SWAP(a,b)  do { tempz=(a); (a)=(b); (b)=tempz; } while (0)

/*
 * Build the FFT_Context that nfft2() reads (its nn and Wbase members).
 *
 * nn     size stored in the context; nfft2() indexes its data as an
 *        nn x nn array.
 * isign  sign applied to the angles of the table below; a copy is stored
 *        in the context as well (that copy is informational only).
 *
 * Wbase holds one entry per bit needed to represent nn, with
 *   Wbase[i] = cos(theta) + i*sin(theta),  theta = isign * pi / 2^i.
 *
 * Returns the new context.  The context and its table are obtained from
 * safe_malloc(); no matching release routine is visible in this file.
 */
FFT_Context init_nfft2 (unsigned long nn, int isign)
{
  unsigned long nbits = 0, rest, i;
  real theta;
  FFT_Context cont = safe_malloc(sizeof(struct FFT_Context_Data), "context");

  /* Count the bits needed to represent nn (0 when nn is 0). */
  for (rest = nn; rest; rest >>= 1)
    nbits++;

  cont->nn = nn;
  cont->isign = isign;  /* informational only */

  cont->Wbase = safe_malloc(nbits * sizeof(complex), "Wbase");
  for (i = 0; i < nbits; i++) {
    /* Shift an unsigned long rather than an int: i can reach the bit width
     * of nn, where `1 << i` would overflow int (undefined behavior). */
    theta = isign*M_PI/(1UL<<i);
    cont->Wbase[i] = (cos(theta) + sin(theta)*1i);
  }

  return cont;
}

/*
 * In-place two-dimensional FFT of the nn x nn array held in data[], where
 * element (row, col) is data[nn*row + col] and nn comes from the context.
 *
 * The array is first permuted into bit-reversed order along both axes,
 * then Danielson-Lanczos butterfly passes with doubling block sizes are run
 * along the first dimension and afterwards along the second.  The
 * per-level rotation factors are read from fft->Wbase.
 *
 * NOTE(review): the bit-reversal and block doubling presumably require nn
 * to be a power of two; nothing here checks it.
 */
void nfft2 (complex data[], FFT_Context fft)
{
  unsigned long nn = fft->nn;
  unsigned long span, level, base, shift, row, col;
  unsigned long lo, hi;
  unsigned long bit, idx, rev;
  complex tempz;                /* scratch value; SWAP() needs this exact name */
  complex step, twiddle;

  /* Pass 1: bit-reversal permutation, applied to both axes at once. */
  rev = 0;
  for (idx = 0; idx < nn; idx++) {
    if (idx < rev) {            /* exchange each index pair only once */
      for (row = 0; row < nn; row++) {
        SWAP(data[nn*row+idx], data[nn*row+rev]);
      }
      for (col = 0; col < nn; col++) {
        SWAP(data[nn*idx+col], data[nn*rev+col]);
      }
    }
    /* Step rev as a counter whose carry runs from the top bit downwards:
     * clear the run of set bits starting at the top, then set the first
     * clear bit found below them.
     */
    for (bit = nn >> 1; bit && rev >= bit; bit >>= 1) {
      rev -= bit;
    }
    rev += bit;
  }

  /* Pass 2: Danielson-Lanczos along the first dimension. */
  for (level = 0, span = 1; span < nn; level++, span <<= 1) {
    step = fft->Wbase[level];
    twiddle = 1.0;
    for (shift = 0; shift < span; shift++) {
      for (base = 0; base < nn; base += (2*span)) {
        NO_DEBUG("bs=%3lu (s=%3lu)+(o=%3lu)=%3lu r=*\n",
                 I(span), IS(base, shift));
        for (row = 0; row < nn; row++) {
          lo = nn*row+base+shift;
          hi = lo+span;
          tempz = twiddle * data[hi];
          data[hi] = data[lo]-tempz;
          data[lo] += tempz;
        }
      }
      twiddle *= step;
    }
  }

  /* Pass 3: Danielson-Lanczos along the second dimension. */
  for (level = 0, span = 1; span < nn; level++, span <<= 1) {
    step = fft->Wbase[level];
    twiddle = 1.0;
    for (shift = 0; shift < span; shift++) {
      for (base = 0; base < nn; base += (2*span)) {
        NO_DEBUG("bs=%3lu (s=%3lu)+(o=%3lu)=%3lu c=*\n",
                 I(span), IS(base, shift));
        for (col = 0; col < nn; col++) {
          lo = col+nn*(base+shift);
          hi = lo+nn*span;
          tempz = twiddle * data[hi];
          data[hi] = data[lo]-tempz;
          data[lo] += tempz;
        }
      }
      twiddle *= step;
    }
  }
}

/*
 * Build the FFT_Context that mfft2() reads (its nn and Wbase members).
 *
 * nn     size stored in the context; mfft2() indexes its data as an
 *        nn x nn array.
 * isign  sign applied to the angles of the table below; a copy is stored
 *        in the context as well (that copy is informational only).
 *
 * Wbase holds one entry per bit needed to represent nn, with
 *   Wbase[i] = cos(theta) + i*sin(theta),  theta = isign * pi / 2^i.
 *
 * Returns the new context.  The context and its table are obtained from
 * safe_malloc(); no matching release routine is visible in this file.
 */
FFT_Context init_mfft2 (unsigned long nn, int isign)
{
  unsigned long nbits = 0, rest, i;
  real theta;
  FFT_Context cont = safe_malloc(sizeof(struct FFT_Context_Data), "context");

  /* Count the bits needed to represent nn (0 when nn is 0). */
  for (rest = nn; rest; rest >>= 1)
    nbits++;

  cont->nn = nn;
  cont->isign = isign;  /* informational only */

  cont->Wbase = safe_malloc(nbits * sizeof(complex), "Wbase");
  for (i = 0; i < nbits; i++) {
    /* Shift an unsigned long rather than an int: i can reach the bit width
     * of nn, where `1 << i` would overflow int (undefined behavior). */
    theta = isign*M_PI/(1UL<<i);
    cont->Wbase[i] = (cos(theta) + sin(theta)*1i);
  }

  return cont;
}

/*
 * In-place two-dimensional FFT of the nn x nn array held in data[], where
 * element (row, col) is data[nn*row + col] and nn comes from the context.
 *
 * Unlike nfft2(), the butterfly passes run first, with halving block sizes
 * and with the groups inside each level visited in bit-reversed order; the
 * bit-reversal permutation of both axes is applied at the end.  The
 * per-level rotation factors are read from fft->Wbase.
 *
 * NOTE(review): presumably nn must be a power of two; nothing here checks
 * it.
 */
void mfft2 (complex data[], FFT_Context fft)
{
  unsigned long nn = fft->nn;   /* R() expands to bit_reverse(x, nn) */
  unsigned long span, level, groups, j, base, shift, row, col;
  unsigned long lo, hi;
  unsigned long bit, idx, rev;
  complex tempz;                /* scratch value; SWAP() needs this exact name */
  complex step, twiddle;

  DO_DEBUG(">> 11111\n");
  /* Pass 1: Danielson-Lanczos along the first dimension. */
  for (level = 0, span = nn >> 1; span > 0; level++, span >>= 1) {
    DO_DEBUG(">> -----\n");
    step = fft->Wbase[level];
    twiddle = 1.0;
    /* The j-th group starts at the bit-reversed image of j.  Because
     * bit_reverse undoes itself on values that fit in its bit width, this
     * visits the same offsets, in the same order, as stepping
     * offset = R(1 + R(offset)) from 0 while R(offset) < R(span).
     */
    groups = R(span);
    for (j = 0; j < groups; j++) {
      shift = R(j);
      DO_DEBUG("%lu %lu %lu %ld/%lu*pi\n", shift, R(shift), span,
               (long)rint(carg(twiddle)/M_PI*(nn/span/2)),
               nn/span/2);
      for (base = 0; base < span; base++) {
        for (row = 0; row < nn; row++) {
          lo = nn*row+base+shift;
          hi = lo+span;
          tempz = twiddle * data[hi];
          data[hi] = data[lo]-tempz;
          data[lo] += tempz;
        }
      }
      twiddle *= step;
    }
    DO_DEBUG(">> =====\n");
  }

  /* Pass 2: Danielson-Lanczos along the second dimension. */
  for (level = 0, span = nn >> 1; span > 0; level++, span >>= 1) {
    step = fft->Wbase[level];
    twiddle = 1.0;
    groups = R(span);
    for (j = 0; j < groups; j++) {
      shift = R(j);
      for (base = 0; base < span; base++) {
        for (col = 0; col < nn; col++) {
          lo = col+nn*(base+shift);
          hi = lo+nn*span;
          tempz = twiddle * data[hi];
          data[hi] = data[lo]-tempz;
          data[lo] += tempz;
        }
      }
      twiddle *= step;
    }
  }

  /* Pass 3: bit-reversal permutation, applied to both axes at once. */
  rev = 0;
  for (idx = 0; idx < nn; idx++) {
    if (idx < rev) {            /* exchange each index pair only once */
      for (row = 0; row < nn; row++) {
        SWAP(data[nn*row+idx], data[nn*row+rev]);
      }
      for (col = 0; col < nn; col++) {
        SWAP(data[nn*idx+col], data[nn*rev+col]);
      }
      NO_DEBUG("swap %lu (%lu) <-> %lu (%lu)\n",
               idx, bit_reverse(idx, nn),
               rev, bit_reverse(rev, nn));
    }
    /* Step rev as a counter whose carry runs from the top bit downwards:
     * clear the run of set bits starting at the top, then set the first
     * clear bit found below them.
     */
    for (bit = nn >> 1; bit && rev >= bit; bit >>= 1) {
      rev -= bit;
    }
    rev += bit;
  }
}

/* SWAP depends on a caller-local `tempz`, so it is retired once the routines using it are done. */
#undef SWAP
