642 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
		
		
			
		
	
	
			642 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
|   | #include "config.h"
 | ||
|  | #include "libbench2/bench.h"
 | ||
|  | #include <math.h>
 | ||
|  | 
 | ||
|  | #define DG unsigned short
 | ||
|  | #define ACC unsigned long
 | ||
|  | #define REAL bench_real
 | ||
|  | #define BITS_IN_REAL 53 /* mantissa */
 | ||
|  | 
 | ||
|  | #define SHFT 16
 | ||
|  | #define RADIX 65536L
 | ||
|  | #define IRADIX (1.0 / RADIX)
 | ||
|  | #define LO(x) ((x) & (RADIX - 1))
 | ||
|  | #define HI(x) ((x) >> SHFT)
 | ||
|  | #define HI_SIGNED(x) \
 | ||
|  |    ((((x) + (ACC)(RADIX >> 1) * RADIX) >> SHFT) - (RADIX >> 1)) | ||
|  | #define ZEROEXP (-32768)
 | ||
|  | 
 | ||
|  | #define LEN 10
 | ||
|  | 
 | ||
|  | typedef struct { | ||
|  |      short sign; | ||
|  |      short expt; | ||
|  |      DG d[LEN];  | ||
|  | } N[1]; | ||
|  | 
 | ||
|  | #define EXA a->expt
 | ||
|  | #define EXB b->expt
 | ||
|  | #define EXC c->expt
 | ||
|  | 
 | ||
|  | #define AD a->d
 | ||
|  | #define BD b->d
 | ||
|  | 
 | ||
|  | #define SGNA a->sign
 | ||
|  | #define SGNB b->sign
 | ||
|  | 
 | ||
|  | static const N zero = {{ 1, ZEROEXP, {0} }}; | ||
|  | 
 | ||
|  | static void cpy(const N a, N b) | ||
|  | { | ||
|  |      *b = *a; | ||
|  | } | ||
|  | 
 | ||
|  | static void fromreal(REAL x, N a) | ||
|  | { | ||
|  |      int i, e; | ||
|  | 
 | ||
|  |      cpy(zero, a); | ||
|  |      if (x == 0.0) return; | ||
|  |       | ||
|  |      if (x >= 0) { SGNA = 1; } | ||
|  |      else       { SGNA = -1; x = -x; } | ||
|  | 
 | ||
|  |      e = 0; | ||
|  |      while (x >= 1.0) { x *= IRADIX; ++e; } | ||
|  |      while (x < IRADIX) { x *= RADIX; --e; } | ||
|  |      EXA = e; | ||
|  |       | ||
|  |      for (i = LEN - 1; i >= 0 && x != 0.0; --i) { | ||
|  | 	  REAL y; | ||
|  | 
 | ||
|  | 	  x *= RADIX; | ||
|  | 	  y = (REAL) ((int) x); | ||
|  | 	  AD[i] = (DG)y; | ||
|  | 	  x -= y; | ||
|  |      } | ||
|  | } | ||
|  | 
 | ||
|  | static void fromshort(int x, N a) | ||
|  | { | ||
|  |      cpy(zero, a); | ||
|  | 
 | ||
|  |      if (x < 0) { x = -x; SGNA = -1; }  | ||
|  |      else { SGNA = 1; } | ||
|  |      EXA = 1; | ||
|  |      AD[LEN - 1] = x; | ||
|  | } | ||
|  | 
 | ||
|  | static void pack(DG *d, int e, int s, int l, N a) | ||
|  | { | ||
|  |      int i, j; | ||
|  | 
 | ||
|  |      for (i = l - 1; i >= 0; --i, --e)  | ||
|  | 	  if (d[i] != 0)  | ||
|  | 	       break; | ||
|  | 
 | ||
|  |      if (i < 0) { | ||
|  | 	  /* number is zero */ | ||
|  | 	  cpy(zero, a); | ||
|  |      } else { | ||
|  | 	  EXA = e; | ||
|  | 	  SGNA = s; | ||
|  | 
 | ||
|  | 	  if (i >= LEN - 1) { | ||
|  | 	       for (j = LEN - 1; j >= 0; --i, --j) | ||
|  | 		    AD[j] = d[i]; | ||
|  | 	  } else { | ||
|  | 	       for (j = LEN - 1; i >= 0; --i, --j) | ||
|  | 		    AD[j] = d[i]; | ||
|  | 	       for ( ; j >= 0; --j) | ||
|  | 		    AD[j] = 0; | ||
|  | 	  } | ||
|  |      } | ||
|  | } | ||
|  | 
 | ||
|  | 
 | ||
|  | /* compare absolute values */ | ||
|  | static int abscmp(const N a, const N b) | ||
|  | { | ||
|  |      int i; | ||
|  |      if (EXA > EXB) return 1; | ||
|  |      if (EXA < EXB) return -1; | ||
|  |      for (i = LEN - 1; i >= 0; --i) { | ||
|  | 	  if (AD[i] > BD[i]) | ||
|  | 	       return 1; | ||
|  | 	  if (AD[i] < BD[i]) | ||
|  | 	       return -1; | ||
|  |      } | ||
|  |      return 0; | ||
|  | } | ||
|  | 
 | ||
|  | static int eq(const N a, const N b) | ||
|  | { | ||
|  |      return (SGNA == SGNB) && (abscmp(a, b) == 0); | ||
|  | } | ||
|  | 
 | ||
|  | /* add magnitudes, for |a| >= |b| */ | ||
|  | static void addmag0(int s, const N a, const N b, N c) | ||
|  | { | ||
|  |      int ia, ib; | ||
|  |      ACC r = 0; | ||
|  |      DG d[LEN + 1]; | ||
|  | 
 | ||
|  |      for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) { | ||
|  | 	  r += (ACC)AD[ia] + (ACC)BD[ib]; | ||
|  | 	  d[ia] = LO(r); | ||
|  | 	  r = HI(r); | ||
|  |      }      | ||
|  |      for (; ia < LEN; ++ia) { | ||
|  | 	  r += (ACC)AD[ia]; | ||
|  | 	  d[ia] = LO(r); | ||
|  | 	  r = HI(r); | ||
|  |      } | ||
|  |      d[ia] = LO(r); | ||
|  |      pack(d, EXA + 1, s * SGNA, LEN + 1, c); | ||
|  | } | ||
|  | 
 | ||
|  | static void addmag(int s, const N a, const N b, N c) | ||
|  | { | ||
|  |      if (abscmp(a, b) > 0) addmag0(1, a, b, c); else addmag0(s, b, a, c); | ||
|  | } | ||
|  | 
 | ||
|  | /* subtract magnitudes, for |a| >= |b| */ | ||
|  | static void submag0(int s, const N a, const N b, N c) | ||
|  | { | ||
|  |      int ia, ib; | ||
|  |      ACC r = 0; | ||
|  |      DG d[LEN]; | ||
|  | 
 | ||
|  |      for (ia = 0, ib = EXA - EXB; ib < LEN; ++ia, ++ib) { | ||
|  | 	  r += (ACC)AD[ia] - (ACC)BD[ib]; | ||
|  | 	  d[ia] = LO(r); | ||
|  | 	  r = HI_SIGNED(r); | ||
|  |      }      | ||
|  |      for (; ia < LEN; ++ia) { | ||
|  | 	  r += (ACC)AD[ia]; | ||
|  | 	  d[ia] = LO(r); | ||
|  | 	  r = HI_SIGNED(r); | ||
|  |      } | ||
|  | 
 | ||
|  |      pack(d, EXA, s * SGNA, LEN, c); | ||
|  | } | ||
|  | 
 | ||
|  | static void submag(int s, const N a, const N b, N c) | ||
|  | { | ||
|  |      if (abscmp(a, b) > 0) submag0(1, a, b, c); else submag0(s, b, a, c); | ||
|  | } | ||
|  | 
 | ||
|  | /* c = a + b */ | ||
|  | static void add(const N a, const N b, N c) | ||
|  | { | ||
|  |      if (SGNA == SGNB) addmag(1, a, b, c); else submag(1, a, b, c); | ||
|  | } | ||
|  | 
 | ||
|  | static void sub(const N a, const N b, N c) | ||
|  | { | ||
|  |      if (SGNA == SGNB) submag(-1, a, b, c); else addmag(-1, a, b, c); | ||
|  | } | ||
|  | 
 | ||
|  | static void mul(const N a, const N b, N c) | ||
|  | { | ||
|  |      DG d[2 * LEN]; | ||
|  |      int i, j, k; | ||
|  |      ACC r; | ||
|  | 
 | ||
|  |      for (i = 0; i < LEN; ++i) | ||
|  | 	  d[2 * i] = d[2 * i + 1] = 0; | ||
|  | 
 | ||
|  |      for (i = 0; i < LEN; ++i) { | ||
|  | 	  ACC ai = AD[i]; | ||
|  | 	  if (ai) { | ||
|  | 	       r = 0; | ||
|  | 	       for (j = 0, k = i; j < LEN; ++j, ++k) { | ||
|  | 		    r += ai * (ACC)BD[j] + (ACC)d[k]; | ||
|  | 		    d[k] = LO(r); | ||
|  | 		    r = HI(r); | ||
|  | 	       } | ||
|  | 	       d[k] = LO(r); | ||
|  | 	  } | ||
|  |      } | ||
|  | 
 | ||
|  |      pack(d, EXA + EXB, SGNA * SGNB, 2 * LEN, c); | ||
|  | } | ||
|  | 
 | ||
|  | static REAL toreal(const N a) | ||
|  | { | ||
|  |      REAL h, l, f; | ||
|  |      int i, bits; | ||
|  |      ACC r; | ||
|  |      DG sticky; | ||
|  | 
 | ||
|  |      if (EXA != ZEROEXP) { | ||
|  | 	  f = IRADIX; | ||
|  | 	  i = LEN; | ||
|  | 
 | ||
|  | 	  bits = 0; | ||
|  | 	  h = (r = AD[--i]) * f; f *= IRADIX; | ||
|  | 	  for (bits = 0; r > 0; ++bits) | ||
|  | 	       r >>= 1; | ||
|  | 
 | ||
|  | 	  /* first digit */ | ||
|  | 	  while (bits + SHFT <= BITS_IN_REAL) { | ||
|  | 	       h += AD[--i] * f;  f *= IRADIX; bits += SHFT; | ||
|  | 	  } | ||
|  | 
 | ||
|  | 	  /* guard digit (leave one bit for sticky bit, hence `<' instead
 | ||
|  | 	     of `<=') */ | ||
|  | 	  bits = 0; l = 0.0; | ||
|  | 	  while (bits + SHFT < BITS_IN_REAL) { | ||
|  | 	       l += AD[--i] * f;  f *= IRADIX; bits += SHFT; | ||
|  | 	  } | ||
|  | 	   | ||
|  | 	  /* sticky bit */ | ||
|  | 	  sticky = 0; | ||
|  | 	  while (i > 0)  | ||
|  | 	       sticky |= AD[--i]; | ||
|  | 
 | ||
|  | 	  if (sticky) | ||
|  | 	       l += (RADIX / 2) * f; | ||
|  | 
 | ||
|  | 	  h += l; | ||
|  | 
 | ||
|  | 	  for (i = 0; i < EXA; ++i) h *= (REAL)RADIX; | ||
|  | 	  for (i = 0; i > EXA; --i) h *= IRADIX; | ||
|  | 	  if (SGNA == -1) h = -h; | ||
|  | 	  return h; | ||
|  |      } else { | ||
|  | 	  return 0.0; | ||
|  |      } | ||
|  | } | ||
|  | 
 | ||
|  | static void neg(N a) | ||
|  | { | ||
|  |      SGNA = -SGNA; | ||
|  | } | ||
|  | 
 | ||
|  | static void inv(const N a, N x) | ||
|  | { | ||
|  |      N w, z, one, two; | ||
|  | 
 | ||
|  |      fromreal(1.0 / toreal(a), x); /* initial guess */ | ||
|  |      fromshort(1, one); | ||
|  |      fromshort(2, two); | ||
|  | 
 | ||
|  |      for (;;) { | ||
|  | 	  /* Newton */ | ||
|  | 	  mul(a, x, w); | ||
|  | 	  sub(two, w, z); | ||
|  | 	  if (eq(one, z)) break; | ||
|  | 	  mul(x, z, x); | ||
|  |      } | ||
|  | } | ||
|  | 
 | ||
|  | 
 | ||
|  | /* 2 pi */ | ||
|  | static const N n2pi = {{ | ||
|  |      1, 1, | ||
|  |      {18450, 59017, 1760, 5212, 9779, 4518, 2886, 54545, 18558, 6} | ||
|  | }}; | ||
|  | 
 | ||
|  | /* 1 / 31! */ | ||
|  | static const N i31fac = {{  | ||
|  |      1, -7,  | ||
|  |      {28087, 45433, 51357, 24545, 14291, 3954, 57879, 8109, 38716, 41382} | ||
|  | }}; | ||
|  | 
 | ||
|  | 
 | ||
|  | /* 1 / 32! */ | ||
|  | static const N i32fac = {{ | ||
|  |      1, -7, | ||
|  |      {52078, 60811, 3652, 39679, 37310, 47227, 28432, 57597, 13497, 1293} | ||
|  | }}; | ||
|  | 
 | ||
|  | static void msin(const N a, N b) | ||
|  | { | ||
|  |      N a2, g, k; | ||
|  |      int i; | ||
|  | 
 | ||
|  |      cpy(i31fac, g); | ||
|  |      cpy(g, b); | ||
|  |      mul(a, a, a2); | ||
|  | 
 | ||
|  |      /* Taylor */ | ||
|  |      for (i = 31; i > 1; i -= 2) { | ||
|  | 	  fromshort(i * (i - 1), k); | ||
|  | 	  mul(k, g, g); | ||
|  | 	  mul(a2, b, k); | ||
|  | 	  sub(g, k, b); | ||
|  |      } | ||
|  |      mul(a, b, b); | ||
|  | } | ||
|  | 
 | ||
|  | static void mcos(const N a, N b) | ||
|  | { | ||
|  |      N a2, g, k; | ||
|  |      int i; | ||
|  | 
 | ||
|  |      cpy(i32fac, g); | ||
|  |      cpy(g, b); | ||
|  |      mul(a, a, a2); | ||
|  | 
 | ||
|  |      /* Taylor */ | ||
|  |      for (i = 32; i > 0; i -= 2) { | ||
|  | 	  fromshort(i * (i - 1), k); | ||
|  | 	  mul(k, g, g); | ||
|  | 	  mul(a2, b, k); | ||
|  | 	  sub(g, k, b); | ||
|  |      } | ||
|  | } | ||
|  | 
 | ||
|  | static void by2pi(REAL m, REAL n, N a) | ||
|  | { | ||
|  |      N b; | ||
|  | 
 | ||
|  |      fromreal(n, b); | ||
|  |      inv(b, a); | ||
|  |      fromreal(m, b); | ||
|  |      mul(a, b, a); | ||
|  |      mul(n2pi, a, a); | ||
|  | } | ||
|  | 
 | ||
|  | static void sin2pi(REAL m, REAL n, N a); | ||
|  | static void cos2pi(REAL m, REAL n, N a) | ||
|  | { | ||
|  |      N b; | ||
|  |      if (m < 0) cos2pi(-m, n, a); | ||
|  |      else if (m > n * 0.5) cos2pi(n - m, n, a); | ||
|  |      else if (m > n * 0.25) {sin2pi(m - n * 0.25, n, a); neg(a);} | ||
|  |      else if (m > n * 0.125) sin2pi(n * 0.25 - m, n, a); | ||
|  |      else { by2pi(m, n, b); mcos(b, a); } | ||
|  | } | ||
|  | 
 | ||
|  | static void sin2pi(REAL m, REAL n, N a) | ||
|  | { | ||
|  |      N b; | ||
|  |      if (m < 0)  {sin2pi(-m, n, a); neg(a);} | ||
|  |      else if (m > n * 0.5) {sin2pi(n - m, n, a); neg(a);} | ||
|  |      else if (m > n * 0.25) {cos2pi(m - n * 0.25, n, a);} | ||
|  |      else if (m > n * 0.125) {cos2pi(n * 0.25 - m, n, a);} | ||
|  |      else {by2pi(m, n, b); msin(b, a);} | ||
|  | } | ||
|  | 
 | ||
|  | /*----------------------------------------------------------------------*/ | ||
|  | /* FFT stuff */ | ||
|  | 
 | ||
|  | /* (r0 + i i0)(r1 + i i1) */ | ||
|  | static void cmul(N r0, N i0, N r1, N i1, N r2, N i2) | ||
|  | { | ||
|  |      N s, t, q; | ||
|  |      mul(r0, r1, s); | ||
|  |      mul(i0, i1, t); | ||
|  |      sub(s, t, q); | ||
|  |      mul(r0, i1, s); | ||
|  |      mul(i0, r1, t); | ||
|  |      add(s, t, i2); | ||
|  |      cpy(q, r2); | ||
|  | } | ||
|  | 
 | ||
|  | /* (r0 - i i0)(r1 + i i1) */ | ||
|  | static void cmulj(N r0, N i0, N r1, N i1, N r2, N i2) | ||
|  | { | ||
|  |      N s, t, q; | ||
|  |      mul(r0, r1, s); | ||
|  |      mul(i0, i1, t); | ||
|  |      add(s, t, q); | ||
|  |      mul(r0, i1, s); | ||
|  |      mul(i0, r1, t); | ||
|  |      sub(s, t, i2); | ||
|  |      cpy(q, r2); | ||
|  | } | ||
|  | 
 | ||
|  | static void mcexp(int m, int n, N r, N i) | ||
|  | { | ||
|  |      static int cached_n = -1; | ||
|  |      static N w[64][2]; | ||
|  |      int k, j; | ||
|  |      if (n != cached_n) { | ||
|  | 	  for (j = 1, k = 0; j < n; j += j, ++k) { | ||
|  | 	       cos2pi(j, n, w[k][0]); | ||
|  | 	       sin2pi(j, n, w[k][1]); | ||
|  | 	  } | ||
|  | 	  cached_n = n; | ||
|  |      } | ||
|  | 
 | ||
|  |      fromshort(1, r); | ||
|  |      fromshort(0, i); | ||
|  |      if (m > 0) { | ||
|  | 	  for (k = 0; m; ++k, m >>= 1)  | ||
|  | 	       if (m & 1) | ||
|  | 		    cmul(w[k][0], w[k][1], r, i, r, i); | ||
|  |      } else { | ||
|  | 	  m = -m; | ||
|  | 	  for (k = 0; m; ++k, m >>= 1)  | ||
|  | 	       if (m & 1) | ||
|  | 		    cmulj(w[k][0], w[k][1], r, i, r, i); | ||
|  |      } | ||
|  | } | ||
|  | 
 | ||
|  | static void bitrev(int n, N *a) | ||
|  | { | ||
|  |      int i, j, m; | ||
|  |      for (i = j = 0; i < n - 1; ++i) { | ||
|  | 	  if (i < j) { | ||
|  | 	       N t; | ||
|  | 	       cpy(a[2*i], t); cpy(a[2*j], a[2*i]); cpy(t, a[2*j]); | ||
|  | 	       cpy(a[2*i+1], t); cpy(a[2*j+1], a[2*i+1]); cpy(t, a[2*j+1]); | ||
|  | 	  } | ||
|  | 
 | ||
|  | 	  /* bit reversed counter */ | ||
|  | 	  m = n; do { m >>= 1; j ^= m; } while (!(j & m)); | ||
|  |      } | ||
|  | } | ||
|  | 
 | ||
|  | static void fft0(int n, N *a, int sign) | ||
|  | { | ||
|  |      int i, j, k; | ||
|  | 
 | ||
|  |      bitrev(n, a); | ||
|  |      for (i = 1; i < n; i = 2 * i) { | ||
|  | 	  for (j = 0; j < i; ++j) { | ||
|  | 	       N wr, wi; | ||
|  | 	       mcexp(sign * (int)j, 2 * i, wr, wi); | ||
|  | 	       for (k = j; k < n; k += 2 * i) { | ||
|  | 		    N *a0 = a + 2 * k; | ||
|  | 		    N *a1 = a0 + 2 * i; | ||
|  | 		    N r0, i0, r1, i1, t0, t1, xr, xi; | ||
|  | 		    cpy(a0[0], r0); cpy(a0[1], i0); | ||
|  | 		    cpy(a1[0], r1); cpy(a1[1], i1); | ||
|  | 		    mul(r1, wr, t0); mul(i1, wi, t1); sub(t0, t1, xr); | ||
|  | 		    mul(r1, wi, t0); mul(i1, wr, t1); add(t0, t1, xi); | ||
|  | 		    add(r0, xr, a0[0]);  add(i0, xi, a0[1]); | ||
|  | 		    sub(r0, xr, a1[0]);  sub(i0, xi, a1[1]); | ||
|  | 	       } | ||
|  | 	  } | ||
|  |      } | ||
|  | } | ||
|  | 
 | ||
|  | /* a[2*k]+i*a[2*k+1] = exp(2*pi*i*k^2/(2*n)) */ | ||
|  | static void bluestein_sequence(int n, N *a) | ||
|  | { | ||
|  |      int k, ksq, n2 = 2 * n; | ||
|  | 
 | ||
|  |      ksq = 1; /* (-1)^2 */ | ||
|  |      for (k = 0; k < n; ++k) { | ||
|  | 	  /* careful with overflow */ | ||
|  | 	  ksq = ksq + 2*k - 1; while (ksq > n2) ksq -= n2; | ||
|  | 	  mcexp(ksq, n2, a[2*k], a[2*k+1]); | ||
|  |      } | ||
|  | } | ||
|  | 
 | ||
|  | static int pow2_atleast(int x) | ||
|  | { | ||
|  |      int h; | ||
|  |      for (h = 1; h < x; h = 2 * h) | ||
|  | 	  ; | ||
|  |      return h; | ||
|  | } | ||
|  | 
 | ||
|  | static N *cached_bluestein_w = 0; | ||
|  | static N *cached_bluestein_y = 0; | ||
|  | static int cached_bluestein_n = -1; | ||
|  | 
 | ||
|  | static void bluestein(int n, N *a) | ||
|  | { | ||
|  |      int nb = pow2_atleast(2 * n); | ||
|  |      N *b = (N *)bench_malloc(2 * nb * sizeof(N)); | ||
|  |      N *w = cached_bluestein_w; | ||
|  |      N *y = cached_bluestein_y; | ||
|  |      N nbinv; | ||
|  |      int i; | ||
|  | 
 | ||
|  |      fromreal(1.0 / nb, nbinv); /* exact because nb = 2^k */ | ||
|  | 
 | ||
|  |      if (cached_bluestein_n != n) { | ||
|  | 	  if (w) bench_free(w); | ||
|  | 	  if (y) bench_free(y); | ||
|  | 	  w = (N *)bench_malloc(2 * n * sizeof(N)); | ||
|  | 	  y = (N *)bench_malloc(2 * nb * sizeof(N)); | ||
|  | 	  cached_bluestein_n = n; | ||
|  | 	  cached_bluestein_w = w; | ||
|  | 	  cached_bluestein_y = y; | ||
|  | 
 | ||
|  | 	  bluestein_sequence(n, w); | ||
|  | 	  for (i = 0; i < 2*nb; ++i)  cpy(zero, y[i]); | ||
|  | 
 | ||
|  | 	  for (i = 0; i < n; ++i) { | ||
|  | 	       cpy(w[2*i], y[2*i]); | ||
|  | 	       cpy(w[2*i+1], y[2*i+1]); | ||
|  | 	  } | ||
|  | 	  for (i = 1; i < n; ++i) { | ||
|  | 	       cpy(w[2*i], y[2*(nb-i)]); | ||
|  | 	       cpy(w[2*i+1], y[2*(nb-i)+1]); | ||
|  | 	  } | ||
|  | 
 | ||
|  | 	  fft0(nb, y, -1); | ||
|  |      } | ||
|  | 
 | ||
|  |      for (i = 0; i < 2*nb; ++i)  cpy(zero, b[i]); | ||
|  |       | ||
|  |      for (i = 0; i < n; ++i)  | ||
|  | 	  cmulj(w[2*i], w[2*i+1], a[2*i], a[2*i+1], b[2*i], b[2*i+1]); | ||
|  | 
 | ||
|  |      /* scaled convolution b * y */ | ||
|  |      fft0(nb, b, -1); | ||
|  | 
 | ||
|  |      for (i = 0; i < nb; ++i)  | ||
|  | 	  cmul(b[2*i], b[2*i+1], y[2*i], y[2*i+1], b[2*i], b[2*i+1]); | ||
|  |      fft0(nb, b, 1); | ||
|  | 
 | ||
|  |      for (i = 0; i < n; ++i) { | ||
|  | 	  cmulj(w[2*i], w[2*i+1], b[2*i], b[2*i+1], a[2*i], a[2*i+1]); | ||
|  | 	  mul(nbinv, a[2*i], a[2*i]); | ||
|  | 	  mul(nbinv, a[2*i+1], a[2*i+1]); | ||
|  |      } | ||
|  | 
 | ||
|  |      bench_free(b); | ||
|  | } | ||
|  | 
 | ||
|  | static void swapri(int n, N *a) | ||
|  | { | ||
|  |      int i; | ||
|  |      for (i = 0; i < n; ++i) { | ||
|  | 	  N t; | ||
|  | 	  cpy(a[2 * i], t); | ||
|  | 	  cpy(a[2 * i + 1], a[2 * i]); | ||
|  | 	  cpy(t, a[2 * i + 1]); | ||
|  |      } | ||
|  | } | ||
|  | 
 | ||
|  | static void fft1(int n, N *a, int sign) | ||
|  | { | ||
|  |      if (power_of_two(n)) { | ||
|  | 	  fft0(n, a, sign); | ||
|  |      } else { | ||
|  | 	  if (sign == 1) swapri(n, a); | ||
|  | 	  bluestein(n, a); | ||
|  | 	  if (sign == 1) swapri(n, a); | ||
|  |      } | ||
|  | } | ||
|  | 
 | ||
|  | static void fromrealv(int n, bench_complex *a, N *b) | ||
|  | { | ||
|  |      int i; | ||
|  | 
 | ||
|  |      for (i = 0; i < n; ++i) { | ||
|  | 	  fromreal(c_re(a[i]), b[2 * i]); | ||
|  | 	  fromreal(c_im(a[i]), b[2 * i + 1]); | ||
|  |      } | ||
|  | } | ||
|  | 
 | ||
|  | static void compare(int n, N *a, N *b, double *err) | ||
|  | { | ||
|  |      int i; | ||
|  |      double e1, e2, einf; | ||
|  |      double n1, n2, ninf; | ||
|  | 
 | ||
|  |      e1 = e2 = einf = 0.0; | ||
|  |      n1 = n2 = ninf = 0.0; | ||
|  | 
 | ||
|  | #    define DO(x1, x2, xinf, var) { 			\
 | ||
|  |      double d = var;					\ | ||
|  |      if (d < 0) d = -d;					\ | ||
|  |      x1 += d; x2 += d * d; if (d > xinf) xinf = d;	\ | ||
|  | } | ||
|  | 	   | ||
|  |      for (i = 0; i < 2 * n; ++i) { | ||
|  | 	  N dd; | ||
|  | 	  sub(a[i], b[i], dd); | ||
|  | 	  DO(n1, n2, ninf, toreal(a[i])); | ||
|  | 	  DO(e1, e2, einf, toreal(dd)); | ||
|  |      } | ||
|  | 
 | ||
|  | #    undef DO
 | ||
|  |      err[0] = e1 / n1; | ||
|  |      err[1] = sqrt(e2 / n2); | ||
|  |      err[2] = einf / ninf; | ||
|  | } | ||
|  | 
 | ||
|  | void fftaccuracy(int n, bench_complex *a, bench_complex *ffta, | ||
|  | 		 int sign, double err[6]) | ||
|  | { | ||
|  |      N *b = (N *)bench_malloc(2 * n * sizeof(N)); | ||
|  |      N *fftb = (N *)bench_malloc(2 * n * sizeof(N)); | ||
|  |      N mn, ninv; | ||
|  |      int i; | ||
|  | 
 | ||
|  |      fromreal(n, mn); inv(mn, ninv); | ||
|  | 
 | ||
|  |      /* forward error */ | ||
|  |      fromrealv(n, a, b); fromrealv(n, ffta, fftb); | ||
|  |      fft1(n, b, sign); | ||
|  |      compare(n, b, fftb, err); | ||
|  | 
 | ||
|  |      /* backward error */ | ||
|  |      fromrealv(n, a, b); fromrealv(n, ffta, fftb); | ||
|  |      for (i = 0; i < 2 * n; ++i) mul(fftb[i], ninv, fftb[i]); | ||
|  |      fft1(n, fftb, -sign); | ||
|  |      compare(n, b, fftb, err + 3); | ||
|  | 
 | ||
|  |      bench_free(fftb); | ||
|  |      bench_free(b); | ||
|  | } | ||
|  | 
 | ||
|  | void fftaccuracy_done(void) | ||
|  | { | ||
|  |      if (cached_bluestein_w) bench_free(cached_bluestein_w); | ||
|  |      if (cached_bluestein_y) bench_free(cached_bluestein_y); | ||
|  |      cached_bluestein_w = 0; | ||
|  |      cached_bluestein_y = 0; | ||
|  |      cached_bluestein_n = -1; | ||
|  | } |