908 lines
		
	
	
		
			26 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
		
		
			
		
	
	
			908 lines
		
	
	
		
			26 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
|   | /*
 | ||
|  |  * Copyright (c) 2003, 2007-14 Matteo Frigo | ||
|  |  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology | ||
|  |  * | ||
|  |  * This program is free software; you can redistribute it and/or modify | ||
|  |  * it under the terms of the GNU General Public License as published by | ||
|  |  * the Free Software Foundation; either version 2 of the License, or | ||
|  |  * (at your option) any later version. | ||
|  |  * | ||
|  |  * This program is distributed in the hope that it will be useful, | ||
|  |  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
|  |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||
|  |  * GNU General Public License for more details. | ||
|  |  * | ||
|  |  * You should have received a copy of the GNU General Public License | ||
|  |  * along with this program; if not, write to the Free Software | ||
|  |  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA | ||
|  |  * | ||
|  |  */ | ||
|  | 
 | ||
|  | #include "api/api.h"
 | ||
|  | #include "fftw3-mpi.h"
 | ||
|  | #include "ifftw-mpi.h"
 | ||
|  | #include "mpi-transpose.h"
 | ||
|  | #include "mpi-dft.h"
 | ||
|  | #include "mpi-rdft.h"
 | ||
|  | #include "mpi-rdft2.h"
 | ||
|  | 
 | ||
|  | /* Convert API flags to internal MPI flags. */ | ||
|  | #define MPI_FLAGS(f) ((f) >> 27)
 | ||
|  | 
 | ||
|  | /*************************************************************************/ | ||
|  | 
 | ||
|  | static int mpi_inited = 0; | ||
|  | 
 | ||
|  | static MPI_Comm problem_comm(const problem *p) { | ||
|  |      switch (p->adt->problem_kind) { | ||
|  | 	 case PROBLEM_MPI_DFT: | ||
|  | 	      return ((const problem_mpi_dft *) p)->comm; | ||
|  | 	 case PROBLEM_MPI_RDFT: | ||
|  | 	      return ((const problem_mpi_rdft *) p)->comm; | ||
|  | 	 case PROBLEM_MPI_RDFT2: | ||
|  | 	      return ((const problem_mpi_rdft2 *) p)->comm; | ||
|  | 	 case PROBLEM_MPI_TRANSPOSE: | ||
|  | 	      return ((const problem_mpi_transpose *) p)->comm; | ||
|  | 	 default: | ||
|  | 	      return MPI_COMM_NULL; | ||
|  |      } | ||
|  | } | ||
|  | 
 | ||
|  | /* used to synchronize cost measurements (timing or estimation)
 | ||
|  |    across all processes for an MPI problem, which is critical to | ||
|  |    ensure that all processes decide to use the same MPI plans | ||
|  |    (whereas serial plans need not be syncronized). */ | ||
|  | static double cost_hook(const problem *p, double t, cost_kind k) | ||
|  | { | ||
|  |      MPI_Comm comm = problem_comm(p); | ||
|  |      double tsum; | ||
|  |      if (comm == MPI_COMM_NULL) return t; | ||
|  |      MPI_Allreduce(&t, &tsum, 1, MPI_DOUBLE,  | ||
|  | 		   k == COST_SUM ? MPI_SUM : MPI_MAX, comm); | ||
|  |      return tsum; | ||
|  | } | ||
|  | 
 | ||
/* Used to reject wisdom that is not in sync across all processes
   for an MPI problem, which is critical to ensure that all processes
   decide to use the same MPI plans.  (Even though costs are synchronized,
   above, out-of-sync wisdom may result from plans being produced
   by communicators that do not span all processes, either from a
   user-specified communicator or e.g. from transpose-recurse.)

   Returns nonzero iff every process in the problem's communicator
   holds identical wisdom (same flags and solver index).  NOTE: this
   is a collective operation over the problem's communicator; its
   any_true call pairs with the one in nowisdom_hook below. */
static int wisdom_ok_hook(const problem *p, flags_t flags)
{
     MPI_Comm comm = problem_comm(p);
     int eq_me, eq_all;
     /* unpack flags bitfield, since MPI communications may involve
	byte-order changes and MPI cannot do this for bit fields */
#if SIZEOF_UNSIGNED_INT >= 4 /* must be big enough to hold 20-bit fields */
     unsigned int f[5];
#else
     unsigned long f[5]; /* at least 32 bits as per C standard */
#endif

     if (comm == MPI_COMM_NULL) return 1; /* non-MPI wisdom is always ok */

     if (XM(any_true)(0, comm)) return 0; /* some process had nowisdom_hook */

     /* otherwise, check that the flags and solver index are identical
	on all processes in this problem's communicator.

	TO DO: possibly we can relax strict equality, but it is
	critical to ensure that any flags which affect what plan is
	created (and whether the solver is applicable) are the same,
	e.g. DESTROY_INPUT, NO_UGLY, etcetera.  (If the MPI algorithm
	differs between processes, deadlocks/crashes generally result.) */
     f[0] = flags.l;
     f[1] = flags.hash_info;
     f[2] = flags.timelimit_impatience;
     f[3] = flags.u;
     f[4] = flags.slvndx;
     /* broadcast rank 0's unpacked flags; each process then compares
	its own flags against rank 0's */
     MPI_Bcast(f, 5,
	       SIZEOF_UNSIGNED_INT >= 4 ? MPI_UNSIGNED : MPI_UNSIGNED_LONG,
	       0, comm);
     eq_me = f[0] == flags.l && f[1] == flags.hash_info
	  && f[2] == flags.timelimit_impatience
	  && f[3] == flags.u && f[4] == flags.slvndx;
     /* all processes must agree for the wisdom to be accepted */
     MPI_Allreduce(&eq_me, &eq_all, 1, MPI_INT, MPI_LAND, comm);
     return eq_all;
}
|  | 
 | ||
/* This hook is called when wisdom is not found.  The any_true here
   matches up with the any_true in wisdom_ok_hook, in order to handle
   the case where some processes had wisdom (and called wisdom_ok_hook)
   and some processes didn't have wisdom (and called nowisdom_hook).
   NOTE: for an MPI problem this is a collective call — every process
   in comm must reach either this hook or wisdom_ok_hook. */
static void nowisdom_hook(const problem *p)
{
     MPI_Comm comm = problem_comm(p);
     if (comm == MPI_COMM_NULL) return; /* nothing to do for non-MPI p */
     XM(any_true)(1, comm); /* signal nowisdom to any wisdom_ok_hook */
}
|  | 
 | ||
|  | /* needed to synchronize planner bogosity flag, in case non-MPI problems
 | ||
|  |    on a subset of processes encountered bogus wisdom */ | ||
|  | static wisdom_state_t bogosity_hook(wisdom_state_t state, const problem *p) | ||
|  | { | ||
|  |      MPI_Comm comm = problem_comm(p); | ||
|  |      if (comm != MPI_COMM_NULL /* an MPI problem */ | ||
|  | 	 && XM(any_true)(state == WISDOM_IS_BOGUS, comm)) /* bogus somewhere */ | ||
|  | 	  return WISDOM_IS_BOGUS; | ||
|  |      return state; | ||
|  | } | ||
|  | 
 | ||
/* Install the MPI synchronization hooks into the shared planner and
   register the standard MPI solvers.  Idempotent: after the first
   call, subsequent calls are no-ops until XM(cleanup) resets
   mpi_inited.  Called at the top of every MPI planning routine. */
void XM(init)(void)
{
     if (!mpi_inited) {
	  planner *plnr = X(the_planner)();
	  plnr->cost_hook = cost_hook;
	  plnr->wisdom_ok_hook = wisdom_ok_hook;
	  plnr->nowisdom_hook = nowisdom_hook;
	  plnr->bogosity_hook = bogosity_hook;
          XM(conf_standard)(plnr);
	  mpi_inited = 1;
     }
}
|  | 
 | ||
/* Tear down: run the serial FFTW cleanup and mark the MPI layer as
   uninitialized so a later planning call re-runs XM(init). */
void XM(cleanup)(void)
{
     X(cleanup)();
     mpi_inited = 0;
}
|  | 
 | ||
|  | /*************************************************************************/ | ||
|  | 
 | ||
|  | static dtensor *mkdtensor_api(int rnk, const XM(ddim) *dims0) | ||
|  | { | ||
|  |      dtensor *x = XM(mkdtensor)(rnk); | ||
|  |      int i; | ||
|  |      for (i = 0; i < rnk; ++i) { | ||
|  | 	  x->dims[i].n = dims0[i].n; | ||
|  | 	  x->dims[i].b[IB] = dims0[i].ib; | ||
|  | 	  x->dims[i].b[OB] = dims0[i].ob; | ||
|  |      } | ||
|  |      return x; | ||
|  | } | ||
|  | 
 | ||
/* Construct the dtensor of sizes/blocks used for planning: blocks
   the user specified (nonzero) in dims0 are kept, and unspecified
   (zero) blocks are defaulted so as to spread the data over as many
   of the n_pes processes as possible.  If rdft2 is nonzero, the last
   dimension is temporarily treated as having the complex length
   n/2+1 while choosing blocks.  Returns a canonicalized dtensor;
   caller owns it. */
static dtensor *default_sz(int rnk, const XM(ddim) *dims0, int n_pes,
			   int rdft2)
{
     dtensor *sz = XM(mkdtensor)(rnk);
     dtensor *sz0 = mkdtensor_api(rnk, dims0); /* remembers which blocks were 0 */
     block_kind k;
     int i;

     for (i = 0; i < rnk; ++i)
	  sz->dims[i].n = dims0[i].n;

     /* rdft2: choose blocks based on the complex (n/2+1) length */
     if (rdft2) sz->dims[rnk-1].n = dims0[rnk-1].n / 2 + 1;

     /* user-specified blocks, or "whole dimension" as a placeholder */
     for (i = 0; i < rnk; ++i) {
	  sz->dims[i].b[IB] = dims0[i].ib ? dims0[i].ib : sz->dims[i].n;
	  sz->dims[i].b[OB] = dims0[i].ob ? dims0[i].ob : sz->dims[i].n;
     }

     /* If we haven't used all of the processes yet, and some of the
	block sizes weren't specified (i.e. 0), then set the
	unspecified blocks so as to use as many processes as
	possible with as few distributed dimensions as possible. */
     FORALL_BLOCK_KIND(k) {
	  INT nb = XM(num_blocks_total)(sz, k);
	  INT np = n_pes / nb; /* processes still unused by kind k */
	  for (i = 0; i < rnk && np > 1; ++i)
	       if (!sz0->dims[i].b[k]) { /* only default unspecified blocks */
		    sz->dims[i].b[k] = XM(default_block)(sz->dims[i].n, np);
		    nb *= XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[k]);
		    np = n_pes / nb;
	       }
     }

     /* restore the real length of the last dimension */
     if (rdft2) sz->dims[rnk-1].n = dims0[rnk-1].n;

     /* punt for 1d prime: keep the dimension undistributed */
     if (rnk == 1 && X(is_prime)(sz->dims[0].n))
	  sz->dims[0].b[IB] = sz->dims[0].b[OB] = sz->dims[0].n;

     XM(dtensor_destroy)(sz0);
     sz0 = XM(dtensor_canonical)(sz, 0);
     XM(dtensor_destroy)(sz);
     return sz0;
}
|  | 
 | ||
|  | /* allocate simple local (serial) dims array corresponding to n[rnk] */ | ||
|  | static XM(ddim) *simple_dims(int rnk, const ptrdiff_t *n) | ||
|  | { | ||
|  |      XM(ddim) *dims = (XM(ddim) *) MALLOC(sizeof(XM(ddim)) * rnk, | ||
|  | 						TENSORS); | ||
|  |      int i; | ||
|  |      for (i = 0; i < rnk; ++i) | ||
|  | 	  dims[i].n = dims[i].ib = dims[i].ob = n[i]; | ||
|  |      return dims; | ||
|  | } | ||
|  | 
 | ||
|  | /*************************************************************************/ | ||
|  | 
 | ||
|  | static void local_size(int my_pe, const dtensor *sz, block_kind k, | ||
|  | 		       ptrdiff_t *local_n, ptrdiff_t *local_start) | ||
|  | { | ||
|  |      int i; | ||
|  |      if (my_pe >= XM(num_blocks_total)(sz, k)) | ||
|  | 	  for (i = 0; i < sz->rnk; ++i) | ||
|  | 	       local_n[i] = local_start[i] = 0; | ||
|  |      else { | ||
|  | 	  XM(block_coords)(sz, k, my_pe, local_start); | ||
|  | 	  for (i = 0; i < sz->rnk; ++i) { | ||
|  | 	       local_n[i] = XM(block)(sz->dims[i].n, sz->dims[i].b[k], | ||
|  | 				      local_start[i]); | ||
|  | 	       local_start[i] *= sz->dims[i].b[k]; | ||
|  | 	  } | ||
|  |      } | ||
|  | } | ||
|  | 
 | ||
|  | static INT prod(int rnk, const ptrdiff_t *local_n)  | ||
|  | { | ||
|  |      int i; | ||
|  |      INT N = 1; | ||
|  |      for (i = 0; i < rnk; ++i) N *= local_n[i]; | ||
|  |      return N; | ||
|  | } | ||
|  | 
 | ||
/* Guru local-size query: returns the number of elements (times
   howmany) that this process must allocate for the transform
   described by dims0, which may exceed the local input/output slab
   sizes because some algorithms need extra room for intermediate
   (e.g. transposed) data.  On return, local_n_in/local_start_in and
   local_n_out/local_start_out (each of length rnk) hold this
   process's input/output extents and offsets per dimension. */
ptrdiff_t XM(local_size_guru)(int rnk, const XM(ddim) *dims0,
			      ptrdiff_t howmany, MPI_Comm comm,
			      ptrdiff_t *local_n_in,
			      ptrdiff_t *local_start_in,
			      ptrdiff_t *local_n_out,
			      ptrdiff_t *local_start_out,
			      int sign, unsigned flags)
{
     INT N;
     int my_pe, n_pes, i;
     dtensor *sz;

     if (rnk == 0) /* rank-0 transform: one element per vector entry */
	  return howmany;

     MPI_Comm_rank(comm, &my_pe);
     MPI_Comm_size(comm, &n_pes);
     sz = default_sz(rnk, dims0, n_pes, 0);

     /* Now, we must figure out how much local space the user should
	allocate (or at least an upper bound).  This depends strongly
	on the exact algorithms we employ...ugh!  FIXME: get this info
	from the solvers somehow? */
     N = 1; /* never return zero allocation size */
     if (rnk > 1 && XM(is_block1d)(sz, IB) && XM(is_block1d)(sz, OB)) {
	  INT Nafter;
	  ddim odims[2];

	  /* dft-rank-geq2-transposed */
	  odims[0] = sz->dims[0]; odims[1] = sz->dims[1]; /* save */
	  /* we may need extra space for transposed intermediate data */
	  for (i = 0; i < 2; ++i)
	       if (XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[IB]) == 1 &&
		   XM(num_blocks)(sz->dims[i].n, sz->dims[i].b[OB]) == 1) {
		    /* dim i is currently undistributed: try
		       distributing it for the intermediate step */
		    sz->dims[i].b[IB]
			 = XM(default_block)(sz->dims[i].n, n_pes);
		    sz->dims[1-i].b[IB] = sz->dims[1-i].n;
		    local_size(my_pe, sz, IB, local_n_in, local_start_in);
		    N = X(imax)(N, prod(rnk, local_n_in));
		    sz->dims[i] = odims[i]; /* restore */
		    sz->dims[1-i] = odims[1-i];
		    break;
	       }

	  /* dft-rank-geq2 */
	  Nafter = howmany;
	  for (i = 1; i < sz->rnk; ++i) Nafter *= sz->dims[i].n;
	  /* ceil-divide by howmany since N is multiplied back below */
	  N = X(imax)(N, (sz->dims[0].n
			  * XM(block)(Nafter, XM(default_block)(Nafter, n_pes),
				      my_pe) + howmany - 1) / howmany);

	  /* dft-rank-geq2 with dimensions swapped */
	  Nafter = howmany * sz->dims[0].n;
          for (i = 2; i < sz->rnk; ++i) Nafter *= sz->dims[i].n;
          N = X(imax)(N, (sz->dims[1].n
                          * XM(block)(Nafter, XM(default_block)(Nafter, n_pes),
                                      my_pe) + howmany - 1) / howmany);
     }
     else if (rnk == 1) {
	  if (howmany >= n_pes && !MPI_FLAGS(flags)) { /* dft-rank1-bigvec */
	       ptrdiff_t n[2], start[2];
	       /* treat (n, howmany) as a 2d array distributed along
		  the vector dimension for the intermediate step */
	       dtensor *sz2 = XM(mkdtensor)(2);
	       sz2->dims[0] = sz->dims[0];
	       sz2->dims[0].b[IB] = sz->dims[0].n;
	       sz2->dims[1].n = sz2->dims[1].b[OB] = howmany;
	       sz2->dims[1].b[IB] = XM(default_block)(howmany, n_pes);
	       local_size(my_pe, sz2, IB, n, start);
	       XM(dtensor_destroy)(sz2);
	       N = X(imax)(N, (prod(2, n) + howmany - 1) / howmany);
	  }
	  else { /* dft-rank1 */
	       INT r, m, rblock[2], mblock[2];

	       /* Since the 1d transforms are so different, we require
		  the user to call local_size_1d for this case.  Ugh. */
	       CK(sign == FFTW_FORWARD || sign == FFTW_BACKWARD);

	       /* decompose n = r * m; block sizes follow the radix */
	       if ((r = XM(choose_radix)(sz->dims[0], n_pes, flags, sign,
					 rblock, mblock))) {
		    m = sz->dims[0].n / r;
		    if (flags & FFTW_MPI_SCRAMBLED_IN)
			 sz->dims[0].b[IB] = rblock[IB] * m;
		    else { /* !SCRAMBLED_IN */
			 sz->dims[0].b[IB] = r * mblock[IB];
			 N = X(imax)(N, rblock[IB] * m);
		    }
		    if (flags & FFTW_MPI_SCRAMBLED_OUT)
			 sz->dims[0].b[OB] = r * mblock[OB];
		    else { /* !SCRAMBLED_OUT */
			 N = X(imax)(N, r * mblock[OB]);
			 sz->dims[0].b[OB] = rblock[OB] * m;
		    }
	       }
	  }
     }

     local_size(my_pe, sz, IB, local_n_in, local_start_in);
     local_size(my_pe, sz, OB, local_n_out, local_start_out);

     /* at least, make sure we have enough space to store input & output */
     N = X(imax)(N, X(imax)(prod(rnk, local_n_in), prod(rnk, local_n_out)));

     XM(dtensor_destroy)(sz);
     return N * howmany;
}
|  | 
 | ||
/* Local-size query for the "many" interface with optional transposed
   output: the first dimension is distributed in blocks of xblock for
   input; if yblock < n[1], output is distributed along the second
   dimension in blocks of yblock.  Returns the allocation size (in
   elements, including howmany); *local_nx/*local_x_start describe
   the input slab and *local_ny/*local_y_start the output slab. */
ptrdiff_t XM(local_size_many_transposed)(int rnk, const ptrdiff_t *n,
					 ptrdiff_t howmany,
					 ptrdiff_t xblock, ptrdiff_t yblock,
					 MPI_Comm comm,
					 ptrdiff_t *local_nx,
					 ptrdiff_t *local_x_start,
					 ptrdiff_t *local_ny,
					 ptrdiff_t *local_y_start)
{
     ptrdiff_t N;
     XM(ddim) *dims;
     ptrdiff_t *local;

     if (rnk == 0) { /* rank-0: one element per vector entry, no slab */
	  *local_nx = *local_ny = 1;
	  *local_x_start = *local_y_start = 0;
	  return howmany;
     }

     dims = simple_dims(rnk, n);
     /* scratch for the guru call: four rnk-length arrays
	(n_in, start_in, n_out, start_out) laid out consecutively */
     local = (ptrdiff_t *) MALLOC(sizeof(ptrdiff_t) * rnk * 4, TENSORS);

     /* default 1d block distribution, with transposed output
        if yblock < n[1] */
     dims[0].ib = xblock;
     if (rnk > 1) {
	  if (yblock < n[1])
	       dims[1].ob = yblock;
	  else
	       dims[0].ob = xblock;
     }
     else
	  dims[0].ob = xblock; /* FIXME: 1d not really supported here 
				         since we don't have flags/sign */
     
     N = XM(local_size_guru)(rnk, dims, howmany, comm,
			     local, local + rnk,
			     local + 2*rnk, local + 3*rnk,
			     0, 0);
     *local_nx = local[0];
     *local_x_start = local[rnk];
     if (rnk > 1) { /* y extents come from output dimension 1 */
	  *local_ny = local[2*rnk + 1];
	  *local_y_start = local[3*rnk + 1];
     }
     else {
	  *local_ny = *local_nx;
	  *local_y_start = *local_x_start;
     }
     X(ifree)(local);
     X(ifree)(dims);
     return N;
}
|  | 
 | ||
|  | ptrdiff_t XM(local_size_many)(int rnk, const ptrdiff_t *n, | ||
|  | 			      ptrdiff_t howmany,  | ||
|  | 			      ptrdiff_t xblock, | ||
|  | 			      MPI_Comm comm, | ||
|  | 			      ptrdiff_t *local_nx, | ||
|  | 			      ptrdiff_t *local_x_start) | ||
|  | { | ||
|  |      ptrdiff_t local_ny, local_y_start; | ||
|  |      return XM(local_size_many_transposed)(rnk, n, howmany, | ||
|  | 					   xblock, rnk > 1  | ||
|  | 					   ? n[1] : FFTW_MPI_DEFAULT_BLOCK, | ||
|  | 					   comm, | ||
|  | 					   local_nx, local_x_start, | ||
|  | 					   &local_ny, &local_y_start); | ||
|  | } | ||
|  | 
 | ||
|  | 
 | ||
|  | ptrdiff_t XM(local_size_transposed)(int rnk, const ptrdiff_t *n, | ||
|  | 				    MPI_Comm comm, | ||
|  | 				    ptrdiff_t *local_nx, | ||
|  | 				    ptrdiff_t *local_x_start, | ||
|  | 				    ptrdiff_t *local_ny, | ||
|  | 				    ptrdiff_t *local_y_start) | ||
|  | { | ||
|  |      return XM(local_size_many_transposed)(rnk, n, 1, | ||
|  | 					   FFTW_MPI_DEFAULT_BLOCK, | ||
|  | 					   FFTW_MPI_DEFAULT_BLOCK, | ||
|  | 					   comm, | ||
|  | 					   local_nx, local_x_start, | ||
|  | 					   local_ny, local_y_start); | ||
|  | } | ||
|  | 
 | ||
|  | ptrdiff_t XM(local_size)(int rnk, const ptrdiff_t *n, | ||
|  | 			 MPI_Comm comm, | ||
|  | 			 ptrdiff_t *local_nx, | ||
|  | 			 ptrdiff_t *local_x_start) | ||
|  | { | ||
|  |      return XM(local_size_many)(rnk, n, 1, FFTW_MPI_DEFAULT_BLOCK, comm, | ||
|  | 				local_nx, local_x_start); | ||
|  | } | ||
|  | 
 | ||
|  | ptrdiff_t XM(local_size_many_1d)(ptrdiff_t nx, ptrdiff_t howmany,  | ||
|  | 				 MPI_Comm comm, int sign, unsigned flags, | ||
|  | 				 ptrdiff_t *local_nx, ptrdiff_t *local_x_start, | ||
|  | 				 ptrdiff_t *local_ny, ptrdiff_t *local_y_start) | ||
|  | { | ||
|  |      XM(ddim) d; | ||
|  |      d.n = nx; | ||
|  |      d.ib = d.ob = FFTW_MPI_DEFAULT_BLOCK; | ||
|  |      return XM(local_size_guru)(1, &d, howmany, comm, | ||
|  | 				local_nx, local_x_start, | ||
|  | 				local_ny, local_y_start, sign, flags); | ||
|  | } | ||
|  | 
 | ||
|  | ptrdiff_t XM(local_size_1d)(ptrdiff_t nx, | ||
|  | 			    MPI_Comm comm, int sign, unsigned flags, | ||
|  | 			    ptrdiff_t *local_nx, ptrdiff_t *local_x_start, | ||
|  | 			    ptrdiff_t *local_ny, ptrdiff_t *local_y_start) | ||
|  | { | ||
|  |      return XM(local_size_many_1d)(nx, 1, comm, sign, flags, | ||
|  | 				   local_nx, local_x_start, | ||
|  | 				   local_ny, local_y_start); | ||
|  | } | ||
|  | 
 | ||
|  | ptrdiff_t XM(local_size_2d_transposed)(ptrdiff_t nx, ptrdiff_t ny, | ||
|  | 				       MPI_Comm comm, | ||
|  | 				       ptrdiff_t *local_nx, | ||
|  | 				       ptrdiff_t *local_x_start, | ||
|  | 				       ptrdiff_t *local_ny,  | ||
|  | 				       ptrdiff_t *local_y_start) | ||
|  | { | ||
|  |      ptrdiff_t n[2]; | ||
|  |      n[0] = nx; n[1] = ny; | ||
|  |      return XM(local_size_transposed)(2, n, comm, | ||
|  | 				      local_nx, local_x_start, | ||
|  | 				      local_ny, local_y_start); | ||
|  | } | ||
|  | 
 | ||
|  | ptrdiff_t XM(local_size_2d)(ptrdiff_t nx, ptrdiff_t ny, MPI_Comm comm, | ||
|  | 			       ptrdiff_t *local_nx, ptrdiff_t *local_x_start) | ||
|  | { | ||
|  |      ptrdiff_t n[2]; | ||
|  |      n[0] = nx; n[1] = ny; | ||
|  |      return XM(local_size)(2, n, comm, local_nx, local_x_start); | ||
|  | } | ||
|  | 
 | ||
|  | ptrdiff_t XM(local_size_3d_transposed)(ptrdiff_t nx, ptrdiff_t ny, | ||
|  | 				       ptrdiff_t nz, | ||
|  | 				       MPI_Comm comm, | ||
|  | 				       ptrdiff_t *local_nx, | ||
|  | 				       ptrdiff_t *local_x_start, | ||
|  | 				       ptrdiff_t *local_ny,  | ||
|  | 				       ptrdiff_t *local_y_start) | ||
|  | { | ||
|  |      ptrdiff_t n[3]; | ||
|  |      n[0] = nx; n[1] = ny; n[2] = nz; | ||
|  |      return XM(local_size_transposed)(3, n, comm, | ||
|  | 				      local_nx, local_x_start, | ||
|  | 				      local_ny, local_y_start); | ||
|  | } | ||
|  | 
 | ||
|  | ptrdiff_t XM(local_size_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz, | ||
|  | 			    MPI_Comm comm, | ||
|  | 			    ptrdiff_t *local_nx, ptrdiff_t *local_x_start) | ||
|  | { | ||
|  |      ptrdiff_t n[3]; | ||
|  |      n[0] = nx; n[1] = ny; n[2] = nz; | ||
|  |      return XM(local_size)(3, n, comm, local_nx, local_x_start); | ||
|  | } | ||
|  | 
 | ||
|  | /*************************************************************************/ | ||
|  | /* Transpose API */ | ||
|  | 
 | ||
|  | X(plan) XM(plan_many_transpose)(ptrdiff_t nx, ptrdiff_t ny,  | ||
|  | 				ptrdiff_t howmany, | ||
|  | 				ptrdiff_t xblock, ptrdiff_t yblock, | ||
|  | 				R *in, R *out,  | ||
|  | 				MPI_Comm comm, unsigned flags) | ||
|  | { | ||
|  |      int n_pes; | ||
|  |      XM(init)(); | ||
|  | 
 | ||
|  |      if (howmany < 0 || xblock < 0 || yblock < 0 || | ||
|  | 	 nx <= 0 || ny <= 0) return 0; | ||
|  | 
 | ||
|  |      MPI_Comm_size(comm, &n_pes); | ||
|  |      if (!xblock) xblock = XM(default_block)(nx, n_pes); | ||
|  |      if (!yblock) yblock = XM(default_block)(ny, n_pes); | ||
|  |      if (n_pes < XM(num_blocks)(nx, xblock) | ||
|  | 	 || n_pes < XM(num_blocks)(ny, yblock)) | ||
|  | 	  return 0; | ||
|  | 
 | ||
|  |      return  | ||
|  | 	  X(mkapiplan)(FFTW_FORWARD, flags, | ||
|  | 		       XM(mkproblem_transpose)(nx, ny, howmany, | ||
|  | 					       in, out, xblock, yblock, | ||
|  | 					       comm, MPI_FLAGS(flags))); | ||
|  | } | ||
|  | 
 | ||
|  | X(plan) XM(plan_transpose)(ptrdiff_t nx, ptrdiff_t ny, R *in, R *out,  | ||
|  | 			   MPI_Comm comm, unsigned flags) | ||
|  | 			       | ||
|  | { | ||
|  |      return XM(plan_many_transpose)(nx, ny, 1, | ||
|  | 				    FFTW_MPI_DEFAULT_BLOCK, | ||
|  | 				    FFTW_MPI_DEFAULT_BLOCK, | ||
|  | 				    in, out, comm, flags); | ||
|  | } | ||
|  | 
 | ||
|  | /*************************************************************************/ | ||
|  | /* Complex DFT API */ | ||
|  | 
 | ||
/* Plan a distributed complex DFT via the guru (ddim-based) interface.
   Returns 0 (NULL) on invalid arguments or when the requested block
   layout cannot be distributed over the communicator. */
X(plan) XM(plan_guru_dft)(int rnk, const XM(ddim) *dims0,
			  ptrdiff_t howmany,
			  C *in, C *out,
			  MPI_Comm comm, int sign, unsigned flags)
{
     int n_pes, i;
     dtensor *sz;

     XM(init)();

     /* validate: rank >= 1, positive sizes, nonnegative blocks */
     if (howmany < 0 || rnk < 1) return 0;
     for (i = 0; i < rnk; ++i)
	  if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
	       return 0;

     MPI_Comm_size(comm, &n_pes);
     sz = default_sz(rnk, dims0, n_pes, 0);

     /* more blocks than processes: cannot distribute */
     if (XM(num_blocks_total)(sz, IB) > n_pes
	 || XM(num_blocks_total)(sz, OB) > n_pes) {
	  XM(dtensor_destroy)(sz);
	  return 0;
     }

     /* mkproblem takes ownership of sz */
     return
          X(mkapiplan)(sign, flags,
                       XM(mkproblem_dft_d)(sz, howmany,
					   (R *) in, (R *) out,
					   comm, sign,
					   MPI_FLAGS(flags)));
}
|  | 
 | ||
|  | X(plan) XM(plan_many_dft)(int rnk, const ptrdiff_t *n, | ||
|  | 			  ptrdiff_t howmany, | ||
|  | 			  ptrdiff_t iblock, ptrdiff_t oblock, | ||
|  | 			  C *in, C *out, | ||
|  | 			  MPI_Comm comm, int sign, unsigned flags) | ||
|  | { | ||
|  |      XM(ddim) *dims = simple_dims(rnk, n); | ||
|  |      X(plan) pln; | ||
|  | 
 | ||
|  |      if (rnk == 1) { | ||
|  | 	  dims[0].ib = iblock; | ||
|  | 	  dims[0].ob = oblock; | ||
|  |      } | ||
|  |      else if (rnk > 1) { | ||
|  | 	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock; | ||
|  | 	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock; | ||
|  |      } | ||
|  | 
 | ||
|  |      pln = XM(plan_guru_dft)(rnk,dims,howmany, in,out, comm, sign, flags); | ||
|  |      X(ifree)(dims); | ||
|  |      return pln; | ||
|  | } | ||
|  | 
 | ||
|  | X(plan) XM(plan_dft)(int rnk, const ptrdiff_t *n, C *in, C *out, | ||
|  | 		     MPI_Comm comm, int sign, unsigned flags) | ||
|  | { | ||
|  |      return XM(plan_many_dft)(rnk, n, 1, | ||
|  | 			      FFTW_MPI_DEFAULT_BLOCK, | ||
|  | 			      FFTW_MPI_DEFAULT_BLOCK, | ||
|  | 			      in, out, comm, sign, flags); | ||
|  | } | ||
|  | 
 | ||
|  | X(plan) XM(plan_dft_1d)(ptrdiff_t nx, C *in, C *out, | ||
|  | 			MPI_Comm comm, int sign, unsigned flags) | ||
|  | { | ||
|  |      return XM(plan_dft)(1, &nx, in, out, comm, sign, flags); | ||
|  | } | ||
|  | 
 | ||
|  | X(plan) XM(plan_dft_2d)(ptrdiff_t nx, ptrdiff_t ny, C *in, C *out, | ||
|  | 			MPI_Comm comm, int sign, unsigned flags) | ||
|  | { | ||
|  |      ptrdiff_t n[2]; | ||
|  |      n[0] = nx; n[1] = ny; | ||
|  |      return XM(plan_dft)(2, n, in, out, comm, sign, flags); | ||
|  | } | ||
|  | 
 | ||
|  | X(plan) XM(plan_dft_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz, | ||
|  | 			C *in, C *out, | ||
|  | 			MPI_Comm comm, int sign, unsigned flags) | ||
|  | { | ||
|  |      ptrdiff_t n[3]; | ||
|  |      n[0] = nx; n[1] = ny; n[2] = nz; | ||
|  |      return XM(plan_dft)(3, n, in, out, comm, sign, flags); | ||
|  | } | ||
|  | 
 | ||
|  | /*************************************************************************/ | ||
|  | /* R2R API */ | ||
|  | 
 | ||
|  | X(plan) XM(plan_guru_r2r)(int rnk, const XM(ddim) *dims0, | ||
|  | 			  ptrdiff_t howmany, | ||
|  | 			  R *in, R *out, | ||
|  | 			  MPI_Comm comm, const X(r2r_kind) *kind, | ||
|  | 			  unsigned flags) | ||
|  | { | ||
|  |      int n_pes, i; | ||
|  |      dtensor *sz; | ||
|  |      rdft_kind *k; | ||
|  |      X(plan) pln; | ||
|  |       | ||
|  |      XM(init)(); | ||
|  | 
 | ||
|  |      if (howmany < 0 || rnk < 1) return 0; | ||
|  |      for (i = 0; i < rnk; ++i) | ||
|  | 	  if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0) | ||
|  | 	       return 0; | ||
|  | 
 | ||
|  |      k = X(map_r2r_kind)(rnk, kind); | ||
|  | 
 | ||
|  |      MPI_Comm_size(comm, &n_pes); | ||
|  |      sz = default_sz(rnk, dims0, n_pes, 0); | ||
|  | 
 | ||
|  |      if (XM(num_blocks_total)(sz, IB) > n_pes | ||
|  | 	 || XM(num_blocks_total)(sz, OB) > n_pes) { | ||
|  | 	  XM(dtensor_destroy)(sz); | ||
|  | 	  return 0; | ||
|  |      } | ||
|  | 
 | ||
|  |      pln = X(mkapiplan)(0, flags, | ||
|  | 			XM(mkproblem_rdft_d)(sz, howmany, | ||
|  | 					     in, out, | ||
|  | 					     comm, k, MPI_FLAGS(flags))); | ||
|  |      X(ifree0)(k); | ||
|  |      return pln; | ||
|  | } | ||
|  | 
 | ||
|  | X(plan) XM(plan_many_r2r)(int rnk, const ptrdiff_t *n, | ||
|  | 			  ptrdiff_t howmany, | ||
|  | 			  ptrdiff_t iblock, ptrdiff_t oblock, | ||
|  | 			  R *in, R *out, | ||
|  | 			  MPI_Comm comm, const X(r2r_kind) *kind, | ||
|  | 			  unsigned flags) | ||
|  | { | ||
|  |      XM(ddim) *dims = simple_dims(rnk, n); | ||
|  |      X(plan) pln; | ||
|  | 
 | ||
|  |      if (rnk == 1) { | ||
|  | 	  dims[0].ib = iblock; | ||
|  | 	  dims[0].ob = oblock; | ||
|  |      } | ||
|  |      else if (rnk > 1) { | ||
|  | 	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock; | ||
|  | 	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock; | ||
|  |      } | ||
|  | 
 | ||
|  |      pln = XM(plan_guru_r2r)(rnk,dims,howmany, in,out, comm, kind, flags); | ||
|  |      X(ifree)(dims); | ||
|  |      return pln; | ||
|  | } | ||
|  | 
 | ||
|  | X(plan) XM(plan_r2r)(int rnk, const ptrdiff_t *n, R *in, R *out, | ||
|  | 		     MPI_Comm comm,  | ||
|  | 		     const X(r2r_kind) *kind, | ||
|  | 		     unsigned flags) | ||
|  | { | ||
|  |      return XM(plan_many_r2r)(rnk, n, 1, | ||
|  | 			      FFTW_MPI_DEFAULT_BLOCK, | ||
|  | 			      FFTW_MPI_DEFAULT_BLOCK, | ||
|  | 			      in, out, comm, kind, flags); | ||
|  | } | ||
|  | 
 | ||
|  | X(plan) XM(plan_r2r_2d)(ptrdiff_t nx, ptrdiff_t ny, R *in, R *out, | ||
|  | 			MPI_Comm comm, | ||
|  | 			X(r2r_kind) kindx, X(r2r_kind) kindy, | ||
|  | 			unsigned flags) | ||
|  | { | ||
|  |      ptrdiff_t n[2]; | ||
|  |      X(r2r_kind) kind[2]; | ||
|  |      n[0] = nx; n[1] = ny; | ||
|  |      kind[0] = kindx; kind[1] = kindy; | ||
|  |      return XM(plan_r2r)(2, n, in, out, comm, kind, flags); | ||
|  | } | ||
|  | 
 | ||
|  | X(plan) XM(plan_r2r_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz, | ||
|  | 			R *in, R *out, | ||
|  | 			MPI_Comm comm,  | ||
|  | 			X(r2r_kind) kindx, X(r2r_kind) kindy, | ||
|  | 			X(r2r_kind) kindz, | ||
|  | 			unsigned flags) | ||
|  | { | ||
|  |      ptrdiff_t n[3]; | ||
|  |      X(r2r_kind) kind[3]; | ||
|  |      n[0] = nx; n[1] = ny; n[2] = nz; | ||
|  |      kind[0] = kindx; kind[1] = kindy; kind[2] = kindz; | ||
|  |      return XM(plan_r2r)(3, n, in, out, comm, kind, flags); | ||
|  | } | ||
|  | 
 | ||
|  | /*************************************************************************/ | ||
|  | /* R2C/C2R API */ | ||
|  | 
 | ||
/* Common implementation of the r2c and c2r planning interfaces:
   kind == R2HC plans real input r -> complex output c, kind == HC2R
   plans complex input c -> real output r.  Returns 0 (NULL) on
   invalid arguments or an undistributable block layout. */
static X(plan) plan_guru_rdft2(int rnk, const XM(ddim) *dims0,
			       ptrdiff_t howmany,
			       R *r, C *c,
			       MPI_Comm comm, rdft_kind kind, unsigned flags)
{
     int n_pes, i;
     dtensor *sz;
     R *cr = (R *) c; /* the complex array viewed as interleaved reals */

     XM(init)();

     /* validate: the MPI r2c/c2r interface requires rank >= 2 */
     if (howmany < 0 || rnk < 2) return 0;
     for (i = 0; i < rnk; ++i)
	  if (dims0[i].n < 1 || dims0[i].ib < 0 || dims0[i].ob < 0)
	       return 0;

     MPI_Comm_size(comm, &n_pes);
     sz = default_sz(rnk, dims0, n_pes, 1);

     /* count blocks using the complex (n/2+1) length of the last
	dimension, then restore the real length afterwards */
     sz->dims[rnk-1].n = dims0[rnk-1].n / 2 + 1;
     if (XM(num_blocks_total)(sz, IB) > n_pes
	 || XM(num_blocks_total)(sz, OB) > n_pes) {
	  XM(dtensor_destroy)(sz);
	  return 0;
     }
     sz->dims[rnk-1].n = dims0[rnk-1].n;

     /* mkproblem takes ownership of sz; argument order encodes the
	transform direction (real first for R2HC, complex first for HC2R) */
     if (kind == R2HC)
	  return X(mkapiplan)(0, flags,
			      XM(mkproblem_rdft2_d)(sz, howmany,
						    r, cr, comm, R2HC,
						    MPI_FLAGS(flags)));
     else
	  return X(mkapiplan)(0, flags,
			      XM(mkproblem_rdft2_d)(sz, howmany,
						    cr, r, comm, HC2R,
						    MPI_FLAGS(flags)));
}
|  | 
 | ||
|  | X(plan) XM(plan_many_dft_r2c)(int rnk, const ptrdiff_t *n, | ||
|  | 			  ptrdiff_t howmany, | ||
|  | 			  ptrdiff_t iblock, ptrdiff_t oblock, | ||
|  | 			  R *in, C *out, | ||
|  | 			  MPI_Comm comm, unsigned flags) | ||
|  | { | ||
|  |      XM(ddim) *dims = simple_dims(rnk, n); | ||
|  |      X(plan) pln; | ||
|  | 
 | ||
|  |      if (rnk == 1) { | ||
|  | 	  dims[0].ib = iblock; | ||
|  | 	  dims[0].ob = oblock; | ||
|  |      } | ||
|  |      else if (rnk > 1) { | ||
|  | 	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock; | ||
|  | 	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock; | ||
|  |      } | ||
|  | 
 | ||
|  |      pln = plan_guru_rdft2(rnk,dims,howmany, in,out, comm, R2HC, flags); | ||
|  |      X(ifree)(dims); | ||
|  |      return pln; | ||
|  | } | ||
|  | 
 | ||
|  | X(plan) XM(plan_many_dft_c2r)(int rnk, const ptrdiff_t *n, | ||
|  | 			  ptrdiff_t howmany, | ||
|  | 			  ptrdiff_t iblock, ptrdiff_t oblock, | ||
|  | 			  C *in, R *out, | ||
|  | 			  MPI_Comm comm, unsigned flags) | ||
|  | { | ||
|  |      XM(ddim) *dims = simple_dims(rnk, n); | ||
|  |      X(plan) pln; | ||
|  | 
 | ||
|  |      if (rnk == 1) { | ||
|  | 	  dims[0].ib = iblock; | ||
|  | 	  dims[0].ob = oblock; | ||
|  |      } | ||
|  |      else if (rnk > 1) { | ||
|  | 	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_IN)].ib = iblock; | ||
|  | 	  dims[0 != (flags & FFTW_MPI_TRANSPOSED_OUT)].ob = oblock; | ||
|  |      } | ||
|  | 
 | ||
|  |      pln = plan_guru_rdft2(rnk,dims,howmany, out,in, comm, HC2R, flags); | ||
|  |      X(ifree)(dims); | ||
|  |      return pln; | ||
|  | } | ||
|  | 
 | ||
|  | X(plan) XM(plan_dft_r2c)(int rnk, const ptrdiff_t *n, R *in, C *out, | ||
|  | 		     MPI_Comm comm, unsigned flags) | ||
|  | { | ||
|  |      return XM(plan_many_dft_r2c)(rnk, n, 1, | ||
|  | 			      FFTW_MPI_DEFAULT_BLOCK, | ||
|  | 			      FFTW_MPI_DEFAULT_BLOCK, | ||
|  | 			      in, out, comm, flags); | ||
|  | } | ||
|  | 
 | ||
|  | X(plan) XM(plan_dft_r2c_2d)(ptrdiff_t nx, ptrdiff_t ny, R *in, C *out, | ||
|  | 			MPI_Comm comm, unsigned flags) | ||
|  | { | ||
|  |      ptrdiff_t n[2]; | ||
|  |      n[0] = nx; n[1] = ny; | ||
|  |      return XM(plan_dft_r2c)(2, n, in, out, comm, flags); | ||
|  | } | ||
|  | 
 | ||
|  | X(plan) XM(plan_dft_r2c_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz, | ||
|  | 			R *in, C *out, MPI_Comm comm, unsigned flags) | ||
|  | { | ||
|  |      ptrdiff_t n[3]; | ||
|  |      n[0] = nx; n[1] = ny; n[2] = nz; | ||
|  |      return XM(plan_dft_r2c)(3, n, in, out, comm, flags); | ||
|  | } | ||
|  | 
 | ||
|  | X(plan) XM(plan_dft_c2r)(int rnk, const ptrdiff_t *n, C *in, R *out, | ||
|  | 		     MPI_Comm comm, unsigned flags) | ||
|  | { | ||
|  |      return XM(plan_many_dft_c2r)(rnk, n, 1, | ||
|  | 			      FFTW_MPI_DEFAULT_BLOCK, | ||
|  | 			      FFTW_MPI_DEFAULT_BLOCK, | ||
|  | 			      in, out, comm, flags); | ||
|  | } | ||
|  | 
 | ||
|  | X(plan) XM(plan_dft_c2r_2d)(ptrdiff_t nx, ptrdiff_t ny, C *in, R *out, | ||
|  | 			MPI_Comm comm, unsigned flags) | ||
|  | { | ||
|  |      ptrdiff_t n[2]; | ||
|  |      n[0] = nx; n[1] = ny; | ||
|  |      return XM(plan_dft_c2r)(2, n, in, out, comm, flags); | ||
|  | } | ||
|  | 
 | ||
|  | X(plan) XM(plan_dft_c2r_3d)(ptrdiff_t nx, ptrdiff_t ny, ptrdiff_t nz, | ||
|  | 			C *in, R *out, MPI_Comm comm, unsigned flags) | ||
|  | { | ||
|  |      ptrdiff_t n[3]; | ||
|  |      n[0] = nx; n[1] = ny; n[2] = nz; | ||
|  |      return XM(plan_dft_c2r)(3, n, in, out, comm, flags); | ||
|  | } | ||
|  | 
 | ||
|  | /*************************************************************************/ | ||
|  | /* New-array execute functions */ | ||
|  | 
 | ||
|  | void XM(execute_dft)(const X(plan) p, C *in, C *out) { | ||
|  |      /* internally, MPI plans are just rdft plans */ | ||
|  |      X(execute_r2r)(p, (R*) in, (R*) out); | ||
|  | } | ||
|  | 
 | ||
|  | void XM(execute_dft_r2c)(const X(plan) p, R *in, C *out) { | ||
|  |      /* internally, MPI plans are just rdft plans */ | ||
|  |      X(execute_r2r)(p, in, (R*) out); | ||
|  | } | ||
|  | 
 | ||
|  | void XM(execute_dft_c2r)(const X(plan) p, C *in, R *out) { | ||
|  |      /* internally, MPI plans are just rdft plans */ | ||
|  |      X(execute_r2r)(p, (R*) in, out); | ||
|  | } | ||
|  | 
 | ||
|  | void XM(execute_r2r)(const X(plan) p, R *in, R *out) { | ||
|  |      /* internally, MPI plans are just rdft plans */ | ||
|  |      X(execute_r2r)(p, in, out); | ||
|  | } |