84 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
		
		
			
		
	
	
			84 lines
		
	
	
		
			3.4 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
|   | /*
 | ||
|  |  * Copyright (c) 2003, 2007-14 Matteo Frigo | ||
|  |  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology | ||
|  |  * | ||
|  |  * This program is free software; you can redistribute it and/or modify | ||
|  |  * it under the terms of the GNU General Public License as published by | ||
|  |  * the Free Software Foundation; either version 2 of the License, or | ||
|  |  * (at your option) any later version. | ||
|  |  * | ||
|  |  * This program is distributed in the hope that it will be useful, | ||
|  |  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
|  |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||
|  |  * GNU General Public License for more details. | ||
|  |  * | ||
|  |  * You should have received a copy of the GNU General Public License | ||
|  |  * along with this program; if not, write to the Free Software | ||
|  |  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA | ||
|  |  * | ||
|  |  */ | ||
|  | 
 | ||
|  | #include "ifftw-mpi.h"
 | ||
|  | 
 | ||
|  | /* Return the radix r for a 1d MPI transform of a distributed dimension d,
 | ||
|  |    with the given flags and transform size.   That is, decomposes d.n | ||
|  |    as r * m, Cooley-Tukey style.  Also computes the block sizes rblock | ||
|  |    and mblock.  Returns 0 if such a decomposition is not feasible. | ||
|  |    This is unfortunately somewhat complicated. | ||
|  | 
 | ||
|  |    A distributed Cooley-Tukey algorithm works as follows (see dft-rank1.c): | ||
|  | 
 | ||
|  |    d.n is initially distributed as an m x r array with block size mblock[IB]. | ||
|  |    Then it is internally transposed to an r x m array with block size | ||
|  |    rblock[IB].  Then it is internally transposed to m x r again with block | ||
|  |    size mblock[OB].  Finally, it is transposed to r x m with block size | ||
|  |    rblock[IB]. | ||
|  | 
 | ||
|  |    If flags & SCRAMBLED_IN, then the first transpose is skipped (the array | ||
|  |    starts out as r x m).  If flags & SCRAMBLED_OUT, then the last transpose | ||
|  |    is skipped (the array ends up as m x r).  To make sure the forward | ||
|  |    and backward transforms use the same "scrambling" format, we swap r | ||
|  |    and m when sign != FFT_SIGN. | ||
|  | 
 | ||
|  |    There are some downsides to this, especially in the case where | ||
|  |    either m or r is not divisible by n_pes.  For one thing, it means | ||
|  |    that in general we can't use the same block size for the input and | ||
|  |    output.  For another thing, it means that we can't in general honor | ||
|  |    a user's "requested" block sizes in d.b[].  Therefore, for simplicity, | ||
|  |    we simply ignore d.b[] for now. | ||
|  | */ | ||
|  | INT XM(choose_radix)(ddim d, int n_pes, unsigned flags, int sign, | ||
|  | 		     INT rblock[2], INT mblock[2]) | ||
|  | { | ||
|  |      INT r, m; | ||
|  | 
 | ||
|  |      UNUSED(flags); /* we would need this if we paid attention to d.b[*] */ | ||
|  | 
 | ||
|  |      /* If n_pes is a factor of d.n, then choose r to be d.n / n_pes.
 | ||
|  |         This not only ensures that the input (the m dimension) is | ||
|  |         equally distributed if possible, and at the r dimension is | ||
|  |         maximally equally distributed (if d.n/n_pes >= n_pes), it also | ||
|  |         makes one of the local transpositions in the algorithm | ||
|  |         trivial. */ | ||
|  |      if (d.n % n_pes == 0 /* it's good if n_pes divides d.n ...*/ | ||
|  | 	 && d.n / n_pes >= n_pes /* .. unless we can't use n_pes processes */) | ||
|  | 	  r = d.n / n_pes; | ||
|  |      else {  /* n_pes does not divide d.n, pick a factor close to sqrt(d.n) */ | ||
|  | 	  for (r = X(isqrt)(d.n); d.n % r != 0; ++r) | ||
|  | 	       ; | ||
|  |      } | ||
|  |      if (r == 1 || r == d.n) return 0; /* punt if we can't reduce size */ | ||
|  | 
 | ||
|  |      if (sign != FFT_SIGN) { /* swap {m,r} so that scrambling is reversible */ | ||
|  | 	  m = r; | ||
|  | 	  r = d.n / m; | ||
|  |      } | ||
|  |      else | ||
|  | 	  m = d.n / r; | ||
|  | 
 | ||
|  |      rblock[IB] = rblock[OB] = XM(default_block)(r, n_pes); | ||
|  |      mblock[IB] = mblock[OB] = XM(default_block)(m, n_pes); | ||
|  | 
 | ||
|  |      return r; | ||
|  | } |