778 lines
		
	
	
		
			22 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
		
		
			
		
	
	
			778 lines
		
	
	
		
			22 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| 
								 | 
							
								/*
							 | 
						||
| 
								 | 
							
								 * Copyright (c) 2003, 2007-14 Matteo Frigo
							 | 
						||
| 
								 | 
							
								 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
							 | 
						||
| 
								 | 
							
								 *
							 | 
						||
| 
								 | 
							
								 * This program is free software; you can redistribute it and/or modify
							 | 
						||
| 
								 | 
							
								 * it under the terms of the GNU General Public License as published by
							 | 
						||
| 
								 | 
							
								 * the Free Software Foundation; either version 2 of the License, or
							 | 
						||
| 
								 | 
							
								 * (at your option) any later version.
							 | 
						||
| 
								 | 
							
								 *
							 | 
						||
| 
								 | 
							
								 * This program is distributed in the hope that it will be useful,
							 | 
						||
| 
								 | 
							
								 * but WITHOUT ANY WARRANTY; without even the implied warranty of
							 | 
						||
| 
								 | 
							
								 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
							 | 
						||
| 
								 | 
							
								 * GNU General Public License for more details.
							 | 
						||
| 
								 | 
							
								 *
							 | 
						||
| 
								 | 
							
								 * You should have received a copy of the GNU General Public License
							 | 
						||
| 
								 | 
							
								 * along with this program; if not, write to the Free Software
							 | 
						||
| 
								 | 
							
								 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
							 | 
						||
| 
								 | 
							
								 *
							 | 
						||
| 
								 | 
							
								 */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/* rank-0, vector-rank-3, non-square in-place transposition
							 | 
						||
| 
								 | 
							
								   (see rank0.c for square transposition)  */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#include "rdft/rdft.h"
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#ifdef HAVE_STRING_H
							 | 
						||
| 
								 | 
							
								#include <string.h>		/* for memcpy() */
							 | 
						||
| 
								 | 
							
								#endif
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								struct P_s;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								typedef struct {
							 | 
						||
| 
								 | 
							
								     rdftapply apply;
							 | 
						||
| 
								 | 
							
								     int (*applicable)(const problem_rdft *p, planner *plnr,
							 | 
						||
| 
								 | 
							
										       int dim0, int dim1, int dim2, INT *nbuf);
							 | 
						||
| 
								 | 
							
								     int (*mkcldrn)(const problem_rdft *p, planner *plnr, struct P_s *ego);
							 | 
						||
| 
								 | 
							
								     const char *nam;
							 | 
						||
| 
								 | 
							
								} transpose_adt;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								typedef struct {
							 | 
						||
| 
								 | 
							
								     solver super;
							 | 
						||
| 
								 | 
							
								     const transpose_adt *adt;
							 | 
						||
| 
								 | 
							
								} S;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								typedef struct P_s {
							 | 
						||
| 
								 | 
							
								     plan_rdft super;
							 | 
						||
| 
								 | 
							
								     INT n, m, vl; /* transpose n x m matrix of vl-tuples */
							 | 
						||
| 
								 | 
							
								     INT nbuf; /* buffer size */
							 | 
						||
| 
								 | 
							
								     INT nd, md, d; /* transpose-gcd params */
							 | 
						||
| 
								 | 
							
								     INT nc, mc; /* transpose-cut params */
							 | 
						||
| 
								 | 
							
								     plan *cld1, *cld2, *cld3; /* children, null if unused */
							 | 
						||
| 
								 | 
							
								     const S *slv;
							 | 
						||
| 
								 | 
							
								} P;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/*************************************************************************/
							 | 
						||
| 
								 | 
							
								/* some utilities for the solvers */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static INT gcd(INT a, INT b)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     INT r;
							 | 
						||
| 
								 | 
							
								     do {
							 | 
						||
| 
								 | 
							
									  r = a % b;
							 | 
						||
| 
								 | 
							
									  a = b;
							 | 
						||
| 
								 | 
							
									  b = r;
							 | 
						||
| 
								 | 
							
								     } while (r != 0);
							 | 
						||
| 
								 | 
							
								     
							 | 
						||
| 
								 | 
							
								     return a;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/* whether we can transpose with one of our routines expecting
							 | 
						||
| 
								 | 
							
								   contiguous Ntuples */
							 | 
						||
| 
								 | 
							
								static int Ntuple_transposable(const iodim *a, const iodim *b, INT vl, INT vs)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     return (vs == 1 && b->is == vl && a->os == vl &&
							 | 
						||
| 
								 | 
							
									     ((a->n == b->n && a->is == b->os
							 | 
						||
| 
								 | 
							
									       && a->is >= b->n && a->is % vl == 0)
							 | 
						||
| 
								 | 
							
									      || (a->is == b->n * vl && b->os == a->n * vl)));
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/* check whether a and b correspond to the first and second dimensions
							 | 
						||
| 
								 | 
							
								   of a transpose of tuples with vector length = vl, stride = vs. */
							 | 
						||
| 
								 | 
							
								static int transposable(const iodim *a, const iodim *b, INT vl, INT vs)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     return ((a->n == b->n && a->os == b->is && a->is == b->os)
							 | 
						||
| 
								 | 
							
								             || Ntuple_transposable(a, b, vl, vs));
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static int pickdim(const tensor *s, int *pdim0, int *pdim1, int *pdim2)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     int dim0, dim1;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     for (dim0 = 0; dim0 < s->rnk; ++dim0)
							 | 
						||
| 
								 | 
							
								          for (dim1 = 0; dim1 < s->rnk; ++dim1) {
							 | 
						||
| 
								 | 
							
									       int dim2 = 3 - dim0 - dim1;
							 | 
						||
| 
								 | 
							
									       if (dim0 == dim1) continue;
							 | 
						||
| 
								 | 
							
								               if ((s->rnk == 2 || s->dims[dim2].is == s->dims[dim2].os)
							 | 
						||
| 
								 | 
							
										   && transposable(s->dims + dim0, s->dims + dim1, 
							 | 
						||
| 
								 | 
							
												   s->rnk == 2 ? (INT)1 : s->dims[dim2].n,
							 | 
						||
| 
								 | 
							
												   s->rnk == 2 ? (INT)1 : s->dims[dim2].is)) {
							 | 
						||
| 
								 | 
							
								                    *pdim0 = dim0;
							 | 
						||
| 
								 | 
							
								                    *pdim1 = dim1;
							 | 
						||
| 
								 | 
							
										    *pdim2 = dim2;
							 | 
						||
| 
								 | 
							
								                    return 1;
							 | 
						||
| 
								 | 
							
								               }
							 | 
						||
| 
								 | 
							
									  }
							 | 
						||
| 
								 | 
							
								     return 0;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#define MINBUFDIV 9 /* min factor by which buffer is smaller than data */
							 | 
						||
| 
								 | 
							
								#define MAXBUF 65536 /* maximum non-ugly buffer */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/* generic applicability function */
							 | 
						||
| 
								 | 
							
								static int applicable(const solver *ego_, const problem *p_, planner *plnr,
							 | 
						||
| 
								 | 
							
										      int *dim0, int *dim1, int *dim2, INT *nbuf)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     const S *ego = (const S *) ego_;
							 | 
						||
| 
								 | 
							
								     const problem_rdft *p = (const problem_rdft *) p_;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     return (1
							 | 
						||
| 
								 | 
							
									     && p->I == p->O
							 | 
						||
| 
								 | 
							
									     && p->sz->rnk == 0
							 | 
						||
| 
								 | 
							
									     && (p->vecsz->rnk == 2 || p->vecsz->rnk == 3)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
									     && pickdim(p->vecsz, dim0, dim1, dim2)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
									     /* UGLY if vecloop in wrong order for locality */
							 | 
						||
| 
								 | 
							
									     && (!NO_UGLYP(plnr) ||
							 | 
						||
| 
								 | 
							
										 p->vecsz->rnk == 2 ||
							 | 
						||
| 
								 | 
							
										 X(iabs)(p->vecsz->dims[*dim2].is)
							 | 
						||
| 
								 | 
							
										 < X(imax)(X(iabs)(p->vecsz->dims[*dim0].is),
							 | 
						||
| 
								 | 
							
											   X(iabs)(p->vecsz->dims[*dim0].os)))
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
									     /* SLOW if non-square */
							 | 
						||
| 
								 | 
							
									     && (!NO_SLOWP(plnr)
							 | 
						||
| 
								 | 
							
										 || p->vecsz->dims[*dim0].n == p->vecsz->dims[*dim1].n)
							 | 
						||
| 
								 | 
							
										      
							 | 
						||
| 
								 | 
							
									     && ego->adt->applicable(p, plnr, *dim0,*dim1,*dim2,nbuf)
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
									     /* buffers too big are UGLY */
							 | 
						||
| 
								 | 
							
									     && ((!NO_UGLYP(plnr) && !CONSERVE_MEMORYP(plnr))
							 | 
						||
| 
								 | 
							
										 || *nbuf <= MAXBUF
							 | 
						||
| 
								 | 
							
										 || *nbuf * MINBUFDIV <= X(tensor_sz)(p->vecsz))
							 | 
						||
| 
								 | 
							
									  );
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static void get_transpose_vec(const problem_rdft *p, int dim2, INT *vl,INT *vs)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     if (p->vecsz->rnk == 2) {
							 | 
						||
| 
								 | 
							
									  *vl = 1; *vs = 1;
							 | 
						||
| 
								 | 
							
								     }
							 | 
						||
| 
								 | 
							
								     else {
							 | 
						||
| 
								 | 
							
									  *vl = p->vecsz->dims[dim2].n;
							 | 
						||
| 
								 | 
							
									  *vs = p->vecsz->dims[dim2].is; /* == os */
							 | 
						||
| 
								 | 
							
								     }  
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/*************************************************************************/
							 | 
						||
| 
								 | 
							
								/* Cache-oblivious in-place transpose of non-square matrices, based 
							 | 
						||
| 
								 | 
							
								   on transposes of blocks given by the gcd of the dimensions.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								   This algorithm is related to algorithm V5 from Murray Dow,
							 | 
						||
| 
								 | 
							
								   "Transposing a matrix on a vector computer," Parallel Computing 21
							 | 
						||
| 
								 | 
							
								   (12), 1997-2005 (1995), with the modification that we use
							 | 
						||
| 
								 | 
							
								   cache-oblivious recursive transpose subroutines (and we derived
							 | 
						||
| 
								 | 
							
								   it independently).
							 | 
						||
| 
								 | 
							
								   
							 | 
						||
| 
								 | 
							
								   For a p x q matrix, this requires scratch space equal to the size
							 | 
						||
| 
								 | 
							
								   of the matrix divided by gcd(p,q).  Alternatively, see also the
							 | 
						||
| 
								 | 
							
								   "cut" algorithm below, if |p-q| * gcd(p,q) < max(p,q). */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static void apply_gcd(const plan *ego_, R *I, R *O)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     const P *ego = (const P *) ego_;
							 | 
						||
| 
								 | 
							
								     INT n = ego->nd, m = ego->md, d = ego->d;
							 | 
						||
| 
								 | 
							
								     INT vl = ego->vl;
							 | 
						||
| 
								 | 
							
								     R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
							 | 
						||
| 
								 | 
							
								     INT i, num_el = n*m*d*vl;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     A(ego->n == n * d && ego->m == m * d);
							 | 
						||
| 
								 | 
							
								     UNUSED(O);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     /* Transpose the matrix I in-place, where I is an (n*d) x (m*d) matrix
							 | 
						||
| 
								 | 
							
									of vl-tuples and buf contains n*m*d*vl elements.  
							 | 
						||
| 
								 | 
							
									
							 | 
						||
| 
								 | 
							
									In general, to transpose a p x q matrix, you should call this
							 | 
						||
| 
								 | 
							
									routine with d = gcd(p, q), n = p/d, and m = q/d.  */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     A(n > 0 && m > 0 && vl > 0);
							 | 
						||
| 
								 | 
							
								     A(d > 1);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     /* treat as (d x n) x (d' x m) matrix.  (d' = d) */
							 | 
						||
| 
								 | 
							
								     
							 | 
						||
| 
								 | 
							
								     /* First, transpose d x (n x d') x m to d x (d' x n) x m,
							 | 
						||
| 
								 | 
							
									using the buf matrix.  This consists of d transposes
							 | 
						||
| 
								 | 
							
									of contiguous n x d' matrices of m-tuples. */
							 | 
						||
| 
								 | 
							
								     if (n > 1) {
							 | 
						||
| 
								 | 
							
									  rdftapply cldapply = ((plan_rdft *) ego->cld1)->apply;
							 | 
						||
| 
								 | 
							
									  for (i = 0; i < d; ++i) {
							 | 
						||
| 
								 | 
							
									       cldapply(ego->cld1, I + i*num_el, buf);
							 | 
						||
| 
								 | 
							
									       memcpy(I + i*num_el, buf, num_el*sizeof(R));
							 | 
						||
| 
								 | 
							
									  }
							 | 
						||
| 
								 | 
							
								     }
							 | 
						||
| 
								 | 
							
								     
							 | 
						||
| 
								 | 
							
								     /* Now, transpose (d x d') x (n x m) to (d' x d) x (n x m), which
							 | 
						||
| 
								 | 
							
									is a square in-place transpose of n*m-tuples: */
							 | 
						||
| 
								 | 
							
								     {
							 | 
						||
| 
								 | 
							
									  rdftapply cldapply = ((plan_rdft *) ego->cld2)->apply;
							 | 
						||
| 
								 | 
							
									  cldapply(ego->cld2, I, I);
							 | 
						||
| 
								 | 
							
								     }
							 | 
						||
| 
								 | 
							
								     
							 | 
						||
| 
								 | 
							
								     /* Finally, transpose d' x ((d x n) x m) to d' x (m x (d x n)),
							 | 
						||
| 
								 | 
							
									using the buf matrix.  This consists of d' transposes
							 | 
						||
| 
								 | 
							
									of contiguous d*n x m matrices. */
							 | 
						||
| 
								 | 
							
								     if (m > 1) {
							 | 
						||
| 
								 | 
							
									  rdftapply cldapply = ((plan_rdft *) ego->cld3)->apply;
							 | 
						||
| 
								 | 
							
									  for (i = 0; i < d; ++i) {
							 | 
						||
| 
								 | 
							
									       cldapply(ego->cld3, I + i*num_el, buf);
							 | 
						||
| 
								 | 
							
									       memcpy(I + i*num_el, buf, num_el*sizeof(R));
							 | 
						||
| 
								 | 
							
									  }
							 | 
						||
| 
								 | 
							
								     }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     X(ifree)(buf);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static int applicable_gcd(const problem_rdft *p, planner *plnr,
							 | 
						||
| 
								 | 
							
											  int dim0, int dim1, int dim2, INT *nbuf)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     INT n = p->vecsz->dims[dim0].n;
							 | 
						||
| 
								 | 
							
								     INT m = p->vecsz->dims[dim1].n;
							 | 
						||
| 
								 | 
							
								     INT d, vl, vs;
							 | 
						||
| 
								 | 
							
								     get_transpose_vec(p, dim2, &vl, &vs);
							 | 
						||
| 
								 | 
							
								     d = gcd(n, m);
							 | 
						||
| 
								 | 
							
								     *nbuf = n * (m / d) * vl;
							 | 
						||
| 
								 | 
							
								     return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts */
							 | 
						||
| 
								 | 
							
									     && n != m
							 | 
						||
| 
								 | 
							
									     && d > 1
							 | 
						||
| 
								 | 
							
									     && Ntuple_transposable(p->vecsz->dims + dim0,
							 | 
						||
| 
								 | 
							
												    p->vecsz->dims + dim1,
							 | 
						||
| 
								 | 
							
												    vl, vs));
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static int mkcldrn_gcd(const problem_rdft *p, planner *plnr, P *ego)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     INT n = ego->nd, m = ego->md, d = ego->d;
							 | 
						||
| 
								 | 
							
								     INT vl = ego->vl;
							 | 
						||
| 
								 | 
							
								     R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
							 | 
						||
| 
								 | 
							
								     INT num_el = n*m*d*vl;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     if (n > 1) {
							 | 
						||
| 
								 | 
							
									  ego->cld1 = X(mkplan_d)(plnr,
							 | 
						||
| 
								 | 
							
												  X(mkproblem_rdft_0_d)(
							 | 
						||
| 
								 | 
							
												       X(mktensor_3d)(n, d*m*vl, m*vl,
							 | 
						||
| 
								 | 
							
														      d, m*vl, n*m*vl,
							 | 
						||
| 
								 | 
							
														      m*vl, 1, 1),
							 | 
						||
| 
								 | 
							
												       TAINT(p->I, num_el), buf));
							 | 
						||
| 
								 | 
							
									  if (!ego->cld1)
							 | 
						||
| 
								 | 
							
									       goto nada;
							 | 
						||
| 
								 | 
							
									  X(ops_madd)(d, &ego->cld1->ops, &ego->super.super.ops,
							 | 
						||
| 
								 | 
							
										      &ego->super.super.ops);
							 | 
						||
| 
								 | 
							
									  ego->super.super.ops.other += num_el * d * 2;
							 | 
						||
| 
								 | 
							
								     }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     ego->cld2 = X(mkplan_d)(plnr,
							 | 
						||
| 
								 | 
							
											     X(mkproblem_rdft_0_d)(
							 | 
						||
| 
								 | 
							
												  X(mktensor_3d)(d, d*n*m*vl, n*m*vl,
							 | 
						||
| 
								 | 
							
														 d, n*m*vl, d*n*m*vl,
							 | 
						||
| 
								 | 
							
														 n*m*vl, 1, 1),
							 | 
						||
| 
								 | 
							
												  p->I, p->I));
							 | 
						||
| 
								 | 
							
								     if (!ego->cld2)
							 | 
						||
| 
								 | 
							
									  goto nada;
							 | 
						||
| 
								 | 
							
								     X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     if (m > 1) {
							 | 
						||
| 
								 | 
							
									  ego->cld3 = X(mkplan_d)(plnr,
							 | 
						||
| 
								 | 
							
												  X(mkproblem_rdft_0_d)(
							 | 
						||
| 
								 | 
							
												       X(mktensor_3d)(d*n, m*vl, vl,
							 | 
						||
| 
								 | 
							
														      m, vl, d*n*vl,
							 | 
						||
| 
								 | 
							
														      vl, 1, 1),
							 | 
						||
| 
								 | 
							
												       TAINT(p->I, num_el), buf));
							 | 
						||
| 
								 | 
							
									  if (!ego->cld3)
							 | 
						||
| 
								 | 
							
									       goto nada;
							 | 
						||
| 
								 | 
							
									  X(ops_madd2)(d, &ego->cld3->ops, &ego->super.super.ops);
							 | 
						||
| 
								 | 
							
									  ego->super.super.ops.other += num_el * d * 2;
							 | 
						||
| 
								 | 
							
								     }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     X(ifree)(buf);
							 | 
						||
| 
								 | 
							
								     return 1;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								 nada:
							 | 
						||
| 
								 | 
							
								     X(ifree)(buf);
							 | 
						||
| 
								 | 
							
								     return 0;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static const transpose_adt adt_gcd =
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     apply_gcd, applicable_gcd, mkcldrn_gcd,
							 | 
						||
| 
								 | 
							
								     "rdft-transpose-gcd"
							 | 
						||
| 
								 | 
							
								};
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/*************************************************************************/
							 | 
						||
| 
								 | 
							
								/* Cache-oblivious in-place transpose of non-square n x m matrices,
							 | 
						||
| 
								 | 
							
								   based on transposing a sub-matrix first and then transposing the
							 | 
						||
| 
								 | 
							
								   remainder(s) with the help of a buffer.  See also transpose-gcd,
							 | 
						||
| 
								 | 
							
								   above, if gcd(n,m) is large.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								   This algorithm is related to algorithm V3 from Murray Dow,
							 | 
						||
| 
								 | 
							
								   "Transposing a matrix on a vector computer," Parallel Computing 21
							 | 
						||
| 
								 | 
							
								   (12), 1997-2005 (1995), with the modifications that we use
							 | 
						||
| 
								 | 
							
								   cache-oblivious recursive transpose subroutines and we have the
							 | 
						||
| 
								 | 
							
								   generalization for large |n-m| below.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								   The best case, and the one described by Dow, is for |n-m| small, in
							 | 
						||
| 
								 | 
							
								   which case we transpose a square sub-matrix of size min(n,m),
							 | 
						||
| 
								 | 
							
								   handling the remainder via a buffer.  This requires scratch space
							 | 
						||
| 
								 | 
							
								   equal to the size of the matrix times |n-m| / max(n,m).
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								   As a generalization when |n-m| is not small, we also support cutting
							 | 
						||
| 
								 | 
							
								   *both* dimensions to an nc x mc matrix which is *not* necessarily
							 | 
						||
| 
								 | 
							
								   square, but has a large gcd (and can therefore use transpose-gcd).
							 | 
						||
| 
								 | 
							
								*/
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static void apply_cut(const plan *ego_, R *I, R *O)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     const P *ego = (const P *) ego_;
							 | 
						||
| 
								 | 
							
								     INT n = ego->n, m = ego->m, nc = ego->nc, mc = ego->mc, vl = ego->vl;
							 | 
						||
| 
								 | 
							
								     INT i;
							 | 
						||
| 
								 | 
							
								     R *buf1 = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
							 | 
						||
| 
								 | 
							
								     UNUSED(O);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     if (m > mc) {
							 | 
						||
| 
								 | 
							
									  ((plan_rdft *) ego->cld1)->apply(ego->cld1, I + mc*vl, buf1);
							 | 
						||
| 
								 | 
							
									  for (i = 0; i < nc; ++i)
							 | 
						||
| 
								 | 
							
									       memmove(I + (mc*vl) * i, I + (m*vl) * i, sizeof(R) * (mc*vl));
							 | 
						||
| 
								 | 
							
								     }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     ((plan_rdft *) ego->cld2)->apply(ego->cld2, I, I); /* nc x mc transpose */
							 | 
						||
| 
								 | 
							
								     
							 | 
						||
| 
								 | 
							
								     if (n > nc) {
							 | 
						||
| 
								 | 
							
									  R *buf2 = buf1 + (m-mc)*(nc*vl); /* FIXME: force better alignment? */
							 | 
						||
| 
								 | 
							
									  memcpy(buf2, I + nc*(m*vl), (n-nc)*(m*vl)*sizeof(R));
							 | 
						||
| 
								 | 
							
									  for (i = mc-1; i >= 0; --i)
							 | 
						||
| 
								 | 
							
									       memmove(I + (n*vl) * i, I + (nc*vl) * i, sizeof(R) * (n*vl));
							 | 
						||
| 
								 | 
							
									  ((plan_rdft *) ego->cld3)->apply(ego->cld3, buf2, I + nc*vl);
							 | 
						||
| 
								 | 
							
								     }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     if (m > mc) {
							 | 
						||
| 
								 | 
							
									  if (n > nc)
							 | 
						||
| 
								 | 
							
									       for (i = mc; i < m; ++i)
							 | 
						||
| 
								 | 
							
										    memcpy(I + i*(n*vl), buf1 + (i-mc)*(nc*vl),
							 | 
						||
| 
								 | 
							
											   (nc*vl)*sizeof(R));
							 | 
						||
| 
								 | 
							
									  else
							 | 
						||
| 
								 | 
							
									       memcpy(I + mc*(n*vl), buf1, (m-mc)*(n*vl)*sizeof(R));
							 | 
						||
| 
								 | 
							
								     }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     X(ifree)(buf1);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/* only cut one dimension if the resulting buffer is small enough */
							 | 
						||
| 
								 | 
							
								static int cut1(INT n, INT m, INT vl)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     return (X(imax)(n,m) >= X(iabs)(n-m) * MINBUFDIV
							 | 
						||
| 
								 | 
							
									     || X(imin)(n,m) * X(iabs)(n-m) * vl <= MAXBUF);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#define CUT_NSRCH 32 /* range of sizes to search for possible cuts */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static int applicable_cut(const problem_rdft *p, planner *plnr,
							 | 
						||
| 
								 | 
							
											  int dim0, int dim1, int dim2, INT *nbuf)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     INT n = p->vecsz->dims[dim0].n;
							 | 
						||
| 
								 | 
							
								     INT m = p->vecsz->dims[dim1].n;
							 | 
						||
| 
								 | 
							
								     INT vl, vs;
							 | 
						||
| 
								 | 
							
								     get_transpose_vec(p, dim2, &vl, &vs);
							 | 
						||
| 
								 | 
							
								     *nbuf = 0; /* always small enough to be non-UGLY (?) */
							 | 
						||
| 
								 | 
							
								     A(MINBUFDIV <= CUT_NSRCH); /* assumed to avoid inf. loops below */
							 | 
						||
| 
								 | 
							
								     return (!NO_SLOWP(plnr) /* FIXME: not really SLOW for large 1d ffts? */
							 | 
						||
| 
								 | 
							
									     && n != m
							 | 
						||
| 
								 | 
							
									     
							 | 
						||
| 
								 | 
							
									     /* Don't call transpose-cut recursively (avoid inf. loops):
							 | 
						||
| 
								 | 
							
									        the non-square sub-transpose produced when !cut1
							 | 
						||
| 
								 | 
							
									        should always have gcd(n,m) >= min(CUT_NSRCH,n,m),
							 | 
						||
| 
								 | 
							
									        for which transpose-gcd is applicable */
							 | 
						||
| 
								 | 
							
									     && (cut1(n, m, vl)
							 | 
						||
| 
								 | 
							
										 || gcd(n, m) < X(imin)(MINBUFDIV, X(imin)(n,m)))
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
									     && Ntuple_transposable(p->vecsz->dims + dim0,
							 | 
						||
| 
								 | 
							
												    p->vecsz->dims + dim1,
							 | 
						||
| 
								 | 
							
												    vl, vs));
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static int mkcldrn_cut(const problem_rdft *p, planner *plnr, P *ego)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     INT n = ego->n, m = ego->m, nc, mc;
							 | 
						||
| 
								 | 
							
								     INT vl = ego->vl;
							 | 
						||
| 
								 | 
							
								     R *buf;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     /* pick the "best" cut */
							 | 
						||
| 
								 | 
							
								     if (cut1(n, m, vl)) {
							 | 
						||
| 
								 | 
							
									  nc = mc = X(imin)(n,m);
							 | 
						||
| 
								 | 
							
								     }
							 | 
						||
| 
								 | 
							
								     else {
							 | 
						||
| 
								 | 
							
									  INT dc, ns, ms;
							 | 
						||
| 
								 | 
							
									  dc = gcd(m, n); nc = n; mc = m;
							 | 
						||
| 
								 | 
							
									  /* search for cut with largest gcd
							 | 
						||
| 
								 | 
							
									     (TODO: different optimality criteria? different search range?) */
							 | 
						||
| 
								 | 
							
									  for (ms = m; ms > 0 && ms > m - CUT_NSRCH; --ms) {
							 | 
						||
| 
								 | 
							
									       for (ns = n; ns > 0 && ns > n - CUT_NSRCH; --ns) {
							 | 
						||
| 
								 | 
							
										    INT ds = gcd(ms, ns);
							 | 
						||
| 
								 | 
							
										    if (ds > dc) {
							 | 
						||
| 
								 | 
							
											 dc = ds; nc = ns; mc = ms;
							 | 
						||
| 
								 | 
							
											 if (dc == X(imin)(ns, ms))
							 | 
						||
| 
								 | 
							
											      break; /* cannot get larger than this */
							 | 
						||
| 
								 | 
							
										    }
							 | 
						||
| 
								 | 
							
									       }
							 | 
						||
| 
								 | 
							
									       if (dc == X(imin)(n, ms))
							 | 
						||
| 
								 | 
							
										    break; /* cannot get larger than this */
							 | 
						||
| 
								 | 
							
									  }
							 | 
						||
| 
								 | 
							
									  A(dc >= X(imin)(CUT_NSRCH, X(imin)(n, m)));
							 | 
						||
| 
								 | 
							
								     }
							 | 
						||
| 
								 | 
							
								     ego->nc = nc;
							 | 
						||
| 
								 | 
							
								     ego->mc = mc;
							 | 
						||
| 
								 | 
							
								     ego->nbuf = (m-mc)*(nc*vl) + (n-nc)*(m*vl);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     if (m > mc) {
							 | 
						||
| 
								 | 
							
									  ego->cld1 = X(mkplan_d)(plnr,
							 | 
						||
| 
								 | 
							
												  X(mkproblem_rdft_0_d)(
							 | 
						||
| 
								 | 
							
												       X(mktensor_3d)(nc, m*vl, vl,
							 | 
						||
| 
								 | 
							
														      m-mc, vl, nc*vl,
							 | 
						||
| 
								 | 
							
														      vl, 1, 1),
							 | 
						||
| 
								 | 
							
												       p->I + mc*vl, buf));
							 | 
						||
| 
								 | 
							
									  if (!ego->cld1)
							 | 
						||
| 
								 | 
							
									       goto nada;
							 | 
						||
| 
								 | 
							
									  X(ops_add2)(&ego->cld1->ops, &ego->super.super.ops);
							 | 
						||
| 
								 | 
							
								     }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     ego->cld2 = X(mkplan_d)(plnr,
							 | 
						||
| 
								 | 
							
											     X(mkproblem_rdft_0_d)(
							 | 
						||
| 
								 | 
							
												  X(mktensor_3d)(nc, mc*vl, vl,
							 | 
						||
| 
								 | 
							
														 mc, vl, nc*vl,
							 | 
						||
| 
								 | 
							
														 vl, 1, 1),
							 | 
						||
| 
								 | 
							
												  p->I, p->I));
							 | 
						||
| 
								 | 
							
								     if (!ego->cld2)
							 | 
						||
| 
								 | 
							
									  goto nada;
							 | 
						||
| 
								 | 
							
								     X(ops_add2)(&ego->cld2->ops, &ego->super.super.ops);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     if (n > nc) {
							 | 
						||
| 
								 | 
							
									  ego->cld3 = X(mkplan_d)(plnr,
							 | 
						||
| 
								 | 
							
												  X(mkproblem_rdft_0_d)(
							 | 
						||
| 
								 | 
							
												       X(mktensor_3d)(n-nc, m*vl, vl,
							 | 
						||
| 
								 | 
							
														      m, vl, n*vl,
							 | 
						||
| 
								 | 
							
														      vl, 1, 1),
							 | 
						||
| 
								 | 
							
												       buf + (m-mc)*(nc*vl), p->I + nc*vl));
							 | 
						||
| 
								 | 
							
									  if (!ego->cld3)
							 | 
						||
| 
								 | 
							
									       goto nada;
							 | 
						||
| 
								 | 
							
									  X(ops_add2)(&ego->cld3->ops, &ego->super.super.ops);
							 | 
						||
| 
								 | 
							
								     }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     /* memcpy/memmove operations */
							 | 
						||
| 
								 | 
							
								     ego->super.super.ops.other += 2 * vl * (nc*mc * ((m > mc) + (n > nc))
							 | 
						||
| 
								 | 
							
													     + (n-nc)*m + (m-mc)*nc);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     X(ifree)(buf);
							 | 
						||
| 
								 | 
							
								     return 1;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								 nada:
							 | 
						||
| 
								 | 
							
								     X(ifree)(buf);
							 | 
						||
| 
								 | 
							
								     return 0;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static const transpose_adt adt_cut =
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     apply_cut, applicable_cut, mkcldrn_cut,
							 | 
						||
| 
								 | 
							
								     "rdft-transpose-cut"
							 | 
						||
| 
								 | 
							
								};
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/*************************************************************************/
							 | 
						||
| 
								 | 
							
								/* In-place transpose routine from TOMS, which follows the cycles of
							 | 
						||
| 
								 | 
							
								   the permutation so that it writes to each location only once.
							 | 
						||
| 
								 | 
							
								   Because of cache-line and other issues, however, this routine is
							 | 
						||
| 
								 | 
							
								   typically much slower than transpose-gcd or transpose-cut, even
							 | 
						||
| 
								 | 
							
								   though the latter do some extra writes.  On the other hand, if the
							 | 
						||
| 
								 | 
							
								   vector length is large then the TOMS routine is best.
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								   The TOMS routine also has the advantage of requiring less buffer
							 | 
						||
| 
								 | 
							
								   space for the case of gcd(nx,ny) small.  However, in this case it
							 | 
						||
| 
								 | 
							
								   has been superseded by the combination of the generalized
							 | 
						||
| 
								 | 
							
								   transpose-cut method with the transpose-gcd method, which can
							 | 
						||
| 
								 | 
							
								   always transpose with buffers a small fraction of the array size
							 | 
						||
| 
								 | 
							
								   regardless of gcd(nx,ny). */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/*
							 | 
						||
| 
								 | 
							
								 * TOMS Transpose.  Algorithm 513 (Revised version of algorithm 380).
							 | 
						||
| 
								 | 
							
								 * 
							 | 
						||
| 
								 | 
							
								 * These routines do in-place transposes of arrays.
							 | 
						||
| 
								 | 
							
								 * 
							 | 
						||
| 
								 | 
							
								 * [ Cate, E.G. and Twigg, D.W., ACM Transactions on Mathematical Software, 
							 | 
						||
| 
								 | 
							
								 *   vol. 3, no. 1, 104-110 (1977) ]
							 | 
						||
| 
								 | 
							
								 * 
							 | 
						||
| 
								 | 
							
								 * C version by Steven G. Johnson (February 1997).
							 | 
						||
| 
								 | 
							
								 */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/*
							 | 
						||
| 
								 | 
							
								 * "a" is a 1D array of length ny*nx*N which constains the nx x ny
							 | 
						||
| 
								 | 
							
								 * matrix of N-tuples to be transposed.  "a" is stored in row-major
							 | 
						||
| 
								 | 
							
								 * order (last index varies fastest).  move is a 1D array of length
							 | 
						||
| 
								 | 
							
								 * move_size used to store information to speed up the process.  The
							 | 
						||
| 
								 | 
							
								 * value move_size=(ny+nx)/2 is recommended.  buf should be an array
							 | 
						||
| 
								 | 
							
								 * of length 2*N.
							 | 
						||
| 
								 | 
							
								 * 
							 | 
						||
| 
								 | 
							
								 */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static void transpose_toms513(R *a, INT nx, INT ny, INT N,
							 | 
						||
| 
								 | 
							
								                              char *move, INT move_size, R *buf)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     INT i, im, mn;
							 | 
						||
| 
								 | 
							
								     R *b, *c, *d;
							 | 
						||
| 
								 | 
							
								     INT ncount;
							 | 
						||
| 
								 | 
							
								     INT k;
							 | 
						||
| 
								 | 
							
								     
							 | 
						||
| 
								 | 
							
								     /* check arguments and initialize: */
							 | 
						||
| 
								 | 
							
								     A(ny > 0 && nx > 0 && N > 0 && move_size > 0);
							 | 
						||
| 
								 | 
							
								     
							 | 
						||
| 
								 | 
							
								     b = buf;
							 | 
						||
| 
								 | 
							
								     
							 | 
						||
| 
								 | 
							
								     /* Cate & Twigg have a special case for nx == ny, but we don't
							 | 
						||
| 
								 | 
							
									bother, since we already have special code for this case elsewhere. */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     c = buf + N;
							 | 
						||
| 
								 | 
							
								     ncount = 2;		/* always at least 2 fixed points */
							 | 
						||
| 
								 | 
							
								     k = (mn = ny * nx) - 1;
							 | 
						||
| 
								 | 
							
								     
							 | 
						||
| 
								 | 
							
								     for (i = 0; i < move_size; ++i)
							 | 
						||
| 
								 | 
							
									  move[i] = 0;
							 | 
						||
| 
								 | 
							
								     
							 | 
						||
| 
								 | 
							
								     if (ny >= 3 && nx >= 3)
							 | 
						||
| 
								 | 
							
									  ncount += gcd(ny - 1, nx - 1) - 1;	/* # fixed points */
							 | 
						||
| 
								 | 
							
								     
							 | 
						||
| 
								 | 
							
								     i = 1;
							 | 
						||
| 
								 | 
							
								     im = ny;
							 | 
						||
| 
								 | 
							
								     
							 | 
						||
| 
								 | 
							
								     while (1) {
							 | 
						||
| 
								 | 
							
									  INT i1, i2, i1c, i2c;
							 | 
						||
| 
								 | 
							
									  INT kmi;
							 | 
						||
| 
								 | 
							
									  
							 | 
						||
| 
								 | 
							
									  /** Rearrange the elements of a loop
							 | 
						||
| 
								 | 
							
									      and its companion loop: **/
							 | 
						||
| 
								 | 
							
									  
							 | 
						||
| 
								 | 
							
									  i1 = i;
							 | 
						||
| 
								 | 
							
									  kmi = k - i;
							 | 
						||
| 
								 | 
							
									  i1c = kmi;
							 | 
						||
| 
								 | 
							
									  switch (N) {
							 | 
						||
| 
								 | 
							
									      case 1:
							 | 
						||
| 
								 | 
							
										   b[0] = a[i1];
							 | 
						||
| 
								 | 
							
										   c[0] = a[i1c];
							 | 
						||
| 
								 | 
							
										   break;
							 | 
						||
| 
								 | 
							
									      case 2:
							 | 
						||
| 
								 | 
							
										   b[0] = a[2*i1];
							 | 
						||
| 
								 | 
							
										   b[1] = a[2*i1+1];
							 | 
						||
| 
								 | 
							
										   c[0] = a[2*i1c];
							 | 
						||
| 
								 | 
							
										   c[1] = a[2*i1c+1];
							 | 
						||
| 
								 | 
							
										   break;
							 | 
						||
| 
								 | 
							
									      default:
							 | 
						||
| 
								 | 
							
										   memcpy(b, &a[N * i1], N * sizeof(R));
							 | 
						||
| 
								 | 
							
										   memcpy(c, &a[N * i1c], N * sizeof(R));
							 | 
						||
| 
								 | 
							
									  }
							 | 
						||
| 
								 | 
							
									  while (1) {
							 | 
						||
| 
								 | 
							
									       i2 = ny * i1 - k * (i1 / nx);
							 | 
						||
| 
								 | 
							
									       i2c = k - i2;
							 | 
						||
| 
								 | 
							
									       if (i1 < move_size)
							 | 
						||
| 
								 | 
							
										    move[i1] = 1;
							 | 
						||
| 
								 | 
							
									       if (i1c < move_size)
							 | 
						||
| 
								 | 
							
										    move[i1c] = 1;
							 | 
						||
| 
								 | 
							
									       ncount += 2;
							 | 
						||
| 
								 | 
							
									       if (i2 == i)
							 | 
						||
| 
								 | 
							
										    break;
							 | 
						||
| 
								 | 
							
									       if (i2 == kmi) {
							 | 
						||
| 
								 | 
							
										    d = b;
							 | 
						||
| 
								 | 
							
										    b = c;
							 | 
						||
| 
								 | 
							
										    c = d;
							 | 
						||
| 
								 | 
							
										    break;
							 | 
						||
| 
								 | 
							
									       }
							 | 
						||
| 
								 | 
							
									       switch (N) {
							 | 
						||
| 
								 | 
							
										   case 1:
							 | 
						||
| 
								 | 
							
											a[i1] = a[i2];
							 | 
						||
| 
								 | 
							
											a[i1c] = a[i2c];
							 | 
						||
| 
								 | 
							
											break;
							 | 
						||
| 
								 | 
							
										   case 2:
							 | 
						||
| 
								 | 
							
											a[2*i1] = a[2*i2];
							 | 
						||
| 
								 | 
							
											a[2*i1+1] = a[2*i2+1];
							 | 
						||
| 
								 | 
							
											a[2*i1c] = a[2*i2c];
							 | 
						||
| 
								 | 
							
											a[2*i1c+1] = a[2*i2c+1];
							 | 
						||
| 
								 | 
							
											break;
							 | 
						||
| 
								 | 
							
										   default:
							 | 
						||
| 
								 | 
							
											memcpy(&a[N * i1], &a[N * i2], 
							 | 
						||
| 
								 | 
							
											       N * sizeof(R));
							 | 
						||
| 
								 | 
							
											memcpy(&a[N * i1c], &a[N * i2c], 
							 | 
						||
| 
								 | 
							
											       N * sizeof(R));
							 | 
						||
| 
								 | 
							
									       }
							 | 
						||
| 
								 | 
							
									       i1 = i2;
							 | 
						||
| 
								 | 
							
									       i1c = i2c;
							 | 
						||
| 
								 | 
							
									  }
							 | 
						||
| 
								 | 
							
									  switch (N) {
							 | 
						||
| 
								 | 
							
									      case 1:
							 | 
						||
| 
								 | 
							
										   a[i1] = b[0];
							 | 
						||
| 
								 | 
							
										   a[i1c] = c[0];
							 | 
						||
| 
								 | 
							
										   break;
							 | 
						||
| 
								 | 
							
									      case 2:
							 | 
						||
| 
								 | 
							
										   a[2*i1] = b[0];
							 | 
						||
| 
								 | 
							
										   a[2*i1+1] = b[1];
							 | 
						||
| 
								 | 
							
										   a[2*i1c] = c[0];
							 | 
						||
| 
								 | 
							
										   a[2*i1c+1] = c[1];
							 | 
						||
| 
								 | 
							
										   break;
							 | 
						||
| 
								 | 
							
									      default:
							 | 
						||
| 
								 | 
							
										   memcpy(&a[N * i1], b, N * sizeof(R));
							 | 
						||
| 
								 | 
							
										   memcpy(&a[N * i1c], c, N * sizeof(R));
							 | 
						||
| 
								 | 
							
									  }
							 | 
						||
| 
								 | 
							
									  if (ncount >= mn)
							 | 
						||
| 
								 | 
							
									       break;	/* we've moved all elements */
							 | 
						||
| 
								 | 
							
									  
							 | 
						||
| 
								 | 
							
									  /** Search for loops to rearrange: **/
							 | 
						||
| 
								 | 
							
									  
							 | 
						||
| 
								 | 
							
									  while (1) {
							 | 
						||
| 
								 | 
							
									       INT max = k - i;
							 | 
						||
| 
								 | 
							
									       ++i;
							 | 
						||
| 
								 | 
							
									       A(i <= max);
							 | 
						||
| 
								 | 
							
									       im += ny;
							 | 
						||
| 
								 | 
							
									       if (im > k)
							 | 
						||
| 
								 | 
							
										    im -= k;
							 | 
						||
| 
								 | 
							
									       i2 = im;
							 | 
						||
| 
								 | 
							
									       if (i == i2)
							 | 
						||
| 
								 | 
							
										    continue;
							 | 
						||
| 
								 | 
							
									       if (i >= move_size) {
							 | 
						||
| 
								 | 
							
										    while (i2 > i && i2 < max) {
							 | 
						||
| 
								 | 
							
											 i1 = i2;
							 | 
						||
| 
								 | 
							
											 i2 = ny * i1 - k * (i1 / nx);
							 | 
						||
| 
								 | 
							
										    }
							 | 
						||
| 
								 | 
							
										    if (i2 == i)
							 | 
						||
| 
								 | 
							
											 break;
							 | 
						||
| 
								 | 
							
									       } else if (!move[i])
							 | 
						||
| 
								 | 
							
										    break;
							 | 
						||
| 
								 | 
							
									  }
							 | 
						||
| 
								 | 
							
								     }
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static void apply_toms513(const plan *ego_, R *I, R *O)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     const P *ego = (const P *) ego_;
							 | 
						||
| 
								 | 
							
								     INT n = ego->n, m = ego->m;
							 | 
						||
| 
								 | 
							
								     INT vl = ego->vl;
							 | 
						||
| 
								 | 
							
								     R *buf = (R *)MALLOC(sizeof(R) * ego->nbuf, BUFFERS);
							 | 
						||
| 
								 | 
							
								     UNUSED(O);
							 | 
						||
| 
								 | 
							
								     transpose_toms513(I, n, m, vl, (char *) (buf + 2*vl), (n+m)/2, buf);
							 | 
						||
| 
								 | 
							
								     X(ifree)(buf);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static int applicable_toms513(const problem_rdft *p, planner *plnr,
							 | 
						||
| 
								 | 
							
											   int dim0, int dim1, int dim2, INT *nbuf)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     INT n = p->vecsz->dims[dim0].n;
							 | 
						||
| 
								 | 
							
								     INT m = p->vecsz->dims[dim1].n;
							 | 
						||
| 
								 | 
							
								     INT vl, vs;
							 | 
						||
| 
								 | 
							
								     get_transpose_vec(p, dim2, &vl, &vs);
							 | 
						||
| 
								 | 
							
								     *nbuf = 2*vl 
							 | 
						||
| 
								 | 
							
									  + ((n + m) / 2 * sizeof(char) + sizeof(R) - 1) / sizeof(R);
							 | 
						||
| 
								 | 
							
								     return (!NO_SLOWP(plnr)
							 | 
						||
| 
								 | 
							
									     && (vl > 8 || !NO_UGLYP(plnr)) /* UGLY for small vl */
							 | 
						||
| 
								 | 
							
									     && n != m
							 | 
						||
| 
								 | 
							
									     && Ntuple_transposable(p->vecsz->dims + dim0,
							 | 
						||
| 
								 | 
							
												    p->vecsz->dims + dim1,
							 | 
						||
| 
								 | 
							
												    vl, vs));
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static int mkcldrn_toms513(const problem_rdft *p, planner *plnr, P *ego)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     UNUSED(p); UNUSED(plnr);
							 | 
						||
| 
								 | 
							
								     /* heuristic so that TOMS algorithm is last resort for small vl */
							 | 
						||
| 
								 | 
							
								     ego->super.super.ops.other += ego->n * ego->m * 2 * (ego->vl + 30);
							 | 
						||
| 
								 | 
							
								     return 1;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static const transpose_adt adt_toms513 =
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     apply_toms513, applicable_toms513, mkcldrn_toms513,
							 | 
						||
| 
								 | 
							
								     "rdft-transpose-toms513"
							 | 
						||
| 
								 | 
							
								};
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/*-----------------------------------------------------------------------*/
							 | 
						||
| 
								 | 
							
								/*-----------------------------------------------------------------------*/
							 | 
						||
| 
								 | 
							
								/* generic stuff: */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static void awake(plan *ego_, enum wakefulness wakefulness)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     P *ego = (P *) ego_;
							 | 
						||
| 
								 | 
							
								     X(plan_awake)(ego->cld1, wakefulness);
							 | 
						||
| 
								 | 
							
								     X(plan_awake)(ego->cld2, wakefulness);
							 | 
						||
| 
								 | 
							
								     X(plan_awake)(ego->cld3, wakefulness);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static void print(const plan *ego_, printer *p)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     const P *ego = (const P *) ego_;
							 | 
						||
| 
								 | 
							
								     p->print(p, "(%s-%Dx%D%v", ego->slv->adt->nam,
							 | 
						||
| 
								 | 
							
									      ego->n, ego->m, ego->vl);
							 | 
						||
| 
								 | 
							
								     if (ego->cld1) p->print(p, "%(%p%)", ego->cld1);
							 | 
						||
| 
								 | 
							
								     if (ego->cld2) p->print(p, "%(%p%)", ego->cld2);
							 | 
						||
| 
								 | 
							
								     if (ego->cld3) p->print(p, "%(%p%)", ego->cld3);
							 | 
						||
| 
								 | 
							
								     p->print(p, ")");
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static void destroy(plan *ego_)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     P *ego = (P *) ego_;
							 | 
						||
| 
								 | 
							
								     X(plan_destroy_internal)(ego->cld3);
							 | 
						||
| 
								 | 
							
								     X(plan_destroy_internal)(ego->cld2);
							 | 
						||
| 
								 | 
							
								     X(plan_destroy_internal)(ego->cld1);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static plan *mkplan(const solver *ego_, const problem *p_, planner *plnr)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     const S *ego = (const S *) ego_;
							 | 
						||
| 
								 | 
							
								     const problem_rdft *p;
							 | 
						||
| 
								 | 
							
								     int dim0, dim1, dim2;
							 | 
						||
| 
								 | 
							
								     INT nbuf, vs;
							 | 
						||
| 
								 | 
							
								     P *pln;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     static const plan_adt padt = {
							 | 
						||
| 
								 | 
							
									  X(rdft_solve), awake, print, destroy
							 | 
						||
| 
								 | 
							
								     };
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     if (!applicable(ego_, p_, plnr, &dim0, &dim1, &dim2, &nbuf))
							 | 
						||
| 
								 | 
							
								          return (plan *) 0;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     p = (const problem_rdft *) p_;
							 | 
						||
| 
								 | 
							
								     pln = MKPLAN_RDFT(P, &padt, ego->adt->apply);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     pln->n = p->vecsz->dims[dim0].n;
							 | 
						||
| 
								 | 
							
								     pln->m = p->vecsz->dims[dim1].n;
							 | 
						||
| 
								 | 
							
								     get_transpose_vec(p, dim2, &pln->vl, &vs);
							 | 
						||
| 
								 | 
							
								     pln->nbuf = nbuf;
							 | 
						||
| 
								 | 
							
								     pln->d = gcd(pln->n, pln->m);
							 | 
						||
| 
								 | 
							
								     pln->nd = pln->n / pln->d;
							 | 
						||
| 
								 | 
							
								     pln->md = pln->m / pln->d;
							 | 
						||
| 
								 | 
							
								     pln->slv = ego;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     X(ops_zero)(&pln->super.super.ops); /* mkcldrn is responsible for ops */
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     pln->cld1 = pln->cld2 = pln->cld3 = 0;
							 | 
						||
| 
								 | 
							
								     if (!ego->adt->mkcldrn(p, plnr, pln)) {
							 | 
						||
| 
								 | 
							
									  X(plan_destroy_internal)(&(pln->super.super));
							 | 
						||
| 
								 | 
							
									  return 0;
							 | 
						||
| 
								 | 
							
								     }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								     return &(pln->super.super);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								static solver *mksolver(const transpose_adt *adt)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     static const solver_adt sadt = { PROBLEM_RDFT, mkplan, 0 };
							 | 
						||
| 
								 | 
							
								     S *slv = MKSOLVER(S, &sadt);
							 | 
						||
| 
								 | 
							
								     slv->adt = adt;
							 | 
						||
| 
								 | 
							
								     return &(slv->super);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								void X(rdft_vrank3_transpose_register)(planner *p)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								     unsigned i;
							 | 
						||
| 
								 | 
							
								     static const transpose_adt *const adts[] = {
							 | 
						||
| 
								 | 
							
									  &adt_gcd, &adt_cut,
							 | 
						||
| 
								 | 
							
									  &adt_toms513
							 | 
						||
| 
								 | 
							
								     };
							 | 
						||
| 
								 | 
							
								     for (i = 0; i < sizeof(adts) / sizeof(adts[0]); ++i)
							 | 
						||
| 
								 | 
							
								          REGISTER_SOLVER(p, mksolver(adts[i]));
							 | 
						||
| 
								 | 
							
								}
							 |