 ! Listing extracted from: SFEMaNS/html/fft__parallel__obsolete_8f90_source.html

 !
 !Authors:  Katarzyna Boronska, Jean-Luc Guermond, Copyrights 2007
 !
 MODULE sft_parallele_obsolete

   IMPLICIT NONE
   PUBLIC :: fft_par_cross_prod, fft_par_dot_prod, fft_par_allen_cahn, &
        fft_par_prod, fft_par_real, ref, fft_par_cross_prod_dcl, fft_par_dot_prod_dcl, &
        fft_par_prod_dcl, fft_par_compressive_visc_dcl, fft_heaviside_dcl
   PRIVATE

 CONTAINS
   SUBROUTINE fft_par_real(communicator, V1_in, V_out, opt_nb_plane)
     !===Parallel backward Fourier transform (complex-to-real) of a field given by its
     !===cosine and sine coefficients.
     !===
     !===Arguments
     !===  communicator : MPI communicator. The routine calls MPI_ALLTOALL, so every rank of
     !===                 the communicator has to enter it.
     !===  V1_in(1:np, 1:nb_field, 1:m_max_c) : cosine/sine coefficients held by this rank.
     !===                 nb_field must be even: columns 2k-1 and 2k are the cosine and sine
     !===                 parts of component k. np must be a multiple of the number of ranks.
     !===  V_out(1:N_r_pad, 1:nb_field/2, 1:bloc_size) : real output, (re)allocated here,
     !===                 with bloc_size = np/nb_procs and N_r_pad = 2*m_max_pad-1.
     !===  opt_nb_plane : optional. Only honoured when it exceeds 2*m_max-1; the spectrum is
     !===                 then zero-padded to m_max_pad = (opt_nb_plane+1)/2 modes.
     !===
     !===Complex format fed to FFTW: c_0 = a_0 + i*0 and c_n = (a_n - i*b_n)/2.
     USE my_util
     IMPLICIT NONE
     include 'fftw3.f'
     REAL(KIND=8), DIMENSION(:,:,:),              INTENT(IN)    :: V1_in
     REAL(KIND=8), DIMENSION(:,:,:), ALLOCATABLE, INTENT(INOUT) :: V_out
     INTEGER, OPTIONAL,                           INTENT(IN)    :: opt_nb_plane
     INTEGER                                        :: np, bloc_size, nb_field, &
          m_max, m_max_c, nb_procs, MPID, m_max_pad, N_r_pad
     INTEGER(KIND=8)                                :: fftw_plan_multi_c2r
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: cu
     INTEGER :: nb, nf, shiftc, shiftl, jindex, longueur_tranche, i, n, code
     REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:)    :: dist_field, combined_field
     !===FFTW parameters
     INTEGER               :: fft_dim, howmany, istride, ostride, idist, odist
     INTEGER, DIMENSION(1) :: dim, inembed, onembed
     !===MPI_Comm is a case-sensitive preprocessor macro expected from the PETSc include,
     !===and the directive has to start in column 1 for a traditional-mode preprocessor.
#include "petsc/finclude/petsc.h"
     MPI_Comm :: communicator

     CALL MPI_COMM_SIZE(communicator, nb_procs, code)

     np       = SIZE(V1_in,1)    ! Number of points held by this rank
     nb_field = SIZE(V1_in,2)    ! Number of real fields (cosine and sine parts)
     m_max_c  = SIZE(V1_in,3)    ! Number of Fourier modes held by this rank
     m_max    = m_max_c*nb_procs ! Number of Fourier modes over all the ranks
     IF (MOD(nb_field,2)/=0 .OR. m_max_c==0) THEN
        CALL error_petsc('Bug in FFT_PAR_REAL: MOD(nb_field,2)/=0 .OR. m_max_c==0')
     END IF

     !===Bloc_size is the number of points that are handled by one processor
     !===once the Fourier modes are all collected on the processor
     IF (MODULO(np,nb_procs)==0) THEN
        bloc_size = np/nb_procs
     ELSE
        CALL error_petsc('Bug in FFT_PAR_REAL: np is not a multiple of nb_procs')
     END IF

     !===Padding is only applied when more planes than 2*m_max-1 are requested
     m_max_pad = m_max
     IF (PRESENT(opt_nb_plane)) THEN
        IF (opt_nb_plane > 2*m_max-1) m_max_pad = (opt_nb_plane+1)/2
     END IF
     N_r_pad = 2*m_max_pad-1

     ALLOCATE(cu(m_max_pad,nb_field/2, bloc_size))
     ALLOCATE(dist_field(m_max_c,nb_field,np))
     ALLOCATE(combined_field(m_max_c,nb_field,np))

     !===Mode index first, point index last: the slice sent to each rank is contiguous
     DO i = 1, m_max_c
        dist_field(i,:,:) = TRANSPOSE(V1_in(:,:,i))
     END DO

     !===Each rank receives, from every rank, the modes of its own bloc_size points
     longueur_tranche = bloc_size*m_max_c*nb_field
     MPID = MPI_DOUBLE_PRECISION
     CALL MPI_ALLTOALL (dist_field,longueur_tranche, MPID, combined_field, longueur_tranche, &
          MPID, communicator, code)

     !===Padding is done by initialization of cu: cu = 0
     !===This is equivalent to cu(m_max+1:m_max_pad,:,:) = 0.d0
     cu = 0.d0
     DO n = 1, bloc_size
        DO nb = 1, nb_procs
           shiftc = (nb-1)*bloc_size   ! offset of the points received from rank nb
           shiftl = (nb-1)*m_max_c     ! offset of the modes owned by rank nb
           jindex = n + shiftc
           DO nf = 1, nb_field/2
              !===Put real and imaginary parts in a complex: c_n = (a_n-i*b_n)/2
              cu(shiftl+1:shiftl+m_max_c,nf,n) = CMPLX(combined_field(:,2*nf-1,jindex),&
                   -combined_field(:,2*nf,jindex),KIND=8)/2
           END DO
        END DO
     END DO
     !===Mode zero is purely real and is not halved: c_0 = a_0 + i*0
     cu(1,:,:) = 2*CMPLX(REAL(cu(1,:,:),KIND=8),0.d0,KIND=8)

     !===Set the parameters for dfftw.
     !===In this c2r call the "o" variables describe the complex input (m_max_pad entries
     !===per transform) and the "i" variables the real output (N_r_pad entries).
     fft_dim   = 1
     istride   = 1
     ostride   = 1
     idist     = N_r_pad
     inembed(1)= N_r_pad
     dim(1)    = N_r_pad
     odist     = m_max_pad
     onembed(1)= m_max_pad
     howmany   = bloc_size*nb_field/2

     IF (ALLOCATED(V_out)) DEALLOCATE(V_out)
     ALLOCATE(V_out(N_r_pad,nb_field/2,bloc_size))

     CALL dfftw_plan_many_dft_c2r(fftw_plan_multi_c2r, fft_dim, dim, howmany, cu, &
          onembed, ostride, odist, V_out, inembed, istride, idist, FFTW_ESTIMATE)
     CALL dfftw_execute(fftw_plan_multi_c2r)
     !===Release the plan; otherwise one plan is leaked at each call
     CALL dfftw_destroy_plan(fftw_plan_multi_c2r)

     DEALLOCATE(cu, dist_field, combined_field)

   END SUBROUTINE fft_par_real

   SUBROUTINE fft_par_cross_prod_bug(communicator,V1_in, V2_in, V_out, nb_procs, bloc_size, m_max_pad, temps)
     !This a de-aliased version of the code, FEB 4, 2011, JLG
     !
     ! Purpose: cross product of two fields given by cosine/sine Fourier coefficients.
     ! Both fields are gathered with MPI_ALLTOALL, transformed to real values with a c2r FFT,
     ! multiplied (only when SIZE(V1_in,2)==6), transformed back with an r2c FFT, scaled by
     ! 1/N_r_pad and redistributed into V_out in cosine/sine format.
     !
     ! This routine does not appear in the PUBLIC list at the top of the module.
     !
     ! NOTE(review): prod_cu is declared below with leading extent 2*m_max_pad-1, whereas the
     ! r2c plan receives odist = onembed = m_max_pad for that array and the sibling
     ! fft_par_cross_prod_dcl declares it with leading extent m_max_pad. The later accesses
     ! prod_cu(n,:,:) therefore look inconsistent with the layout FFTW is asked to write;
     ! this is presumably the defect the "_bug" suffix refers to -- confirm before any use.
     ! NOTE(review): neither FFTW plan is destroyed in this routine.
     !
     ! Arguments:
     !   communicator : MPI communicator used by the two MPI_ALLTOALL calls
     !   V1_in, V2_in : (1:np, 1:nb_field, 1:m_max_c) cosine/sine coefficients
     !   V_out        : receives the coefficients of the product, written for points 1:np,
     !                  columns 1:nb_field and modes 1:m_max_c
     !   nb_procs     : number of processes assumed by the index arithmetic
     !   bloc_size    : number of points treated by one process once the modes are gathered
     !   m_max_pad    : number of complex modes, padding included, used by the FFTs
     !   temps        : optional timers, reset on entry and then accumulated (see below)
     IMPLICIT NONE
     include 'fftw3.f'
     ! Format: V_1in(1:np,1:6,1:m_max_c)
     ! INPUT ARE COSINE AND SINE COEFFICIENTS
     ! THEY ARE PUT IN COMPLEX FORMAT: c_0 = a_0 + i*0 and c_n = (a_n-i*b_n)/2
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(IN)  :: V1_in, V2_in
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(OUT) :: V_out
     REAL(KIND=8), DIMENSION(:), OPTIONAL, INTENT(INOUT) :: temps
     INTEGER,                         INTENT(IN)  :: nb_procs, bloc_size, m_max_pad

     INTEGER                                      :: np, np_tot,  nb_field, &
          m_max, m_max_c,  MPID,  N_r_pad
     INTEGER(KIND=8)                              :: fftw_plan_multi_c2r, fftw_plan_multi_r2c
     ! cu / ru: complex coefficients and real values of the 2*(nb_field/2) input components
     COMPLEX(KIND=8), DIMENSION(m_max_pad,SIZE(V1_in,2),bloc_size)       :: cu
     ! NOTE(review): leading extent differs from odist = m_max_pad used by the r2c plan
     COMPLEX(KIND=8), DIMENSION(2*m_max_pad-1,SIZE(V1_in,2)/2,bloc_size) :: prod_cu
     REAL(KIND=8),    DIMENSION(2*m_max_pad-1,SIZE(V1_in,2),bloc_size)   :: ru
     REAL(KIND=8),    DIMENSION(2*m_max_pad-1,SIZE(V1_in,2)/2,bloc_size) :: prod_ru
     COMPLEX(KIND=8), DIMENSION(SIZE(V1_in,2)/2,bloc_size)               :: intermediate
     REAL(KIND=8), DIMENSION(SIZE(V1_in,3),2*SIZE(V1_in,2),bloc_size*nb_procs)    :: dist_field, combined_field
     COMPLEX(KIND=8), DIMENSION(SIZE(V1_in,2)/2,bloc_size,SIZE(V1_in,3)*nb_procs) :: combined_prod_cu, dist_prod_cu
     ! out_prod_cu is declared but never referenced in this routine
     COMPLEX(KIND=8), DIMENSION(SIZE(V1_in,3),bloc_size*nb_procs,SIZE(V1_in,2)/2) :: out_prod_cu

     INTEGER :: i_field
     INTEGER ::   nb, nf, shiftc, shiftl, jindex, longueur_tranche, i, n, code
     REAL(KIND=8) :: t

     ! FFTW parameters
     INTEGER   :: fft_dim, howmany, istride, ostride, idist, odist
     INTEGER, DIMENSION(1) :: dim, inembed, onembed
     ! Recall complexes must be rescaled
     ! End FFTW parameters

     !temps(1) = communication time
     !temps(2) = computation time
     !temps(3) = format conversion time

     !EXTERNAL hostnm
     !EXTERNAL gethostname
     ! NOTE(review): the directive below is not in column 1 and the type name is lowercase;
     ! if the PETSc include defines the communicator type as a case-sensitive macro spelled
     ! MPI_Comm, neither line will be processed as intended -- confirm.
 #include "petsc/finclude/petsc.h"
     mpi_comm :: communicator

     IF (PRESENT(temps)) temps = 0.d0

     nb_field= SIZE(v1_in,2)
     m_max_c = SIZE(v1_in,3) ! Number of modes (cosine + sine pairs) held by this process
     m_max = m_max_c*nb_procs! Number of modes over the nb_procs processes
     np_tot = nb_procs*bloc_size
     np = SIZE(v1_in,1)
     n_r_pad=2*m_max_pad-1

     ! An odd number of fields or an empty mode range is treated as a fatal error
     IF (mod(nb_field,2)/=0 .OR. m_max_c==0) THEN
        WRITE(*,*) ' BUG '
        stop
     END IF

     ! Packing all 3 complex components of both v1 and v2 input fields
     ! into dist_field, where the dimension indexing the nodal points varies the least rapidly,
     ! so that after distributing the data to the processes, each one will obtain a part
     ! on nodal points
     ! TRANSPOSE so that the mode index i comes first: it is the dimension the FFT acts on
     t = mpi_wtime()

     DO i = 1, m_max_c
        dist_field(i,1:nb_field,1:np) = transpose(v1_in(:,:,i))
        dist_field(i,nb_field+1:2*nb_field,1:np) = transpose(v2_in(:,:,i))
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     ! Points np+1:np_tot are padding; they are filled with a recognisable sentinel value
     IF (np/=np_tot) dist_field(:,:,np+1:np_tot) = 1.d100

     ! Number of doubles exchanged with each process: bloc_size points, m_max_c modes, 2*nb_field reals
     longueur_tranche=bloc_size*m_max_c*nb_field*2

     t = mpi_wtime()
     mpid=mpi_double_precision
     CALL mpi_alltoall (dist_field,longueur_tranche, mpid, combined_field, longueur_tranche, &
          mpid, communicator, code)
     IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

     t = mpi_wtime()
     !JLG, FEB 4, 2011
     cu = 0.d0
     !JLG, FEB 4, 2011
     DO n = 1, bloc_size
        DO nb = 1, nb_procs
           shiftc = (nb-1)*bloc_size
           shiftl = (nb-1)*m_max_c
           jindex = n + shiftc
           DO nf = 1, nb_field
              ! Put real and imaginary parts in a complex
              ! nf=1,2,3 => V1_in
              ! nf=4,5,6 => V2_in
              ! INPUT ARE COSINE AND SINE COEFFICIENTS
              ! THEY ARE PUT IN COMPLEX FORMAT: c_0 = a_0 + i*0 and c_n = (a_n-i*b_n)/2
              cu(shiftl+1:shiftl+m_max_c,nf,n) = cmplx(combined_field(:,2*nf-1,jindex),&
                   -combined_field(:,2*nf,jindex),kind=8)/2
           END DO
        END DO
     END DO
     ! Mode zero: undo the division by 2 and drop the imaginary part
     cu(1,:,:) = 2*cmplx(REAL(cu(1,:,:),KIND=8),0.d0,KIND=8)
     !JLG, FEB 4, 2011
     !Padding is done by initialization of cu: cu = 0
     !This is equivalent to cu(m_max+1:m_max_pad,:,:) = 0.d0
     !JLG, FEB 4, 2011

     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     ! Set the parameters for dfftw
     ! The "i" variables describe the real arrays, the "o" variables the complex arrays
     fft_dim=1; istride=1; ostride=1;
     !JLG, FEB 4, 2011
 !!$       idist=N_r;   inembed(1)=N_r; DIM(1)=N_r
 !!$       odist=m_max; onembed(1)=m_max
     idist=n_r_pad;   inembed(1)=n_r_pad; dim(1)=n_r_pad
     odist=m_max_pad; onembed(1)=m_max_pad
     !JLG, FEB 4, 2011

     howmany=bloc_size*nb_field

     t = mpi_wtime()
     CALL dfftw_plan_many_dft_c2r(fftw_plan_multi_c2r, fft_dim, dim, howmany, cu, &
          onembed, ostride, odist, ru, inembed, istride, idist, fftw_estimate)
     !write(*,*) ' FFT_PAR_CROSS_PROD: fftw_plan_multi_c2r', fftw_plan_multi_c2r
     CALL dfftw_execute(fftw_plan_multi_c2r)

     ! CROSS PRODUCT
     ! prod_ru is only assigned when nb_field==6; it is transformed below in every case
     IF (nb_field==6) THEN
        prod_ru(:,1,:) = ru(:,2,:)*ru(:,6,:) - ru(:,3,:)*ru(:,5,:)
        prod_ru(:,2,:) = ru(:,3,:)*ru(:,4,:) - ru(:,1,:)*ru(:,6,:)
        prod_ru(:,3,:) = ru(:,1,:)*ru(:,5,:) - ru(:,2,:)*ru(:,4,:)
     END IF
     ! CROSS PRODUCT

     ! The product has half as many components as the two inputs together
     howmany = howmany/2
     CALL dfftw_plan_many_dft_r2c(fftw_plan_multi_r2c, fft_dim, dim, howmany, prod_ru, &
          inembed, istride, idist, prod_cu, onembed, ostride, odist, fftw_estimate)
     !write(*,*) ' FFT_PAR_CROSS_PROD: fftw_plan_multi_r2c', fftw_plan_multi_r2c
     CALL dfftw_execute(fftw_plan_multi_r2c)
     !JLG, FEB 4, 2011
 !!$       prod_cu = prod_cu/N_r !Scaling
     prod_cu = prod_cu/n_r_pad !Scaling
     !JLG, FEB 4, 2011
     IF (PRESENT(temps)) temps(2) = temps(2) + mpi_wtime() -t

     !Now we need to redistribute the Fourier coefficients on each processor

     t = mpi_wtime()
     ! Back to cosine/sine format: mode zero is copied, the other modes become 2*conjg(c_n)
     combined_prod_cu(:,:,1)=prod_cu(1,:,:)
     DO n=2, m_max
        !combined_prod_cu(:,:,n)=prod_cu(n,:,:)
        combined_prod_cu(:,:,n)=2*conjg(prod_cu(n,:,:))
     END DO

     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     t = mpi_wtime()
     ! The complex buffers are exchanged as doubles: nb_field/2 complex values = nb_field doubles
     longueur_tranche=bloc_size*m_max_c*nb_field
     mpid=mpi_double_precision
     CALL mpi_alltoall (combined_prod_cu,longueur_tranche,mpid, dist_prod_cu,longueur_tranche, &
          mpid,communicator,code)
     IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t
     ! dimensions:
     t = mpi_wtime()
     DO i = 1, m_max_c
        DO nb = 1, nb_procs
           shiftc = (nb-1)*bloc_size
           shiftl = (nb-1)*m_max_c
           intermediate = dist_prod_cu(:,:,shiftl+i)
           DO n = 1, bloc_size
              ! Skip the padding points beyond np
              IF (n+shiftc > np ) cycle
              DO i_field = 1, nb_field/2
                 v_out(n+shiftc, i_field*2-1, i) = REAL (intermediate(i_field,n),KIND=8)
                 v_out(n+shiftc, i_field*2 , i)  = aimag(intermediate(i_field,n))
              END DO
           END DO
        END DO
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t


   END SUBROUTINE fft_par_cross_prod_bug

   SUBROUTINE fft_par_cross_prod_dcl(communicator,V1_in, V2_in, V_out, nb_procs, bloc_size, m_max_pad, temps, padding)
     !===De-aliased pseudo-spectral cross product (FEB 4, 2011, JLG):
     !===   V_out = FFT( FFT^(-1)(V1_in) x FFT^(-1)(V2_in) )
     !===
     !===Arguments
     !===  communicator : MPI communicator; the two MPI_ALLTOALL calls require all its ranks
     !===  V1_in, V2_in(1:np, 1:6, 1:m_max_c) : cosine/sine coefficients of the two vectors
     !===                 (columns 2k-1 and 2k are the cosine and sine parts of component k)
     !===  V_out        : coefficients of the cross product, written for points 1:np,
     !===                 columns 1:6 and modes 1:m_max_c
     !===  nb_procs     : number of ranks assumed by the index arithmetic
     !===  bloc_size    : number of points handled by one rank once all the modes are
     !===                 collected; nb_procs*bloc_size may exceed np (padding points)
     !===  m_max_pad    : number of complex modes used by the FFTs, zero padding included;
     !===                 must be at least m_max_c*nb_procs
     !===  temps        : optional timers, reset on entry: temps(1) communication,
     !===                 temps(2) computation, temps(3) change of format
     !===  padding      : optional, not referenced by this routine
     !===
     !===Complex format fed to FFTW: c_0 = a_0 + i*0 and c_n = (a_n - i*b_n)/2.
     USE my_util
     IMPLICIT NONE
     include 'fftw3.f'
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(IN)  :: V1_in, V2_in
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(OUT) :: V_out
     REAL(KIND=8), DIMENSION(:), OPTIONAL, INTENT(INOUT) :: temps
     LOGICAL,                    OPTIONAL, INTENT(IN)    :: padding
     INTEGER, INTENT(IN)                                 :: bloc_size, m_max_pad, nb_procs
     INTEGER          :: np, np_tot, nb_field, m_max, m_max_c, MPID, N_r_pad
     INTEGER(KIND=8)  :: fftw_plan_multi_c2r, fftw_plan_multi_r2c

     !===cu/ru: complex coefficients and real values of the components of V1 and V2
     COMPLEX(KIND=8), DIMENSION(m_max_pad, SIZE(V1_in,2), bloc_size)  :: cu
     REAL(KIND=8), DIMENSION(2*m_max_pad-1,SIZE(V1_in,2),bloc_size)   :: ru
     !===prod_ru/prod_cu: real values and complex coefficients of the product
     COMPLEX(KIND=8), DIMENSION(m_max_pad,SIZE(V1_in,2)/2,bloc_size)  :: prod_cu
     REAL(KIND=8), DIMENSION(2*m_max_pad-1,SIZE(V1_in,2)/2,bloc_size) :: prod_ru
     COMPLEX(KIND=8), DIMENSION(SIZE(V1_in,2)/2,bloc_size)            :: intermediate
     REAL(KIND=8), DIMENSION(SIZE(V1_in,3),2*SIZE(V1_in,2),bloc_size*nb_procs)    :: dist_field, combined_field
     COMPLEX(KIND=8), DIMENSION(SIZE(V1_in,2)/2,bloc_size,SIZE(V1_in,3)*nb_procs) :: combined_prod_cu
     COMPLEX(KIND=8), DIMENSION(SIZE(V1_in,2)/2,bloc_size,SIZE(V1_in,3)*nb_procs) :: dist_prod_cu

     INTEGER :: i_field
     INTEGER :: nb, nf, shiftc, shiftl, jindex, longueur_tranche, i, n, code
     REAL(KIND=8) :: t

     !===FFTW parameters
     INTEGER   :: fft_dim, howmany, istride, ostride, idist, odist
     INTEGER, DIMENSION(1) :: dim, inembed, onembed
     !===MPI_Comm is a case-sensitive preprocessor macro expected from the PETSc include,
     !===and the directive has to start in column 1 for a traditional-mode preprocessor.
#include "petsc/finclude/petsc.h"
     MPI_Comm :: communicator

     IF (PRESENT(temps)) temps = 0.d0

     np       = SIZE(V1_in,1)
     nb_field = SIZE(V1_in,2)    ! Number of real fields (cosine and sine parts)
     m_max_c  = SIZE(V1_in,3)    ! Number of Fourier modes held by this rank
     m_max    = m_max_c*nb_procs ! Number of Fourier modes over all the ranks
     N_r_pad  = 2*m_max_pad-1
     np_tot   = nb_procs*bloc_size

     !===Argument checks (error_petsc is expected not to return).
     !===The cross product below is only defined for 3 complex components, i.e. 6 real
     !===fields; any other count would transform an uninitialized prod_ru.
     IF (m_max_c==0) THEN
        CALL error_petsc('Bug in FFT_PAR_CROSS_PROD_DCL: m_max_c==0')
     END IF
     IF (nb_field/=6) THEN
        CALL error_petsc('Bug in FFT_PAR_CROSS_PROD_DCL: SIZE(V1_in,2)/=6')
     END IF
     !===cu holds m_max_pad modes per point; fewer than m_max would be overrun when filled
     IF (m_max_pad<m_max) THEN
        CALL error_petsc('Bug in FFT_PAR_CROSS_PROD_DCL: m_max_pad < m_max_c*nb_procs')
     END IF

     !===Pack the components of V1 and V2 in dist_field. The mode index comes first (the
     !===FFT acts on it) and the point index last, so that the slice of points sent to
     !===each rank is contiguous.
     t = MPI_WTIME()
     DO i = 1, m_max_c
        dist_field(i,1:nb_field,1:np)            = TRANSPOSE(V1_in(:,:,i))
        dist_field(i,nb_field+1:2*nb_field,1:np) = TRANSPOSE(V2_in(:,:,i))
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + MPI_WTIME() - t

     !===Padding points np+1:np_tot carry a recognisable sentinel; they are skipped on output
     IF (np/=np_tot) dist_field(:,:,np+1:np_tot) = 1.d100

     !===Doubles exchanged with each rank: bloc_size points, m_max_c modes, 2*nb_field reals
     longueur_tranche = bloc_size*m_max_c*nb_field*2

     t = MPI_WTIME()
     MPID = MPI_DOUBLE_PRECISION
     CALL MPI_ALLTOALL (dist_field,longueur_tranche, MPID, combined_field, longueur_tranche, &
          MPID, communicator, code)
     IF (PRESENT(temps)) temps(1) = temps(1) + MPI_WTIME() - t

     t = MPI_WTIME()
     !===Padding is done by initialization of cu: cu = 0
     !===This is equivalent to cu(m_max+1:m_max_pad,:,:) = 0.d0
     cu = 0.d0
     DO n = 1, bloc_size
        DO nb = 1, nb_procs
           shiftc = (nb-1)*bloc_size   ! offset of the points received from rank nb
           shiftl = (nb-1)*m_max_c     ! offset of the modes owned by rank nb
           jindex = n + shiftc
           DO nf = 1, nb_field
              !===Put real and imaginary parts in a complex: c_n = (a_n-i*b_n)/2
              !===nf=1,2,3 => V1_in
              !===nf=4,5,6 => V2_in
              cu(shiftl+1:shiftl+m_max_c,nf,n) = CMPLX(combined_field(:,2*nf-1,jindex),&
                   -combined_field(:,2*nf,jindex),KIND=8)/2
           END DO
        END DO
     END DO
     !===Mode zero is purely real and is not halved: c_0 = a_0 + i*0
     cu(1,:,:) = 2*CMPLX(REAL(cu(1,:,:),KIND=8),0.d0,KIND=8)
     IF (PRESENT(temps)) temps(3) = temps(3) + MPI_WTIME() - t

     !===Set the parameters for dfftw: the "i" variables describe the real arrays
     !===(N_r_pad entries per transform), the "o" variables the complex arrays (m_max_pad).
     fft_dim = 1
     istride = 1
     ostride = 1
     idist   = N_r_pad
     inembed(1) = N_r_pad
     dim(1)  = N_r_pad
     odist   = m_max_pad
     onembed(1) = m_max_pad

     howmany = bloc_size*nb_field

     t = MPI_WTIME()
     CALL dfftw_plan_many_dft_c2r(fftw_plan_multi_c2r, fft_dim, dim, howmany, cu, &
          onembed, ostride, odist, ru, inembed, istride, idist, FFTW_ESTIMATE)
     CALL dfftw_execute(fftw_plan_multi_c2r)
     !===Release the plan; otherwise one plan is leaked at each call
     CALL dfftw_destroy_plan(fftw_plan_multi_c2r)

     !===Cross product in physical space: ru(:,1:3,:) is V1 and ru(:,4:6,:) is V2
     prod_ru(:,1,:) = ru(:,2,:)*ru(:,6,:) - ru(:,3,:)*ru(:,5,:)
     prod_ru(:,2,:) = ru(:,3,:)*ru(:,4,:) - ru(:,1,:)*ru(:,6,:)
     prod_ru(:,3,:) = ru(:,1,:)*ru(:,5,:) - ru(:,2,:)*ru(:,4,:)

     !===The product has half as many components as V1 and V2 together
     howmany = howmany/2
     CALL dfftw_plan_many_dft_r2c(fftw_plan_multi_r2c, fft_dim, dim, howmany, prod_ru, &
          inembed, istride, idist, prod_cu, onembed, ostride, odist, FFTW_ESTIMATE)
     CALL dfftw_execute(fftw_plan_multi_r2c)
     CALL dfftw_destroy_plan(fftw_plan_multi_r2c)
     prod_cu = prod_cu/N_r_pad !Scaling
     IF (PRESENT(temps)) temps(2) = temps(2) + MPI_WTIME() - t

     !===Now we need to redistribute the Fourier coefficients on each processor.
     !===Back to cosine/sine format: mode zero is copied, the other modes become
     !===2*conjg(c_n) = a_n + i*b_n. The padded modes m_max+1:m_max_pad are dropped.
     t = MPI_WTIME()
     combined_prod_cu(:,:,1) = prod_cu(1,:,:)
     DO n = 2, m_max
        combined_prod_cu(:,:,n) = 2*CONJG(prod_cu(n,:,:))
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + MPI_WTIME() - t

     !===The complex buffers are exchanged as doubles: nb_field/2 complex = nb_field doubles
     t = MPI_WTIME()
     longueur_tranche = bloc_size*m_max_c*nb_field
     MPID = MPI_DOUBLE_PRECISION
     CALL MPI_ALLTOALL (combined_prod_cu,longueur_tranche,MPID, dist_prod_cu,longueur_tranche, &
          MPID,communicator,code)
     IF (PRESENT(temps)) temps(1) = temps(1) + MPI_WTIME() - t

     !===Unpack: real part -> cosine column, imaginary part -> sine column
     t = MPI_WTIME()
     DO i = 1, m_max_c
        DO nb = 1, nb_procs
           shiftc = (nb-1)*bloc_size
           shiftl = (nb-1)*m_max_c
           intermediate = dist_prod_cu(:,:,shiftl+i)
           DO n = 1, bloc_size
              IF (n+shiftc > np) CYCLE   ! padding point, nothing to store
              DO i_field = 1, nb_field/2
                 V_out(n+shiftc, i_field*2-1, i) = REAL (intermediate(i_field,n),KIND=8)
                 V_out(n+shiftc, i_field*2  , i) = AIMAG(intermediate(i_field,n))
              END DO
           END DO
        END DO
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + MPI_WTIME() - t

   END SUBROUTINE fft_par_cross_prod_dcl

   SUBROUTINE fft_par_compressive_visc_dcl(communicator,V1_in, V2_in, V_out, pb, nb_procs, &
                                   bloc_size, m_max_pad,l_g, opt_norm_out, opt_m_vel, opt_norm, temps, padding)
     !===De-aliased pseudo-spectral evaluation of a nonlinear term built from V1_in and
     !===V2_in (FEB 4, 2011, JLG). With ru the real values of the components of V1 (1:3)
     !===followed by those of V2:
     !===  pb = 1 : V_out = FFT( (opt_M_vel - N)*ru(1:3) ), where N is the norm of ru(4:6)
     !===           replaced by its maximum over groups of l_G consecutive points and over
     !===           the two neighbouring planes. Requires SIZE(V1_in,2) = SIZE(V2_in,2) = 6
     !===           and opt_M_vel; N is returned in opt_norm_out when that argument is present.
     !===  pb = 2 : V_out = FFT( opt_norm*ru(4)*(1-ru(4))*ru(1:3)/(|ru(1:3)| + 1.d-14) ).
     !===           Requires SIZE(V1_in,2) = 6, SIZE(V2_in,2) = 2 and opt_norm.
     !===
     !===Arguments
     !===  communicator : MPI communicator; the two MPI_ALLTOALL calls require all its ranks
     !===  V1_in, V2_in : cosine/sine coefficients, (1:np, 1:nb_field, 1:m_max_c)
     !===  V_out        : result, written for points 1:np, columns 1:6 and modes 1:m_max_c
     !===  nb_procs     : number of ranks assumed by the index arithmetic
     !===  bloc_size    : number of points handled by one rank once the modes are collected
     !===  m_max_pad    : number of complex modes used by the FFTs, zero padding included;
     !===                 must be at least m_max_c*nb_procs
     !===  l_G          : size of the groups of points used by the maximum when pb = 1
     !===  opt_norm_out, opt_norm : arrays of shape (2*m_max_pad-1, bloc_size)
     !===  temps        : optional timers, reset on entry: temps(1) communication,
     !===                 temps(2) computation, temps(3) change of format
     !===  padding      : optional, not referenced by this routine
     !===
     !===Complex format fed to FFTW: c_0 = a_0 + i*0 and c_n = (a_n - i*b_n)/2.
     USE my_util
     IMPLICIT NONE
     include 'fftw3.f'
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(IN)  :: V1_in, V2_in
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(OUT) :: V_out
     REAL(KIND=8), DIMENSION(:,:), OPTIONAL, INTENT(OUT) :: opt_norm_out
     REAL(KIND=8), DIMENSION(:,:), OPTIONAL, INTENT(IN)  :: opt_norm
     REAL(KIND=8), DIMENSION(:), OPTIONAL, INTENT(INOUT) :: temps
     LOGICAL,                    OPTIONAL, INTENT(IN)    :: padding
     INTEGER, INTENT(IN)                                 :: bloc_size, m_max_pad, nb_procs
     REAL(KIND=8),               OPTIONAL, INTENT(IN)    :: opt_M_vel
     INTEGER,                              INTENT(IN)    :: l_G
     INTEGER, INTENT(IN)                                 :: pb
     INTEGER          :: np, np_tot, nb_field1, nb_field2, m_max, m_max_c, MPID, N_r_pad
     INTEGER(KIND=8)  :: fftw_plan_multi_c2r, fftw_plan_multi_r2c

     COMPLEX(KIND=8), DIMENSION(m_max_pad, (SIZE(V1_in,2)+SIZE(V2_in,2))/2, bloc_size)  :: cu
     REAL(KIND=8), DIMENSION(2*m_max_pad-1,(SIZE(V1_in,2)+SIZE(V2_in,2))/2,bloc_size)   :: ru
     COMPLEX(KIND=8), DIMENSION(m_max_pad,SIZE(V1_in,2)/2,bloc_size)  :: prod_cu
     REAL(KIND=8), DIMENSION(2*m_max_pad-1,SIZE(V1_in,2)/2,bloc_size) :: prod_ru
     COMPLEX(KIND=8), DIMENSION(SIZE(V1_in,2)/2, bloc_size)           :: intermediate
     REAL(KIND=8), DIMENSION(2*m_max_pad-1, bloc_size)                :: norm_grad_phi
     REAL(KIND=8), DIMENSION(2*m_max_pad-1, bloc_size)                :: norm_vel
     REAL(KIND=8), DIMENSION(2*m_max_pad-1, bloc_size/l_G)            :: norm_vel_int

     REAL(KIND=8), DIMENSION(SIZE(V1_in,3),SIZE(V1_in,2)+SIZE(V2_in,2),bloc_size*nb_procs) :: dist_field, combined_field
     COMPLEX(KIND=8), DIMENSION(SIZE(V1_in,2)/2,bloc_size,SIZE(V1_in,3)*nb_procs) :: combined_prod_cu
     COMPLEX(KIND=8), DIMENSION(SIZE(V1_in,2)/2,bloc_size,SIZE(V1_in,3)*nb_procs) :: dist_prod_cu

     INTEGER :: i_field
     INTEGER :: nb, nf, shiftc, shiftl, jindex, longueur_tranche, i, n, code, l
     INTEGER :: i_prev, i_next, lo, hi
     REAL(KIND=8) :: t

     !===FFTW parameters
     INTEGER   :: fft_dim, howmany, istride, ostride, idist, odist
     INTEGER, DIMENSION(1) :: dim, inembed, onembed
     !===MPI_Comm is a case-sensitive preprocessor macro expected from the PETSc include,
     !===and the directive has to start in column 1 for a traditional-mode preprocessor.
#include "petsc/finclude/petsc.h"
     MPI_Comm :: communicator

     IF (PRESENT(temps)) temps = 0.d0

     np        = SIZE(V1_in,1)
     nb_field1 = SIZE(V1_in,2)    ! Number of real fields of V1
     nb_field2 = SIZE(V2_in,2)    ! Number of real fields of V2
     m_max_c   = SIZE(V1_in,3)    ! Number of Fourier modes held by this rank
     m_max     = m_max_c*nb_procs ! Number of Fourier modes over all the ranks
     N_r_pad   = 2*m_max_pad-1
     np_tot    = nb_procs*bloc_size

     !===Argument checks, done before any communication (error_petsc is expected not to
     !===return). Each problem type needs specific field counts and optional arguments;
     !===without them prod_ru would be transformed while still undefined.
     IF (m_max_c==0) THEN
        CALL error_petsc('Bug in FFT_PAR_COMPRESSIVE_VISC_DCL: m_max_c==0')
     END IF
     !===cu holds m_max_pad modes per point; fewer than m_max would be overrun when filled
     IF (m_max_pad<m_max) THEN
        CALL error_petsc('Bug in FFT_PAR_COMPRESSIVE_VISC_DCL: m_max_pad < m_max_c*nb_procs')
     END IF
     SELECT CASE (pb)
     CASE (1)
        IF (nb_field1/=6 .OR. nb_field2/=6) THEN
           CALL error_petsc('Bug in FFT_PAR_COMPRESSIVE_VISC_DCL: pb=1 needs 6 fields in V1_in and V2_in')
        END IF
        IF (.NOT.PRESENT(opt_M_vel)) THEN
           CALL error_petsc('Bug in FFT_PAR_COMPRESSIVE_VISC_DCL: pb=1 needs opt_M_vel')
        END IF
     CASE (2)
        IF (nb_field1/=6 .OR. nb_field2/=2) THEN
           CALL error_petsc('Bug in FFT_PAR_COMPRESSIVE_VISC_DCL: pb=2 needs 6 fields in V1_in, 2 in V2_in')
        END IF
        IF (.NOT.PRESENT(opt_norm)) THEN
           CALL error_petsc('Bug in FFT_PAR_COMPRESSIVE_VISC_DCL: pb=2 needs opt_norm')
        END IF
     CASE DEFAULT
        CALL error_petsc('error in problem type while calling FFT_PAR_COMPRESSIVE_VISC_DCL ')
     END SELECT

     !===Pack the components of V1 and V2 in dist_field. The mode index comes first (the
     !===FFT acts on it) and the point index last, so that the slice of points sent to
     !===each rank is contiguous.
     t = MPI_WTIME()
     DO i = 1, m_max_c
        dist_field(i,1:nb_field1,1:np)                     = TRANSPOSE(V1_in(:,:,i))
        dist_field(i,nb_field1+1:nb_field1+nb_field2,1:np) = TRANSPOSE(V2_in(:,:,i))
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + MPI_WTIME() - t

     !===Padding points np+1:np_tot carry a recognisable sentinel; they are skipped on output
     IF (np/=np_tot) dist_field(:,:,np+1:np_tot) = 1.d100

     longueur_tranche = bloc_size*m_max_c*(nb_field1+nb_field2)

     t = MPI_WTIME()
     MPID = MPI_DOUBLE_PRECISION
     CALL MPI_ALLTOALL (dist_field,longueur_tranche, MPID, combined_field, longueur_tranche, &
          MPID, communicator, code)
     IF (PRESENT(temps)) temps(1) = temps(1) + MPI_WTIME() - t

     t = MPI_WTIME()
     !===Padding is done by initialization of cu: cu = 0
     !===This is equivalent to cu(m_max+1:m_max_pad,:,:) = 0.d0
     cu = 0.d0
     DO n = 1, bloc_size
        DO nb = 1, nb_procs
           shiftc = (nb-1)*bloc_size   ! offset of the points received from rank nb
           shiftl = (nb-1)*m_max_c     ! offset of the modes owned by rank nb
           jindex = n + shiftc
           DO nf = 1, (nb_field1+nb_field2)/2
              !===Put real and imaginary parts in a complex: c_n = (a_n-i*b_n)/2
              !===nf=1,2,3 => V1_in, nf>=4 => V2_in
              cu(shiftl+1:shiftl+m_max_c,nf,n) = CMPLX(combined_field(:,2*nf-1,jindex),&
                   -combined_field(:,2*nf,jindex),KIND=8)/2
           END DO
        END DO
     END DO
     !===Mode zero is purely real and is not halved: c_0 = a_0 + i*0
     cu(1,:,:) = 2*CMPLX(REAL(cu(1,:,:),KIND=8),0.d0,KIND=8)
     IF (PRESENT(temps)) temps(3) = temps(3) + MPI_WTIME() - t

     !===Set the parameters for dfftw: the "i" variables describe the real arrays
     !===(N_r_pad entries per transform), the "o" variables the complex arrays (m_max_pad).
     fft_dim = 1
     istride = 1
     ostride = 1
     idist   = N_r_pad
     inembed(1) = N_r_pad
     dim(1)  = N_r_pad
     odist   = m_max_pad
     onembed(1) = m_max_pad

     howmany = bloc_size*(nb_field1+nb_field2)/2

     t = MPI_WTIME()
     CALL dfftw_plan_many_dft_c2r(fftw_plan_multi_c2r, fft_dim, dim, howmany, cu, &
          onembed, ostride, odist, ru, inembed, istride, idist, FFTW_ESTIMATE)
     CALL dfftw_execute(fftw_plan_multi_c2r)
     !===Release the plan; otherwise one plan is leaked at each call
     CALL dfftw_destroy_plan(fftw_plan_multi_c2r)

     IF (pb==1) THEN
        !===(Max_vel-norm(velocity))*Grad(phi)
        norm_vel(:,:) = SQRT(ru(:,4,:)**2 + ru(:,5,:)**2 + ru(:,6,:)**2)
        !===Maximum of the norm over each group of l_G consecutive points, plane by plane
        DO l = 1, bloc_size/l_G
           lo = (l-1)*l_G + 1
           hi = l*l_G
           DO i = 1, N_r_pad
              norm_vel_int(i,l) = MAXVAL(norm_vel(i,lo:hi))
           END DO
        END DO
        !===Then maximum over a plane and its two neighbours; the planes are periodic, so
        !===the first and the last ones are neighbours. MODULO also covers N_r_pad = 1.
        !===Points beyond (bloc_size/l_G)*l_G keep their raw norm.
        DO l = 1, bloc_size/l_G
           lo = (l-1)*l_G + 1
           hi = l*l_G
           DO i = 1, N_r_pad
              i_prev = MODULO(i-2, N_r_pad) + 1
              i_next = MODULO(i,   N_r_pad) + 1
              norm_vel(i,lo:hi) = MAX(norm_vel_int(i_prev,l), norm_vel_int(i,l), norm_vel_int(i_next,l))
           END DO
        END DO
        prod_ru(:,1,:) = (opt_M_vel - norm_vel(:,:))*ru(:,1,:)
        prod_ru(:,2,:) = (opt_M_vel - norm_vel(:,:))*ru(:,2,:)
        prod_ru(:,3,:) = (opt_M_vel - norm_vel(:,:))*ru(:,3,:)
        !===An absent optional argument must not be assigned
        IF (PRESENT(opt_norm_out)) opt_norm_out = norm_vel
     ELSE
        !===pb==2: phi*(1-phi)*GRAD(phi_reg)/norm(GRAD(phi_reg))
        !===The 1.d-14 shift avoids a division by zero where the gradient vanishes
        norm_grad_phi(:,:) = SQRT(ru(:,1,:)**2 + ru(:,2,:)**2 + ru(:,3,:)**2) + 1.d-14
        prod_ru(:,1,:) = opt_norm(:,:)*ru(:,4,:)*(1.d0 - ru(:,4,:))*ru(:,1,:)/norm_grad_phi(:,:)
        prod_ru(:,2,:) = opt_norm(:,:)*ru(:,4,:)*(1.d0 - ru(:,4,:))*ru(:,2,:)/norm_grad_phi(:,:)
        prod_ru(:,3,:) = opt_norm(:,:)*ru(:,4,:)*(1.d0 - ru(:,4,:))*ru(:,3,:)/norm_grad_phi(:,:)
     END IF

     howmany = bloc_size*nb_field1/2

     CALL dfftw_plan_many_dft_r2c(fftw_plan_multi_r2c, fft_dim, dim, howmany, prod_ru, &
          inembed, istride, idist, prod_cu, onembed, ostride, odist, FFTW_ESTIMATE)
     CALL dfftw_execute(fftw_plan_multi_r2c)
     CALL dfftw_destroy_plan(fftw_plan_multi_r2c)
     prod_cu = prod_cu/N_r_pad !Scaling
     IF (PRESENT(temps)) temps(2) = temps(2) + MPI_WTIME() - t

     !===Now we need to redistribute the Fourier coefficients on each processor.
     !===Back to cosine/sine format: mode zero is copied, the other modes become
     !===2*conjg(c_n) = a_n + i*b_n. The padded modes m_max+1:m_max_pad are dropped.
     t = MPI_WTIME()
     combined_prod_cu(:,:,1) = prod_cu(1,:,:)
     DO n = 2, m_max
        combined_prod_cu(:,:,n) = 2*CONJG(prod_cu(n,:,:))
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + MPI_WTIME() - t

     !===The complex buffers are exchanged as doubles: nb_field1/2 complex = nb_field1 doubles
     longueur_tranche = bloc_size*m_max_c*nb_field1

     t = MPI_WTIME()
     MPID = MPI_DOUBLE_PRECISION
     CALL MPI_ALLTOALL (combined_prod_cu,longueur_tranche,MPID, dist_prod_cu,longueur_tranche, &
          MPID,communicator,code)
     IF (PRESENT(temps)) temps(1) = temps(1) + MPI_WTIME() - t

     !===Unpack: real part -> cosine column, imaginary part -> sine column
     t = MPI_WTIME()
     DO i = 1, m_max_c
        DO nb = 1, nb_procs
           shiftc = (nb-1)*bloc_size
           shiftl = (nb-1)*m_max_c
           intermediate = dist_prod_cu(:,:,shiftl+i)
           DO n = 1, bloc_size
              IF (n+shiftc > np) CYCLE   ! padding point, nothing to store
              DO i_field = 1, nb_field1/2
                 V_out(n+shiftc, i_field*2-1, i) = REAL (intermediate(i_field,n),KIND=8)
                 V_out(n+shiftc, i_field*2  , i) = AIMAG(intermediate(i_field,n))
              END DO
           END DO
        END DO
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + MPI_WTIME() - t

   END SUBROUTINE fft_par_compressive_visc_dcl

   SUBROUTINE fft_par_dot_prod_dcl(communicator,V1_in, V2_in, c_out, nb_procs, bloc_size, m_max_pad, temps)
     !FFT (FFT(-1) V1 . FFT(-1) V2) = c_out
     !This a de-aliased version of the code, FEB 4, 2011, JLG
     IMPLICIT NONE
     include 'fftw3.f'
     ! Format: V_1in(1:np,1:6,1:m_max_c)
     ! INPUT ARE COSINE AND SINE COEFFICIENTS
     ! THEY ARE PUT IN COMPLEX FORMAT: c_0 = a_0 + i*0 and c_n = (a_n-i*b_n)/2
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(IN)  :: V1_in, V2_in
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(OUT) :: c_out
     REAL(KIND=8), DIMENSION(:), OPTIONAL, INTENT(INOUT) :: temps
     INTEGER, INTENT(IN)                                 :: nb_procs, bloc_size, m_max_pad
     COMPLEX(KIND=8), DIMENSION(m_max_pad, SIZE(V1_in,2), bloc_size)  :: cu
     REAL(KIND=8), DIMENSION(2*m_max_pad-1,SIZE(V1_in,2),bloc_size)   :: ru
     COMPLEX(KIND=8), DIMENSION(m_max_pad,bloc_size)  :: prod_cu
     REAL(KIND=8), DIMENSION(2*m_max_pad-1,bloc_size) :: prod_ru
     COMPLEX(KIND=8), DIMENSION(bloc_size)            :: intermediate
     REAL(KIND=8), DIMENSION(SIZE(V1_in,3),2*SIZE(V1_in,2),bloc_size*nb_procs)    :: dist_field, combined_field
     COMPLEX(KIND=8), DIMENSION(bloc_size,SIZE(V1_in,3)*nb_procs) :: combined_prod_cu
     COMPLEX(KIND=8), DIMENSION(bloc_size,SIZE(V1_in,3)*nb_procs) :: dist_prod_cu

     INTEGER  :: np, np_tot, nb_field,  m_max, m_max_c, MPID,  N_r_pad
     INTEGER(KIND=8) :: fftw_plan_multi_c2r, fftw_plan_multi_r2c
     INTEGER ::   nb, nf, shiftc, shiftl, jindex, longueur_tranche, i, n, code
     REAL(KIND=8) :: t
     ! FFTW parameters
     INTEGER   :: fft_dim, howmany, istride, ostride, idist, odist
     INTEGER, DIMENSION(1) :: dim, inembed, onembed
     ! Recall complexes must be rescaled
     ! End FFTW parameters
 #include "petsc/finclude/petsc.h"
     mpi_comm :: communicator

     IF (PRESENT(temps)) temps = 0.d0

     np      = SIZE(v1_in,1)
     nb_field= SIZE(v1_in,2) ! Number of fields
     m_max_c = SIZE(v1_in,3) ! Number of complex (cosines + sines) coefficients per point
     m_max = m_max_c*nb_procs! Number of comlex coefficients per point per processor
     np_tot = nb_procs*bloc_size
     n_r_pad=2*m_max_pad-1

     IF (mod(nb_field,2)/=0 .OR. m_max_c==0) THEN
        WRITE(*,*) ' BUG '
        stop
     END IF

     ! Packing all 3 complex components of both v1 and v2 input fields
     ! into dist_field, where the dimension indexing the nodal points varies the least rapidly,
     ! so that after distributing the data to the processes, each one will obtain a part
     ! on nodal points
     ! TRANSPOSE pr que la variable i associee aux modes soit la 1ere sur laquelle on va faire la FFT
     t = mpi_wtime()

     DO i = 1, m_max_c
        dist_field(i,1:nb_field,1:np) = transpose(v1_in(:,:,i))
        dist_field(i,nb_field+1:2*nb_field,1:np) = transpose(v2_in(:,:,i))
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     IF (np/=np_tot) dist_field(:,:,np+1:np_tot) = 1.d100

     longueur_tranche=bloc_size*m_max_c*nb_field*2

     t = mpi_wtime()
     mpid=mpi_double_precision
     CALL mpi_alltoall (dist_field,longueur_tranche, mpid, combined_field, longueur_tranche, &
          mpid, communicator, code)
     IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

     t = mpi_wtime()
     !JLG, FEB 4, 2011
     cu = 0.d0
     !JLG, FEB 4, 2011
     DO n = 1, bloc_size
        DO nb = 1, nb_procs
           shiftc = (nb-1)*bloc_size
           shiftl = (nb-1)*m_max_c
           jindex = n + shiftc
           DO nf = 1, nb_field
              ! Put real and imaginary parts in a complex
              ! nf=1,2,3 => V1_in
              ! nf=4,5,6 => V2_in
              ! INPUT ARE COSINE AND SINE COEFFICIENTS
              ! THEY ARE PUT IN COMPLEX FORMAT: c_0 = a_0 + i*0 and c_n = (a_n-i*b_n)/2
              cu(shiftl+1:shiftl+m_max_c,nf,n) = cmplx(combined_field(:,2*nf-1,jindex),&
                   -combined_field(:,2*nf,jindex),kind=8)/2
           END DO
        END DO
     END DO
     cu(1,:,:) = 2*cmplx(REAL(cu(1,:,:),KIND=8),0.d0,KIND=8)
     !JLG, FEB 4, 2011
     !Padding is done by initialization of cu: cu = 0
     !This is eequivalent to cu(m_max+1:m_max_pad,:,:) = 0.d0
     !JLG, FEB 4, 2011

     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     ! Set the parameters for dfftw
     fft_dim=1; istride=1; ostride=1;
     !JLG, FEB 4, 2011
 !!$       idist=N_r;   inembed(1)=N_r; DIM(1)=N_r
 !!$       odist=m_max; onembed(1)=m_max
     idist=n_r_pad;   inembed(1)=n_r_pad; dim(1)=n_r_pad
     odist=m_max_pad; onembed(1)=m_max_pad
     !JLG, FEB 4, 2011

     howmany=bloc_size*nb_field


     t = mpi_wtime()
     CALL dfftw_plan_many_dft_c2r(fftw_plan_multi_c2r, fft_dim, dim, howmany, cu, &
          onembed, ostride, odist, ru, inembed, istride, idist, fftw_estimate)
     !write(*,*) ' FFT_PAR_DOT_PROD: fftw_plan_multi_c2r', fftw_plan_multi_c2r
     CALL dfftw_execute(fftw_plan_multi_c2r)

     ! DOT PRODDUCT
     IF (nb_field==6) THEN
        prod_ru(:,:) = ru(:,1,:)*ru(:,4,:) + ru(:,2,:)*ru(:,5,:) + ru(:,3,:)*ru(:,6,:)
     END IF
     ! DOT PRODUCT

     howmany = bloc_size*1
     CALL dfftw_plan_many_dft_r2c(fftw_plan_multi_r2c, fft_dim, dim, howmany, prod_ru, &
          inembed, istride, idist, prod_cu, onembed, ostride, odist, fftw_estimate)
     !write(*,*) ' FFT_PAR_DOT_PROD: fftw_plan_multi_r2c', fftw_plan_multi_r2c
     CALL dfftw_execute(fftw_plan_multi_r2c)
     !JLG, FEB 4, 2011
 !!$       prod_cu = prod_cu/N_r !Scaling
     prod_cu = prod_cu/n_r_pad !Scaling
     !JLG, FEB 4, 2011
     IF (PRESENT(temps)) temps(2) = temps(2) + mpi_wtime() -t

     !Now we need to redistribute the Fourier coefficients on each processor
     t = mpi_wtime()
     combined_prod_cu(:,1)=prod_cu(1,:)
     DO n=2, m_max
        combined_prod_cu(:,n)=2*conjg(prod_cu(n,:))
     END DO

     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     t = mpi_wtime()
     longueur_tranche=bloc_size*m_max_c*2
     mpid=mpi_double_precision
     CALL mpi_alltoall (combined_prod_cu,longueur_tranche,mpid, dist_prod_cu,longueur_tranche, &
          mpid,communicator,code)
     IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

     t = mpi_wtime()
     DO i = 1, m_max_c
        DO nb = 1, nb_procs
           shiftc = (nb-1)*bloc_size
           shiftl = (nb-1)*m_max_c
           intermediate = dist_prod_cu(:,shiftl+i)
           DO n = 1, bloc_size
              IF (n+shiftc > np ) cycle
              c_out(n+shiftc, 1, i) = REAL (intermediate(n),KIND=8)
              c_out(n+shiftc, 2 , i)  = aimag(intermediate(n))
           END DO
        END DO
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

   END SUBROUTINE fft_par_dot_prod_dcl

   SUBROUTINE fft_par_prod_dcl(communicator, c1_in, c2_in, c_out, nb_procs, bloc_size, m_max_pad, temps)
     ! Pseudo-spectral product of two scalar fields:
     !    c_out = FFT( FFT^{-1}(c1_in) * FFT^{-1}(c2_in) )
     ! This is the de-aliased version of the code (FEB 4, 2011, JLG): the spectra
     ! are zero-padded up to m_max_pad complex modes before the inverse transform.
     !
     ! Arguments:
     !   communicator : MPI communicator used for the two all-to-all exchanges
     !   c1_in, c2_in : input coefficients, format c_in(1:np,1:2,1:m_max_c);
     !                  column 1 = cosine, column 2 = sine coefficients
     !   c_out        : c_out(1:np,1:2,1:m_max_c) receives the cosine/sine
     !                  coefficients of the product
     !   nb_procs     : number of processes (expected to be the size of communicator)
     !   bloc_size    : number of points handled by one process once all modes
     !                  are gathered; nb_procs*bloc_size must be >= np
     !   m_max_pad    : padded number of complex modes; must be >= m_max_c*nb_procs
     !   temps        : optional timers, reset to zero on entry;
     !                  temps(1) = communication, temps(2) = computation,
     !                  temps(3) = format changes
     !
     ! INPUT ARE COSINE AND SINE COEFFICIENTS
     ! THEY ARE PUT IN COMPLEX FORMAT: c_0 = a_0 + i*0 and c_n = (a_n-i*b_n)/2
     USE my_util
     IMPLICIT NONE
     include 'fftw3.f'
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(IN)  :: c1_in, c2_in
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(OUT) :: c_out
     REAL(KIND=8), DIMENSION(:), OPTIONAL, INTENT(INOUT) :: temps
     INTEGER, INTENT(IN)                                 :: nb_procs, bloc_size, m_max_pad

     ! Work arrays (automatic, sized from the arguments)
     COMPLEX(KIND=8), DIMENSION(m_max_pad, 2, bloc_size)             :: cu
     REAL(KIND=8),    DIMENSION(2*m_max_pad-1, 2, bloc_size)         :: ru
     COMPLEX(KIND=8), DIMENSION(m_max_pad, bloc_size)                :: prod_cu
     REAL(KIND=8),    DIMENSION(2*m_max_pad-1,bloc_size)             :: prod_ru
     REAL(KIND=8),    DIMENSION(SIZE(c1_in,3),4, bloc_size*nb_procs) :: dist_field, combined_field
     COMPLEX(KIND=8), DIMENSION(bloc_size,SIZE(c1_in,3)*nb_procs)    :: dist_prod_cu, combined_prod_cu
     COMPLEX(KIND=8), DIMENSION(bloc_size)                           :: intermediate

     INTEGER   :: np, np_tot, m_max, m_max_c, MPID,  N_r_pad
     INTEGER(KIND=8) :: fftw_plan_multi_c2r, fftw_plan_multi_r2c
     INTEGER ::   nb, nf, shiftc, shiftl, jindex, longueur_tranche, i, n, code
     REAL(KIND=8) :: t
     ! FFTW parameters
     INTEGER   :: fft_dim, howmany, istride, ostride, idist, odist
     INTEGER, DIMENSION(1) :: dim, inembed, onembed
     ! Recall complexes must be rescaled
     ! End FFTW parameters
 #include "petsc/finclude/petsc.h"
     mpi_comm :: communicator

     IF (PRESENT(temps)) temps = 0.d0

     np      = SIZE(c1_in,1)    ! Number of points held by this process
     m_max_c = SIZE(c1_in,3)    ! Number of complex (cosines + sines) coefficients per point
     m_max   = m_max_c*nb_procs ! Total number of complex modes per point (all processes)
     np_tot  = nb_procs*bloc_size
     n_r_pad = 2*m_max_pad-1    ! Length of the (padded) real signal

     IF (m_max_c==0) THEN
        WRITE(*,*) ' BUG '
        stop
     END IF
     ! The automatic work arrays are sized with bloc_size and m_max_pad:
     ! refuse values that would make the packing below write out of bounds.
     IF (np>np_tot .OR. m_max>m_max_pad) THEN
        WRITE(*,*) ' BUG in fft_par_prod_dcl: bloc_size or m_max_pad too small'
        stop
     END IF

     ! Pack the cosine/sine coefficients of c1 (columns 1:2) and c2 (columns 3:4)
     ! into dist_field. The nodal-point index varies the least rapidly, so that
     ! after the all-to-all exchange each process owns a slice of the points.
     ! The transpose puts the mode index i first: it is the dimension along
     ! which the FFT is performed.
     t = mpi_wtime()
     DO i = 1, m_max_c
        dist_field(i,1:2,1:np) = transpose(c1_in(:,:,i))
        dist_field(i,3:4,1:np) = transpose(c2_in(:,:,i))
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     ! Fill the padding points (np+1:np_tot) with a recognizable dummy value
     IF (np/=np_tot) dist_field(:,:,np+1:np_tot) = 1.d100

     longueur_tranche=bloc_size*m_max_c*4

     t = mpi_wtime()
     mpid=mpi_double_precision
     CALL mpi_alltoall (dist_field,longueur_tranche, mpid, combined_field, longueur_tranche, &
          mpid, communicator, code)
     IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

     t = mpi_wtime()
     ! JLG, FEB 4, 2011: padding is done by initializing cu to zero,
     ! which is equivalent to cu(m_max+1:m_max_pad,:,:) = 0.d0
     cu = 0.d0
     DO n = 1, bloc_size
        DO nb = 1, nb_procs
           shiftc = (nb-1)*bloc_size
           shiftl = (nb-1)*m_max_c
           jindex = n + shiftc
           DO nf = 1, 2
              ! Put real and imaginary parts in a complex
              ! nf=1 => c1_in
              ! nf=2 => c2_in
              ! c_n = (a_n-i*b_n)/2
              cu(shiftl+1:shiftl+m_max_c,nf,n) = cmplx(combined_field(:,2*nf-1,jindex),&
                   -combined_field(:,2*nf,jindex),kind=8)/2
           END DO
        END DO
     END DO
     ! Mode zero: c_0 = a_0 + i*0 (undo the division by 2, drop the imaginary part)
     cu(1,:,:) = 2*cmplx(REAL(cu(1,:,:),KIND=8),0.d0,KIND=8)
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     ! Set the parameters for dfftw (padded sizes, JLG, FEB 4, 2011)
     fft_dim=1
     istride=1
     ostride=1
     idist=n_r_pad
     inembed(1)=n_r_pad
     dim(1)=n_r_pad
     odist=m_max_pad
     onembed(1)=m_max_pad

     t = mpi_wtime()
     ! Spectral -> physical space for both fields
     howmany=bloc_size*2
     CALL dfftw_plan_many_dft_c2r(fftw_plan_multi_c2r, fft_dim, dim, howmany, cu, &
          onembed, ostride, odist, ru, inembed, istride, idist, fftw_estimate)
     CALL dfftw_execute(fftw_plan_multi_c2r)

     ! Point-wise product in physical space
     prod_ru(:,:) = ru(:,1,:)*ru(:,2,:)

     ! Physical -> spectral space for the product
     howmany = bloc_size
     CALL dfftw_plan_many_dft_r2c(fftw_plan_multi_r2c, fft_dim, dim, howmany, prod_ru, &
          inembed, istride, idist, prod_cu, onembed, ostride, odist, fftw_estimate)
     CALL dfftw_execute(fftw_plan_multi_r2c)
     prod_cu = prod_cu/n_r_pad ! Scaling (FFTW transforms are unnormalized)

     ! The plans are local to this call: release them, otherwise every call
     ! leaks the memory held by two FFTW plans.
     CALL dfftw_destroy_plan(fftw_plan_multi_c2r)
     CALL dfftw_destroy_plan(fftw_plan_multi_r2c)
     IF (PRESENT(temps)) temps(2) = temps(2) + mpi_wtime() -t

     ! Now we need to redistribute the Fourier coefficients on each processor.
     ! Only the first m_max modes are kept (the padded ones are dropped), and
     ! 2*conjg(c_n) = a_n + i*b_n gives back the cosine/sine coefficients.
     t = mpi_wtime()
     combined_prod_cu(:,1)=prod_cu(1,:)
     DO n=2, m_max
        combined_prod_cu(:,n)=2*conjg(prod_cu(n,:))
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     t = mpi_wtime()
     ! Complex data are exchanged as pairs of double precision reals
     longueur_tranche=bloc_size*m_max_c*2
     mpid=mpi_double_precision
     CALL mpi_alltoall (combined_prod_cu,longueur_tranche,mpid, dist_prod_cu,longueur_tranche, &
          mpid,communicator,code)
     IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

     ! Unpack: real part -> cosine coefficient, imaginary part -> sine coefficient
     t = mpi_wtime()
     DO i = 1, m_max_c
        DO nb = 1, nb_procs
           shiftc = (nb-1)*bloc_size
           shiftl = (nb-1)*m_max_c
           intermediate = dist_prod_cu(:,shiftl+i)
           DO n = 1, bloc_size
              IF (n+shiftc > np ) cycle ! Skip the padding points
              c_out(n+shiftc, 1, i) = REAL (intermediate(n),KIND=8)
              c_out(n+shiftc, 2 , i)  = aimag(intermediate(n))
           END DO
        END DO
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

   END SUBROUTINE fft_par_prod_dcl

   SUBROUTINE fft_par_dot_prod_bis(communicator,V1_in, V2_in, V_out, nb_procs, bloc_size, m_max_pad, temps, padding)
     ! Pseudo-spectral dot product of two vector fields:
     !    V_out = FFT( FFT^{-1}(V1_in) . FFT^{-1}(V2_in) )
     ! This is the de-aliased version of the code (FEB 4, 2011, JLG): the spectra
     ! are zero-padded up to m_max_pad complex modes before the inverse transform.
     !
     ! Arguments:
     !   communicator : MPI communicator used for the two all-to-all exchanges
     !   V1_in, V2_in : input coefficients, format V_in(1:np,1:nb_field,1:m_max_c),
     !                  nb_field even (6 for 3 components); for component k,
     !                  column 2k-1 = cosine and column 2k = sine coefficients
     !   V_out        : V_out(1:np,1:2,1:m_max_c) receives the cosine/sine
     !                  coefficients of the scalar result
     !   nb_procs     : number of processes (expected to be the size of communicator)
     !   bloc_size    : number of points handled by one process once all modes
     !                  are gathered; nb_procs*bloc_size must be >= np
     !   m_max_pad    : padded number of complex modes; must be >= m_max_c*nb_procs
     !   temps        : optional timers, reset to zero on entry;
     !                  temps(1) = communication, temps(2) = computation,
     !                  temps(3) = format changes
     !   padding      : optional, not used by this routine (the padding is
     !                  entirely controlled by m_max_pad)
     !
     ! INPUT ARE COSINE AND SINE COEFFICIENTS
     ! THEY ARE PUT IN COMPLEX FORMAT: c_0 = a_0 + i*0 and c_n = (a_n-i*b_n)/2
     USE my_util
     IMPLICIT NONE
     include 'fftw3.f'
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(IN)  :: V1_in, V2_in
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(OUT) :: V_out
     INTEGER, INTENT(IN)                          :: bloc_size, m_max_pad, nb_procs
     ! Work arrays (automatic, sized from the arguments)
     COMPLEX(KIND=8), DIMENSION(m_max_pad, SIZE(V1_in,2), bloc_size)  :: cu
     REAL(KIND=8), DIMENSION(2*m_max_pad-1,SIZE(V1_in,2),bloc_size)   :: ru
     COMPLEX(KIND=8), DIMENSION(m_max_pad,bloc_size)  :: prod_cu
     REAL(KIND=8), DIMENSION(2*m_max_pad-1,bloc_size) :: prod_ru
     COMPLEX(KIND=8), DIMENSION(bloc_size)            :: intermediate
     REAL(KIND=8), DIMENSION(SIZE(V1_in,3),2*SIZE(V1_in,2),bloc_size*nb_procs)    :: dist_field, combined_field
     COMPLEX(KIND=8), DIMENSION(bloc_size,SIZE(V1_in,3)*nb_procs) :: combined_prod_cu
     COMPLEX(KIND=8), DIMENSION(bloc_size,SIZE(V1_in,3)*nb_procs) :: dist_prod_cu
     REAL(KIND=8), DIMENSION(:), OPTIONAL, INTENT(INOUT) :: temps
     LOGICAL,                    OPTIONAL, INTENT(IN)    :: padding
     INTEGER          :: np, np_tot, nb_field, nb_comp, m_max, m_max_c, MPID, N_r_pad
     INTEGER(KIND=8)  :: fftw_plan_multi_c2r, fftw_plan_multi_r2c
     INTEGER :: nb, nf, shiftc, shiftl, jindex, longueur_tranche, i, n, code
     REAL(KIND=8) :: t

     ! FFTW parameters
     INTEGER   :: fft_dim, howmany, istride, ostride, idist, odist
     INTEGER, DIMENSION(1) :: dim, inembed, onembed
     ! Recall complexes must be rescaled
     ! End FFTW parameters
 #include "petsc/finclude/petsc.h"
     mpi_comm :: communicator

     IF (PRESENT(temps)) temps = 0.d0

     np       = SIZE(V1_in,1)    ! Number of points held by this process
     nb_field = SIZE(V1_in,2)    ! Number of fields (cosine + sine columns)
     nb_comp  = nb_field/2       ! Number of vector components
     m_max_c  = SIZE(V1_in,3)    ! Number of complex (cosines + sines) coefficients per point
     m_max    = m_max_c*nb_procs ! Total number of complex modes per point (all processes)
     n_r_pad  = 2*m_max_pad-1    ! Length of the (padded) real signal
     np_tot   = nb_procs*bloc_size

     IF (mod(nb_field,2)/=0 .OR. m_max_c==0) THEN
        WRITE(*,*) ' BUG '
        stop
     END IF
     ! The automatic work arrays are sized with bloc_size and m_max_pad:
     ! refuse values that would make the packing below write out of bounds.
     IF (np>np_tot .OR. m_max>m_max_pad) THEN
        WRITE(*,*) ' BUG in fft_par_dot_prod_bis: bloc_size or m_max_pad too small'
        stop
     END IF

     ! Bloc_size is the number of points that are handled by one processor
     ! once the Fourier modes are all collected.
     !
     ! Pack all the columns of V1 (1:nb_field) and V2 (nb_field+1:2*nb_field)
     ! into dist_field. The nodal-point index varies the least rapidly, so that
     ! after the all-to-all exchange each process owns a slice of the points.
     ! The transpose puts the mode index i first: it is the dimension along
     ! which the FFT is performed.
     t = mpi_wtime()
     DO i = 1, m_max_c
        dist_field(i,1:nb_field,1:np) = transpose(V1_in(:,:,i))
        dist_field(i,nb_field+1:2*nb_field,1:np) = transpose(V2_in(:,:,i))
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     ! Fill the padding points (np+1:np_tot) with a recognizable dummy value
     IF (np/=np_tot) dist_field(:,:,np+1:np_tot) = 1.d100

     longueur_tranche=bloc_size*m_max_c*nb_field*2

     t = mpi_wtime()
     mpid=mpi_double_precision
     CALL mpi_alltoall (dist_field,longueur_tranche, mpid, combined_field, longueur_tranche, &
          mpid, communicator, code)
     IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

     t = mpi_wtime()
     ! JLG, FEB 4, 2011: padding is done by initializing cu to zero,
     ! which is equivalent to cu(m_max+1:m_max_pad,:,:) = 0.d0
     cu = 0.d0
     DO n = 1, bloc_size
        DO nb = 1, nb_procs
           shiftc = (nb-1)*bloc_size
           shiftl = (nb-1)*m_max_c
           jindex = n + shiftc
           DO nf = 1, nb_field
              ! Put real and imaginary parts in a complex
              ! nf=1,2,3 => V1_in
              ! nf=4,5,6 => V2_in
              ! c_n = (a_n-i*b_n)/2
              cu(shiftl+1:shiftl+m_max_c,nf,n) = cmplx(combined_field(:,2*nf-1,jindex),&
                   -combined_field(:,2*nf,jindex),kind=8)/2
           END DO
        END DO
     END DO
     ! Mode zero: c_0 = a_0 + i*0 (undo the division by 2, drop the imaginary part)
     cu(1,:,:) = 2*cmplx(REAL(cu(1,:,:),KIND=8),0.d0,KIND=8)
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     ! Set the parameters for dfftw (padded sizes, JLG, FEB 4, 2011)
     fft_dim=1
     istride=1
     ostride=1
     idist=n_r_pad
     inembed(1)=n_r_pad
     dim(1)=n_r_pad
     odist=m_max_pad
     onembed(1)=m_max_pad

     t = mpi_wtime()
     ! Spectral -> physical space for all the components of V1 and V2
     howmany=bloc_size*nb_field
     CALL dfftw_plan_many_dft_c2r(fftw_plan_multi_c2r, fft_dim, dim, howmany, cu, &
          onembed, ostride, odist, ru, inembed, istride, idist, fftw_estimate)
     CALL dfftw_execute(fftw_plan_multi_c2r)

     ! DOT PRODUCT in physical space.
     ! ru(:,k,:) is component k of V1 and ru(:,k+nb_comp,:) is component k of V2.
     IF (nb_comp==3) THEN
        prod_ru(:,:) = ru(:,1,:)*ru(:,4,:) + ru(:,2,:)*ru(:,5,:) + ru(:,3,:)*ru(:,6,:)
     ELSE
        ! Any other even nb_field: same sum over components, so that prod_ru
        ! is never transformed while undefined.
        prod_ru = 0.d0
        DO nf = 1, nb_comp
           prod_ru = prod_ru + ru(:,nf,:)*ru(:,nf+nb_comp,:)
        END DO
     END IF

     ! Physical -> spectral space for the scalar product
     howmany = bloc_size
     CALL dfftw_plan_many_dft_r2c(fftw_plan_multi_r2c, fft_dim, dim, howmany, prod_ru, &
          inembed, istride, idist, prod_cu, onembed, ostride, odist, fftw_estimate)
     CALL dfftw_execute(fftw_plan_multi_r2c)
     prod_cu = prod_cu/n_r_pad ! Scaling (FFTW transforms are unnormalized)

     ! The plans are local to this call: release them, otherwise every call
     ! leaks the memory held by two FFTW plans.
     CALL dfftw_destroy_plan(fftw_plan_multi_c2r)
     CALL dfftw_destroy_plan(fftw_plan_multi_r2c)
     IF (PRESENT(temps)) temps(2) = temps(2) + mpi_wtime() -t

     ! Now we need to redistribute the Fourier coefficients on each processor.
     ! Only the first m_max modes are kept (the padded ones are dropped), and
     ! 2*conjg(c_n) = a_n + i*b_n gives back the cosine/sine coefficients.
     t = mpi_wtime()
     combined_prod_cu(:,1)=prod_cu(1,:)
     DO n=2, m_max
        combined_prod_cu(:,n)=2*conjg(prod_cu(n,:))
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     t = mpi_wtime()
     ! Complex data are exchanged as pairs of double precision reals
     longueur_tranche=bloc_size*m_max_c*2
     mpid=mpi_double_precision
     CALL mpi_alltoall (combined_prod_cu,longueur_tranche,mpid, dist_prod_cu,longueur_tranche, &
          mpid,communicator,code)
     IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

     ! Unpack: real part -> cosine coefficient, imaginary part -> sine coefficient
     t = mpi_wtime()
     DO i = 1, m_max_c
        DO nb = 1, nb_procs
           shiftc = (nb-1)*bloc_size
           shiftl = (nb-1)*m_max_c
           intermediate = dist_prod_cu(:,shiftl+i)
           DO n = 1, bloc_size
              IF (n+shiftc > np ) cycle ! Skip the padding points
              V_out(n+shiftc, 1, i) = REAL (intermediate(n),KIND=8)
              V_out(n+shiftc, 2, i) = aimag(intermediate(n))
           END DO
        END DO
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

   END SUBROUTINE fft_par_dot_prod_bis


   SUBROUTINE fft_par_cross_prod(communicator,V1_in, V2_in, V_out, temps, padding)
     ! Pseudo-spectral cross product of two vector fields:
     !    V_out = FFT( FFT^{-1}(V1_in) x FFT^{-1}(V2_in) )
     ! This is the de-aliased version of the code (FEB 4, 2011, JLG).
     ! The points are processed by blocks of np_alloc points; for each block the
     ! modes are gathered with an all-to-all exchange, transformed to physical
     ! space, multiplied, transformed back and redistributed.
     !
     ! Arguments:
     !   communicator : MPI communicator; its size gives the number of processes
     !   V1_in, V2_in : input coefficients, format V_in(1:np,1:6,1:m_max_c);
     !                  for component k, column 2k-1 = cosine and column 2k = sine
     !   V_out        : V_out(1:np,1:nb_field,1:m_max_c) receives the coefficients
     !                  of the product, same column convention as the inputs
     !   temps        : optional timers, reset to zero on entry;
     !                  temps(1) = communication time
     !                  temps(2) = computation time
     !                  temps(3) = format-change time
     !   padding      : optional; if present and .FALSE. no zero padding is used
     !                  (m_max_pad = m_max), otherwise m_max_pad = 3*m_max/2
     !
     ! INPUT ARE COSINE AND SINE COEFFICIENTS
     ! THEY ARE PUT IN COMPLEX FORMAT: c_0 = a_0 + i*0 and c_n = (a_n-i*b_n)/2
     !
     ! NOTE(review): the product itself is only computed when nb_field == 6;
     ! for any other even nb_field prod_ru is transformed as is - confirm that
     ! no caller relies on that.
     IMPLICIT NONE
     include 'fftw3.f'
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(IN)  :: V1_in, V2_in
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(OUT) :: V_out
     REAL(KIND=8), DIMENSION(:), OPTIONAL, INTENT(INOUT) :: temps
     LOGICAL,                    OPTIONAL, INTENT(IN)    :: padding
     ! Saved variables (the FFT work arrays keep their storage between calls;
     ! they are re-allocated at the beginning of every call)
     INTEGER, SAVE   :: np, np_tot, bloc_size, nb_field, &
          m_max, m_max_c, nb_procs, MPID, m_max_pad, N_r_pad
     INTEGER(KIND=8), SAVE :: fftw_plan_multi_c2r, fftw_plan_multi_r2c
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:,:,:), SAVE :: cu, prod_cu
     REAL(KIND=8),    ALLOCATABLE, DIMENSION(:,:,:), SAVE :: ru, prod_ru
     ! End saved variables

     ! Target number of reals per block; np_loc = n_cache/(12*m_max_c) is the
     ! first guess for the block length (presumably a cache-size heuristic).
     INTEGER, PARAMETER :: n_cache = 150000

     INTEGER :: i_field
     INTEGER :: np_glob, np_loc, reste, np_alloc, nn, nb_bloc, n_sup, n_inf, n_up
     INTEGER ::   nb, nf, shiftc, shiftl, jindex, longueur_tranche, i, n, code
     REAL(KIND=8) :: t
     REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: dist_field, combined_field
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: combined_prod_cu, dist_prod_cu

     !Vectors to speed up the format changes
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:,:)  :: intermediate

     ! FFTW parameters
     INTEGER   :: fft_dim, howmany, istride, ostride, idist, odist
     INTEGER, DIMENSION(1) :: dim, inembed, onembed
     ! Recall complexes must be rescaled
     ! End FFTW parameters
 #include "petsc/finclude/petsc.h"
     mpi_comm :: communicator

     CALL mpi_comm_size(communicator,nb_procs,code)

     IF (PRESENT(temps)) temps = 0.d0

     np_glob  = SIZE(V1_in,1)    ! Number of points held by this process
     nb_field = SIZE(V1_in,2)    ! Number of fields
     m_max_c  = SIZE(V1_in,3)    ! Number of complex (cosines + sines) coefficients per point
     m_max    = m_max_c*nb_procs ! Total number of complex modes per point (all processes)

     ! Sanity check, done before m_max_c is used as a divisor below
     IF (mod(nb_field,2)/=0 .OR. m_max_c==0) THEN
        WRITE(*,*) ' BUG '
        stop
     END IF

     ! Split the np_glob points into nb_bloc blocks of np_alloc points.
     ! np_loc is kept >= 1 so that it can safely be used as a divisor.
     np_loc  = max(n_cache/(12*m_max_c),1)
     nb_bloc = max(np_glob/np_loc,1)
     DO
        np_loc   = np_glob/nb_bloc
        reste    = np_glob - np_loc*nb_bloc
        np_alloc = np_loc + reste
        reste    = np_alloc*nb_bloc - np_glob
        IF (reste<=np_alloc) EXIT
        ! The last block would be empty: use one block less
        nb_bloc = nb_bloc - 1
     END DO
     ! nb_bloc  = number of blocks the meridian plane is cut into
     ! np_alloc = number of points in one block

     ! Bloc_size is the number of points that are handled by one processor
     ! once the Fourier modes are all collected
     np = np_alloc
     IF (modulo(np,nb_procs)==0) THEN
        bloc_size = np/nb_procs
     ELSE
        bloc_size = np/nb_procs + 1
     END IF
     np_tot = nb_procs*bloc_size

     ! JLG, FEB 4, 2011: de-aliasing, only ru, cu, prod_ru, prod_cu are padded
     m_max_pad = 3*m_max/2
     IF (PRESENT(padding)) THEN
        IF (.NOT.padding) THEN
           WRITE(*,*) ' NO PADDING '
           m_max_pad = m_max
        END IF
     END IF
     n_r_pad=2*m_max_pad-1

     IF (ALLOCATED(ru)) DEALLOCATE(ru,cu,prod_ru,prod_cu)
     ALLOCATE(cu(m_max_pad,nb_field,bloc_size))
     ALLOCATE(ru(n_r_pad,  nb_field,bloc_size))
     ALLOCATE(prod_cu(m_max_pad,nb_field/2,bloc_size))
     ALLOCATE(prod_ru(n_r_pad,  nb_field/2,bloc_size))
     ALLOCATE(intermediate(nb_field/2,bloc_size))
     ALLOCATE(    dist_field(m_max_c,2*nb_field,np_tot))
     ALLOCATE(combined_field(m_max_c,2*nb_field,np_tot))
     ALLOCATE(dist_prod_cu(nb_field/2,bloc_size,m_max))
     ALLOCATE(combined_prod_cu(nb_field/2,bloc_size,m_max))

     ! Fill the padding points (np+1:np_tot) with a recognizable dummy value.
     ! These entries are never overwritten by the packing below.
     IF (np/=np_tot) dist_field(:,:,np+1:np_tot) = 1.d100

     ! Set the parameters for dfftw (padded sizes, JLG, FEB 4, 2011)
     fft_dim=1
     istride=1
     ostride=1
     idist=n_r_pad
     inembed(1)=n_r_pad
     dim(1)=n_r_pad
     odist=m_max_pad
     onembed(1)=m_max_pad
     mpid=mpi_double_precision

     ! Build the two plans once per call; they are reused for every block.
     ! With fftw_estimate the planner does not overwrite the arrays.
     t = mpi_wtime()
     howmany=bloc_size*nb_field
     CALL dfftw_plan_many_dft_c2r(fftw_plan_multi_c2r, fft_dim, dim, howmany, cu, &
          onembed, ostride, odist, ru, inembed, istride, idist, fftw_estimate)
     howmany = howmany/2
     CALL dfftw_plan_many_dft_r2c(fftw_plan_multi_r2c, fft_dim, dim, howmany, prod_ru, &
          inembed, istride, idist, prod_cu, onembed, ostride, odist, fftw_estimate)
     IF (PRESENT(temps)) temps(2) = temps(2) + mpi_wtime() -t

     n_sup = 0
     DO nn= 1, nb_bloc
        ! Pack all the columns of V1 and V2 for the points n_inf:n_sup of this
        ! block into dist_field. The nodal-point index varies the least rapidly,
        ! so that after the all-to-all exchange each process owns a slice of
        ! the points. The transpose puts the mode index i first: it is the
        ! dimension along which the FFT is performed.
        t = mpi_wtime()
        n_inf = n_sup + 1
        IF (nn == nb_bloc) THEN
           n_up  = np_glob - n_inf + 1 ! The last block takes what is left
        ELSE
           n_up  = np_alloc
        END IF
        n_sup = n_inf + n_up - 1

        DO i = 1, m_max_c
           dist_field(i,1:nb_field,1:n_up) = transpose(V1_in(n_inf:n_sup,:,i))
           dist_field(i,nb_field+1:2*nb_field,1:n_up) = transpose(V2_in(n_inf:n_sup,:,i))
        END DO
        IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

        longueur_tranche=bloc_size*m_max_c*nb_field*2

        t = mpi_wtime()
        CALL mpi_alltoall (dist_field,longueur_tranche, mpid, combined_field, longueur_tranche, &
             mpid, communicator, code)
        IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

        t = mpi_wtime()
        ! JLG, FEB 4, 2011: padding is done by initializing cu to zero,
        ! which is equivalent to cu(m_max+1:m_max_pad,:,:) = 0.d0
        cu = 0.d0
        DO n = 1, bloc_size
           DO nb = 1, nb_procs
              shiftc = (nb-1)*bloc_size
              shiftl = (nb-1)*m_max_c
              jindex = n + shiftc
              DO nf = 1, nb_field
                 ! Put real and imaginary parts in a complex
                 ! nf=1,2,3 => V1_in
                 ! nf=4,5,6 => V2_in
                 ! c_n = (a_n-i*b_n)/2
                 cu(shiftl+1:shiftl+m_max_c,nf,n) = cmplx(combined_field(:,2*nf-1,jindex),&
                      -combined_field(:,2*nf,jindex),kind=8)/2
              END DO
           END DO
        END DO
        ! Mode zero: c_0 = a_0 + i*0 (undo the division by 2, drop the imaginary part)
        cu(1,:,:) = 2*cmplx(REAL(cu(1,:,:),KIND=8),0.d0,KIND=8)
        IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

        t = mpi_wtime()
        ! Spectral -> physical space
        CALL dfftw_execute(fftw_plan_multi_c2r)

        ! CROSS PRODUCT in physical space
        IF (nb_field==6) THEN
           prod_ru(:,1,:) = ru(:,2,:)*ru(:,6,:) - ru(:,3,:)*ru(:,5,:)
           prod_ru(:,2,:) = ru(:,3,:)*ru(:,4,:) - ru(:,1,:)*ru(:,6,:)
           prod_ru(:,3,:) = ru(:,1,:)*ru(:,5,:) - ru(:,2,:)*ru(:,4,:)
        END IF

        ! Physical -> spectral space
        CALL dfftw_execute(fftw_plan_multi_r2c)
        prod_cu = prod_cu/n_r_pad ! Scaling (FFTW transforms are unnormalized)
        IF (PRESENT(temps)) temps(2) = temps(2) + mpi_wtime() -t

        ! Now we need to redistribute the Fourier coefficients on each processor.
        ! Only the first m_max modes are kept (the padded ones are dropped), and
        ! 2*conjg(c_n) = a_n + i*b_n gives back the cosine/sine coefficients.
        t = mpi_wtime()
        combined_prod_cu(:,:,1)=prod_cu(1,:,:)
        DO n=2, m_max
           combined_prod_cu(:,:,n)=2*conjg(prod_cu(n,:,:))
        END DO
        IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

        t = mpi_wtime()
        ! Complex data are exchanged as pairs of double precision reals:
        ! (nb_field/2)*bloc_size*m_max_c complex numbers per process
        longueur_tranche=bloc_size*m_max_c*nb_field
        CALL mpi_alltoall (combined_prod_cu,longueur_tranche,mpid, dist_prod_cu,longueur_tranche, &
             mpid,communicator,code)
        IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

        ! Unpack: real part -> cosine coefficient, imaginary part -> sine coefficient
        t = mpi_wtime()
        DO i = 1, m_max_c
           DO nb = 1, nb_procs
              shiftc = (nb-1)*bloc_size
              shiftl = (nb-1)*m_max_c
              intermediate = dist_prod_cu(:,:,shiftl+i)
              DO n = 1, bloc_size
                 IF (n_inf-1+n+shiftc > np_glob ) cycle ! Beyond the last point
                 DO i_field = 1, nb_field/2
                    V_out(n_inf-1+n+shiftc, i_field*2-1, i) = REAL (intermediate(i_field,n),KIND=8)
                    V_out(n_inf-1+n+shiftc, i_field*2 , i)  = aimag(intermediate(i_field,n))
                 END DO
              END DO
           END DO
        END DO
        IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     END DO

     ! The plans are rebuilt at every call: release them here, otherwise every
     ! call leaks the memory held by two FFTW plans.
     CALL dfftw_destroy_plan(fftw_plan_multi_c2r)
     CALL dfftw_destroy_plan(fftw_plan_multi_r2c)

     DEALLOCATE(dist_field, combined_field, dist_prod_cu, combined_prod_cu, intermediate)

   END SUBROUTINE fft_par_cross_prod

   SUBROUTINE fft_par_dot_prod(communicator,V1_in, V2_in, c_out, temps, padding)
     !FFT (FFT(-1) V1 . FFT(-1) V2) = c_out
     !This a de-aliased version of the code, FEB 4, 2011, JLG
     IMPLICIT NONE
     include 'fftw3.f'
     ! Format: V_1in(1:np,1:6,1:m_max_c)
     ! INPUT ARE COSINE AND SINE COEFFICIENTS
     ! THEY ARE PUT IN COMPLEX FORMAT: c_0 = a_0 + i*0 and c_n = (a_n-i*b_n)/2
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(IN)  :: V1_in, V2_in
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(OUT) :: c_out
     REAL(KIND=8), DIMENSION(:), OPTIONAL, INTENT(INOUT) :: temps
     LOGICAL,                    OPTIONAL, INTENT(IN)    :: padding
     ! Saved variables
     LOGICAL, SAVE   :: once=.true.
     INTEGER, SAVE   :: np_ref, np, np_tot, bloc_size, nb_field, &
          m_max, m_max_c, N_r, rang, nb_procs, MPID, m_max_pad, N_r_pad
     INTEGER(KIND=8), SAVE :: fftw_plan_multi_c2r, fftw_plan_multi_r2c
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:,:,:), SAVE :: cu
     REAL(KIND=8),    ALLOCATABLE, DIMENSION(:,:,:), SAVE :: ru
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:,:), SAVE :: prod_cu
     REAL(KIND=8),    ALLOCATABLE, DIMENSION(:,:), SAVE :: prod_ru
     ! End saved variables

     INTEGER :: np_glob, np_loc, reste, np_alloc, nn, nb_bloc, n_sup, n_inf, n_up, n_cache
     INTEGER ::   nb, nf, shiftc, shiftl, jindex, longueur_tranche, i, n, code
     REAL(KIND=8) :: t
     REAL(KIND=8),  ALLOCATABLE, DIMENSION(:,:,:) :: dist_field, combined_field
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:,:) :: combined_prod_cu, dist_prod_cu, out_prod_cu

     !Vectors to speed up the format changes
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:)  :: intermediate
     !Vectors to speed up the format changes

     ! FFTW parameters
     INTEGER   :: fft_dim, howmany, istride, ostride, idist, odist
     INTEGER, DIMENSION(1) :: dim, inembed, onembed
     ! Recall complexes must be rescaled
     ! End FFTW parameters

     !Temps(1) = Temps de communication
     !Temps(2) = Temps de calcul
     !Temps(3) = Temps de changement de format

     !EXTERNAL hostnm
     !EXTERNAL gethostname
 #include "petsc/finclude/petsc.h"
     mpi_comm :: communicator

     CALL mpi_comm_size(communicator,nb_procs,code)
     CALL mpi_comm_rank(communicator,rang,code)

     IF (PRESENT(temps)) temps = 0.d0

     IF (.NOT.once) THEN
        IF (SIZE(v1_in,1).NE.np_ref) THEN
           once = .true. !Something wrong happened, reinitialize
           np_ref = SIZE(v1_in,1)
        END IF
     END IF

     once = .true.
     n_cache = 150000
     np_glob      = SIZE(v1_in,1)
     m_max_c = SIZE(v1_in,3)
     np_loc = n_cache/(12*m_max_c)
     nb_bloc = max(np_glob/np_loc,1)

 100 np_loc = np_glob/nb_bloc
     reste = np_glob - np_loc*nb_bloc
     np_alloc = np_loc + reste
     reste = np_alloc*nb_bloc - np_glob
     IF (reste>np_alloc) THEN
        nb_bloc = nb_bloc - 1
        GO TO 100
     END IF
     ! nb_bloc = nbre de blocs qui decoupe le plan meridien
     ! np_alloc = nbre de points ds 1 bloc

     n_sup = 0
     DO nn= 1, nb_bloc
        IF (once) THEN
           nb_field= SIZE(v1_in,2) ! Number of fields
           m_max_c = SIZE(v1_in,3) ! Number of complex (cosines + sines) coefficients per point
           m_max = m_max_c*nb_procs! Number of comlex coefficients per point per processor
           n_r=2*m_max-1           ! Number of Real coefficients per point
           IF (mod(nb_field,2)/=0 .OR. m_max_c==0) THEN
              WRITE(*,*) ' BUG '
              stop
           END IF

           ! Bloc_size is the number of points that are handled by one processor
           ! once the Fourier modes are all collected
           ! Computation of bloc_size and np_tot
           np = np_alloc
           IF (modulo(np,nb_procs)==0) THEN
              bloc_size = np/nb_procs
           ELSE
              bloc_size = np/nb_procs + 1
           END IF
           np_tot = nb_procs*bloc_size
           ! fin de la repartition des points


           !JLG, FEB 4, 2011
           !Only ru, cu, prod_ru, prod_cu are modified
           IF (PRESENT(padding)) THEN
              IF (padding) THEN
                 m_max_pad = 3*m_max/2
              ELSE
                 WRITE(*,*) ' NO PADDING '
                 m_max_pad = m_max
              END IF
           ELSE
              m_max_pad = 3*m_max/2
           END IF
           n_r_pad=2*m_max_pad-1

           IF (ALLOCATED(ru)) DEALLOCATE(ru,cu,prod_ru,prod_cu)
           ALLOCATE(cu(m_max_pad,nb_field,bloc_size))
           ALLOCATE(ru(n_r_pad,  nb_field,bloc_size))
           ALLOCATE(prod_cu(m_max_pad,bloc_size))
           ALLOCATE(prod_ru(n_r_pad,  bloc_size))
           !JLG, FEB 4, 2001
           ALLOCATE(intermediate(bloc_size))
           ALLOCATE(    dist_field(m_max_c,2*nb_field,np_tot))
           ALLOCATE(combined_field(m_max_c,2*nb_field,np_tot))
           ALLOCATE(dist_prod_cu(bloc_size,m_max))
           ALLOCATE(combined_prod_cu(bloc_size,m_max))
           ALLOCATE(out_prod_cu(m_max_c,np_tot))
        END IF

        ! Packing all 3 complex components of both v1 and v2 input fields
        ! into dist_field, where the dimension indexing the nodal points varies the least rapidly,
        ! so that after distributing the data to the processes, each one will obtain a part
        ! on nodal points
        ! TRANSPOSE pr que la variable i associee aux modes soit la 1ere sur laquelle on va faire la FFT
        t = mpi_wtime()
        n_inf = n_sup + 1
        IF (nn == nb_bloc) THEN
           n_up  = np_glob - n_inf + 1
        ELSE
           n_up  = np_alloc
        END IF
        n_sup = n_inf + n_up - 1

        DO i = 1, m_max_c
           dist_field(i,1:nb_field,1:n_up) = transpose(v1_in(n_inf:n_sup,:,i))
           dist_field(i,nb_field+1:2*nb_field,1:n_up) = transpose(v2_in(n_inf:n_sup,:,i))
        END DO
        IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

        IF (np/=np_tot) dist_field(:,:,np+1:np_tot) = 1.d100

        longueur_tranche=bloc_size*m_max_c*nb_field*2

        t = mpi_wtime()
        mpid=mpi_double_precision
        CALL mpi_alltoall (dist_field,longueur_tranche, mpid, combined_field, longueur_tranche, &
             mpid, communicator, code)
        IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

        t = mpi_wtime()
        !JLG, FEB 4, 2011
        cu = 0.d0
        !JLG, FEB 4, 2011
        DO n = 1, bloc_size
           DO nb = 1, nb_procs
              shiftc = (nb-1)*bloc_size
              shiftl = (nb-1)*m_max_c
              jindex = n + shiftc
              DO nf = 1, nb_field
                 ! Put real and imaginary parts in a complex
                 ! nf=1,2,3 => V1_in
                 ! nf=4,5,6 => V2_in
                 ! INPUT ARE COSINE AND SINE COEFFICIENTS
                 ! THEY ARE PUT IN COMPLEX FORMAT: c_0 = a_0 + i*0 and c_n = (a_n-i*b_n)/2
                 cu(shiftl+1:shiftl+m_max_c,nf,n) = cmplx(combined_field(:,2*nf-1,jindex),&
                      -combined_field(:,2*nf,jindex),kind=8)/2
              END DO
           END DO
        END DO
        cu(1,:,:) = 2*cmplx(REAL(cu(1,:,:),KIND=8),0.d0,KIND=8)
        !JLG, FEB 4, 2011
        !Padding is done by initialization of cu: cu = 0
        !This is eequivalent to cu(m_max+1:m_max_pad,:,:) = 0.d0
        !JLG, FEB 4, 2011

        IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

        ! Set the parameters for dfftw
        fft_dim=1; istride=1; ostride=1;
        !JLG, FEB 4, 2011
 !!$       idist=N_r;   inembed(1)=N_r; DIM(1)=N_r
 !!$       odist=m_max; onembed(1)=m_max
        idist=n_r_pad;   inembed(1)=n_r_pad; dim(1)=n_r_pad
        odist=m_max_pad; onembed(1)=m_max_pad
        !JLG, FEB 4, 2011

        howmany=bloc_size*nb_field


        t = mpi_wtime()
        IF (once) CALL dfftw_plan_many_dft_c2r(fftw_plan_multi_c2r, fft_dim, dim, howmany, cu, &
             onembed, ostride, odist, ru, inembed, istride, idist, fftw_estimate)
 !write(*,*) ' FFT_PAR_DOT_PROD: fftw_plan_multi_c2r', fftw_plan_multi_c2r
        CALL dfftw_execute(fftw_plan_multi_c2r)

        ! DOT PRODDUCT
        IF (nb_field==6) THEN
           prod_ru(:,:) = ru(:,1,:)*ru(:,4,:) + ru(:,2,:)*ru(:,5,:) + ru(:,3,:)*ru(:,6,:)
        END IF
        ! DOT PRODUCT

        howmany = bloc_size*1
        IF (once) CALL dfftw_plan_many_dft_r2c(fftw_plan_multi_r2c, fft_dim, dim, howmany, prod_ru, &
             inembed, istride, idist, prod_cu, onembed, ostride, odist, fftw_estimate)
 !write(*,*) ' FFT_PAR_DOT_PROD: fftw_plan_multi_r2c', fftw_plan_multi_r2c
        CALL dfftw_execute(fftw_plan_multi_r2c)
        !JLG, FEB 4, 2011
 !!$       prod_cu = prod_cu/N_r !Scaling
        prod_cu = prod_cu/n_r_pad !Scaling
        !JLG, FEB 4, 2011
        IF (PRESENT(temps)) temps(2) = temps(2) + mpi_wtime() -t

        !Now we need to redistribute the Fourier coefficients on each processor
        t = mpi_wtime()
        combined_prod_cu(:,1)=prod_cu(1,:)
        DO n=2, m_max
           combined_prod_cu(:,n)=2*conjg(prod_cu(n,:))
        END DO

        IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

        t = mpi_wtime()
        longueur_tranche=bloc_size*m_max_c*2
        mpid=mpi_double_precision
        CALL mpi_alltoall (combined_prod_cu,longueur_tranche,mpid, dist_prod_cu,longueur_tranche, &
             mpid,communicator,code)
        IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t
        ! dimensions:

        t = mpi_wtime()
        DO i = 1, m_max_c
           DO nb = 1, nb_procs
              shiftc = (nb-1)*bloc_size
              shiftl = (nb-1)*m_max_c
              intermediate = dist_prod_cu(:,shiftl+i)
              DO n = 1, bloc_size
                 IF (n_inf-1+n+shiftc > np_glob ) cycle
                 c_out(n_inf-1+n+shiftc, 1, i) = REAL (intermediate(n),KIND=8)
                 c_out(n_inf-1+n+shiftc, 2 , i)  = aimag(intermediate(n))
              END DO
           END DO
        END DO
        IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

        once = .false.

     END DO

     DEALLOCATE(dist_field, combined_field, dist_prod_cu, combined_prod_cu, intermediate, out_prod_cu)

   END SUBROUTINE fft_par_dot_prod

   SUBROUTINE fft_par_prod(communicator, c1_in, c2_in, c_out, temps, padding)
     !FFT (FFT(-1) c1 . FFT(-1) c2) = c_out
     !This a de-aliased version of the code, FEB 4, 2011, JLG
     !
     ! Purpose:
     !   Pseudo-spectral product of two scalar fields. Both inputs are given by
     !   their cosine/sine Fourier coefficients, are transformed to physical space
     !   (complex-to-real FFT), multiplied point by point, and the product is
     !   transformed back (real-to-complex FFT) and truncated to m_max modes.
     !
     ! Arguments:
     !   communicator : communicator over which the modes are distributed; the
     !                  routine uses m_max = SIZE(c1_in,3)*nb_procs modes in total.
     !   c1_in, c2_in : c_in(1:np,1:2,1:m_max_c), index 2 = (cosine, sine) coefficient.
     !   c_out        : same layout as the inputs; c_out(:,1,:) receives the real
     !                  part and c_out(:,2,:) the imaginary part of the result.
     !   temps        : optional; reset to zero on entry, then accumulates
     !                  temps(1) = communication time,
     !                  temps(2) = computation time (FFT planning and execution),
     !                  temps(3) = format-change time.
     !                  Entries 1..3 are written, so it must hold at least 3 values.
     !   padding      : optional; absent or .true. => zero padding to 3*m_max/2
     !                  modes, .false. => no padding (a message is printed).
     !
     ! INPUT ARE COSINE AND SINE COEFFICIENTS
     ! THEY ARE PUT IN COMPLEX FORMAT: c_0 = a_0 + i*0 and c_n = (a_n-i*b_n)/2
     !
     ! The routine keeps no state between calls: work arrays and FFTW plans are
     ! created on entry and released before returning. (Previously the plans were
     ! re-created on every call without ever being destroyed.)
     IMPLICIT NONE
     include 'fftw3.f'
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(IN)  :: c1_in, c2_in
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(OUT) :: c_out
     REAL(KIND=8), DIMENSION(:), OPTIONAL, INTENT(INOUT) :: temps
     LOGICAL,                    OPTIONAL, INTENT(IN)    :: padding

     ! Budget used to choose how many points are processed per block
     INTEGER, PARAMETER :: n_cache = 150000

     INTEGER :: np, np_tot, bloc_size, m_max, m_max_c, nb_procs, mpid, m_max_pad, n_r_pad
     INTEGER(KIND=8) :: fftw_plan_multi_c2r, fftw_plan_multi_r2c
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:,:,:) :: cu
     REAL(KIND=8),    ALLOCATABLE, DIMENSION(:,:,:) :: ru
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:,:)   :: prod_cu
     REAL(KIND=8),    ALLOCATABLE, DIMENSION(:,:)   :: prod_ru

     INTEGER :: np_glob, np_loc, reste, np_alloc, nn, nb_bloc, n_sup, n_inf, n_up
     INTEGER :: nb, nf, shiftc, shiftl, jindex, longueur_tranche, i, n, code
     REAL(KIND=8) :: t
     LOGICAL :: use_padding
     REAL(KIND=8),  ALLOCATABLE, DIMENSION(:,:,:) :: dist_field, combined_field
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:,:) :: combined_prod_cu, dist_prod_cu

     !Vector to speed up the format changes
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:)  :: intermediate

     ! FFTW parameters
     INTEGER   :: fft_dim, howmany, istride, ostride, idist, odist
     INTEGER, DIMENSION(1) :: dim, inembed, onembed
     ! Recall complexes must be rescaled
     ! End FFTW parameters
 #include "petsc/finclude/petsc.h"
     mpi_comm :: communicator

     CALL mpi_comm_size(communicator,nb_procs,code)

     IF (PRESENT(temps)) temps = 0.d0

     np_glob = SIZE(c1_in,1)
     m_max_c = SIZE(c1_in,3) ! Number of complex (cosines + sines) coefficients per point
     ! Must be tested before m_max_c is used as a divisor below
     IF (m_max_c==0) THEN
        WRITE(*,*) ' BUG '
        stop
     END IF
     m_max = m_max_c*nb_procs ! Total number of complex coefficients per point

     ! Cut the meridian plane into nb_bloc blocks of np_alloc points.
     ! MAX(.,1) protects the next division when 12*m_max_c exceeds n_cache.
     np_loc  = MAX(n_cache/(12*m_max_c),1)
     nb_bloc = MAX(np_glob/np_loc,1)
     DO
        np_loc   = np_glob/nb_bloc
        reste    = np_glob - np_loc*nb_bloc
        np_alloc = np_loc + reste
        reste    = np_alloc*nb_bloc - np_glob
        IF (reste<=np_alloc) EXIT
        nb_bloc = nb_bloc - 1
     END DO
     ! nb_bloc  = number of blocks the meridian plane is cut into
     ! np_alloc = number of points in one block

     ! Bloc_size is the number of points that are handled by one processor
     ! once the Fourier modes are all collected
     np = np_alloc
     IF (MODULO(np,nb_procs)==0) THEN
        bloc_size = np/nb_procs
     ELSE
        bloc_size = np/nb_procs + 1
     END IF
     np_tot = nb_procs*bloc_size
     ! end of the distribution of the points

     !JLG, FEB 4, 2011
     !Only ru, cu, prod_ru, prod_cu are affected by the padding
     use_padding = .true.
     IF (PRESENT(padding)) use_padding = padding
     IF (use_padding) THEN
        m_max_pad = 3*m_max/2
     ELSE
        WRITE(*,*) ' NO PADDING '
        m_max_pad = m_max
     END IF
     n_r_pad = 2*m_max_pad-1

     ALLOCATE(cu(m_max_pad,2,bloc_size))
     ALLOCATE(ru(n_r_pad,  2,bloc_size))
     ALLOCATE(prod_cu(m_max_pad,bloc_size))
     ALLOCATE(prod_ru(n_r_pad,  bloc_size))
     ALLOCATE(intermediate(bloc_size))
     ALLOCATE(    dist_field(m_max_c,4,np_tot))
     ALLOCATE(combined_field(m_max_c,4,np_tot))
     ALLOCATE(dist_prod_cu(bloc_size,m_max))
     ALLOCATE(combined_prod_cu(bloc_size,m_max))

     ! Set the parameters for dfftw and build both plans once; every block reuses
     ! them. The i* variables describe the real arrays, the o* variables the
     ! complex arrays.
     fft_dim=1; istride=1; ostride=1
     idist=n_r_pad;   inembed(1)=n_r_pad; dim(1)=n_r_pad
     odist=m_max_pad; onembed(1)=m_max_pad

     t = mpi_wtime()
     howmany = bloc_size*2
     CALL dfftw_plan_many_dft_c2r(fftw_plan_multi_c2r, fft_dim, dim, howmany, cu, &
          onembed, ostride, odist, ru, inembed, istride, idist, fftw_estimate)
     howmany = bloc_size
     CALL dfftw_plan_many_dft_r2c(fftw_plan_multi_r2c, fft_dim, dim, howmany, prod_ru, &
          inembed, istride, idist, prod_cu, onembed, ostride, odist, fftw_estimate)
     IF (PRESENT(temps)) temps(2) = temps(2) + mpi_wtime() -t

     n_sup = 0
     DO nn = 1, nb_bloc

        ! Packing the (cosine, sine) pairs of c1_in and c2_in into dist_field,
        ! where the dimension indexing the nodal points varies the least rapidly,
        ! so that after distributing the data to the processes, each one will
        ! obtain a part on nodal points.
        ! The transpose puts the mode index i first: it is the direction along
        ! which the FFT is performed.
        t = mpi_wtime()
        n_inf = n_sup + 1
        IF (nn == nb_bloc) THEN
           n_up  = np_glob - n_inf + 1 ! the last block takes what is left
        ELSE
           n_up  = np_alloc
        END IF
        n_sup = n_inf + n_up - 1

        DO i = 1, m_max_c
           dist_field(i,1:2,1:n_up) = TRANSPOSE(c1_in(n_inf:n_sup,:,i))
           dist_field(i,3:4,1:n_up) = TRANSPOSE(c2_in(n_inf:n_sup,:,i))
        END DO
        IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

        ! Points added to make the block size a multiple of nb_procs carry a
        ! sentinel value
        IF (np/=np_tot) dist_field(:,:,np+1:np_tot) = 1.d100

        longueur_tranche=bloc_size*m_max_c*4

        t = mpi_wtime()
        mpid=mpi_double_precision
        CALL mpi_alltoall (dist_field,longueur_tranche, mpid, combined_field, longueur_tranche, &
             mpid, communicator, code)
        IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

        t = mpi_wtime()
        !JLG, FEB 4, 2011
        !Padding is done by initialization of cu: cu = 0
        !This is equivalent to cu(m_max+1:m_max_pad,:,:) = 0.d0
        cu = 0.d0
        DO n = 1, bloc_size
           DO nb = 1, nb_procs
              shiftc = (nb-1)*bloc_size
              shiftl = (nb-1)*m_max_c
              jindex = n + shiftc
              DO nf = 1, 2
                 ! Put real and imaginary parts in a complex
                 ! nf=1 => c1_in
                 ! nf=2 => c2_in
                 ! c_n = (a_n-i*b_n)/2
                 cu(shiftl+1:shiftl+m_max_c,nf,n) = CMPLX(combined_field(:,2*nf-1,jindex),&
                      -combined_field(:,2*nf,jindex),KIND=8)/2
              END DO
           END DO
        END DO
        ! Mode zero: c_0 = a_0 + i*0 (undo the division by 2, drop the sine part)
        cu(1,:,:) = 2*CMPLX(REAL(cu(1,:,:),KIND=8),0.d0,KIND=8)
        IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

        t = mpi_wtime()
        CALL dfftw_execute(fftw_plan_multi_c2r)

        !PRODUCT
        prod_ru(:,:) = ru(:,1,:)*ru(:,2,:)
        !PRODUCT

        CALL dfftw_execute(fftw_plan_multi_r2c)
        !JLG, FEB 4, 2011
        prod_cu = prod_cu/n_r_pad !Scaling
        !JLG, FEB 4, 2011
        IF (PRESENT(temps)) temps(2) = temps(2) + mpi_wtime() -t

        !Now we need to redistribute the Fourier coefficients on each processor
        !Only the first m_max modes are kept; the coefficients go back to the
        !cosine/sine convention (a_n + i*b_n = 2*conjg(c_n) for n>=2).
        t = mpi_wtime()
        combined_prod_cu(:,1)=prod_cu(1,:)
        DO n=2, m_max
           combined_prod_cu(:,n)=2*CONJG(prod_cu(n,:))
        END DO
        IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

        t = mpi_wtime()
        ! Complex data sent as pairs of double precision numbers, hence the 2
        longueur_tranche=bloc_size*m_max_c*2
        mpid=mpi_double_precision
        CALL mpi_alltoall (combined_prod_cu,longueur_tranche,mpid, dist_prod_cu,longueur_tranche, &
             mpid,communicator,code)
        IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

        t = mpi_wtime()
        DO i = 1, m_max_c
           DO nb = 1, nb_procs
              shiftc = (nb-1)*bloc_size
              shiftl = (nb-1)*m_max_c
              intermediate = dist_prod_cu(:,shiftl+i)
              DO n = 1, bloc_size
                 ! Skip the points that lie beyond the end of the global array
                 IF (n_inf-1+n+shiftc > np_glob ) CYCLE
                 c_out(n_inf-1+n+shiftc, 1, i) = REAL (intermediate(n),KIND=8)
                 c_out(n_inf-1+n+shiftc, 2, i) = AIMAG(intermediate(n))
              END DO
           END DO
        END DO
        IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     END DO

     ! Release the plans built above so that repeated calls do not leak them
     CALL dfftw_destroy_plan(fftw_plan_multi_c2r)
     CALL dfftw_destroy_plan(fftw_plan_multi_r2c)

     DEALLOCATE(cu, ru, prod_cu, prod_ru)
     DEALLOCATE(dist_field, combined_field, dist_prod_cu, combined_prod_cu, intermediate)

   END SUBROUTINE fft_par_prod

   SUBROUTINE fft_par_allen_cahn(communicator, c_in, c_out, temps, padding)
     !This a de-aliased version of the code, FEB 4, 2011, JLG
     !
     ! Purpose:
     !   Pseudo-spectral evaluation of the Allen-Cahn nonlinearity f(c) = c*(c**2-1).
     !   The input, given by its cosine/sine Fourier coefficients, is transformed
     !   to physical space (complex-to-real FFT), f is applied point by point, and
     !   the result is transformed back (real-to-complex FFT) and truncated to
     !   m_max modes.
     !
     ! Arguments:
     !   communicator : communicator over which the modes are distributed; the
     !                  routine uses m_max = SIZE(c_in,3)*nb_procs modes in total.
     !   c_in         : c_in(1:np,1:2,1:m_max_c), index 2 = (cosine, sine) coefficient.
     !   c_out        : same layout as c_in; c_out(:,1,:) receives the real part
     !                  and c_out(:,2,:) the imaginary part of the result.
     !   temps        : optional; reset to zero on entry, then accumulates
     !                  temps(1) = communication time,
     !                  temps(2) = computation time (FFT planning and execution),
     !                  temps(3) = format-change time.
     !                  Entries 1..3 are written, so it must hold at least 3 values.
     !   padding      : optional; absent or .true. => zero padding to 3*m_max/2
     !                  modes, .false. => no padding (a message is printed).
     !
     ! INPUT ARE COSINE AND SINE COEFFICIENTS
     ! THEY ARE PUT IN COMPLEX FORMAT: c_0 = a_0 + i*0 and c_n = (a_n-i*b_n)/2
     !
     ! The routine keeps no state between calls: work arrays and FFTW plans are
     ! created on entry and released before returning. (Previously the plans were
     ! re-created on every call without ever being destroyed.)
     IMPLICIT NONE
     include 'fftw3.f'
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(IN)  :: c_in
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(OUT) :: c_out
     REAL(KIND=8), DIMENSION(:), OPTIONAL, INTENT(INOUT) :: temps
     LOGICAL,                    OPTIONAL, INTENT(IN)    :: padding

     ! Budget used to choose how many points are processed per block
     INTEGER, PARAMETER :: n_cache = 150000

     INTEGER :: np, np_tot, bloc_size, m_max, m_max_c, nb_procs, mpid, m_max_pad, n_r_pad
     INTEGER(KIND=8) :: fftw_plan_multi_c2r, fftw_plan_multi_r2c
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:,:) :: cu
     REAL(KIND=8),    ALLOCATABLE, DIMENSION(:,:) :: ru
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:,:) :: prod_cu
     REAL(KIND=8),    ALLOCATABLE, DIMENSION(:,:) :: prod_ru

     INTEGER :: np_glob, np_loc, reste, np_alloc, nn, nb_bloc, n_sup, n_inf, n_up
     INTEGER :: nb, shiftc, shiftl, jindex, longueur_tranche, i, n, code
     REAL(KIND=8) :: t
     LOGICAL :: use_padding
     REAL(KIND=8),  ALLOCATABLE, DIMENSION(:,:,:) :: dist_field, combined_field
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:,:) :: combined_prod_cu, dist_prod_cu

     !Vector to speed up the format changes
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:)  :: intermediate

     ! FFTW parameters
     INTEGER   :: fft_dim, howmany, istride, ostride, idist, odist
     INTEGER, DIMENSION(1) :: dim, inembed, onembed
     ! Recall complexes must be rescaled
     ! End FFTW parameters
 #include "petsc/finclude/petsc.h"
     mpi_comm :: communicator

     CALL mpi_comm_size(communicator,nb_procs,code)

     IF (PRESENT(temps)) temps = 0.d0

     np_glob = SIZE(c_in,1)
     m_max_c = SIZE(c_in,3) ! Number of complex (cosines + sines) coefficients per point
     ! Must be tested before m_max_c is used as a divisor below
     IF (m_max_c==0) THEN
        WRITE(*,*) ' BUG '
        stop
     END IF
     m_max = m_max_c*nb_procs ! Total number of complex coefficients per point

     ! Cut the meridian plane into nb_bloc blocks of np_alloc points.
     ! MAX(.,1) protects the next division when 12*m_max_c exceeds n_cache.
     np_loc  = MAX(n_cache/(12*m_max_c),1)
     nb_bloc = MAX(np_glob/np_loc,1)
     DO
        np_loc   = np_glob/nb_bloc
        reste    = np_glob - np_loc*nb_bloc
        np_alloc = np_loc + reste
        reste    = np_alloc*nb_bloc - np_glob
        IF (reste<=np_alloc) EXIT
        nb_bloc = nb_bloc - 1
     END DO
     ! nb_bloc  = number of blocks the meridian plane is cut into
     ! np_alloc = number of points in one block

     ! Bloc_size is the number of points that are handled by one processor
     ! once the Fourier modes are all collected
     np = np_alloc
     IF (MODULO(np,nb_procs)==0) THEN
        bloc_size = np/nb_procs
     ELSE
        bloc_size = np/nb_procs + 1
     END IF
     np_tot = nb_procs*bloc_size
     ! end of the distribution of the points

     !JLG, FEB 4, 2011
     !Only ru, cu, prod_ru, prod_cu are affected by the padding
     use_padding = .true.
     IF (PRESENT(padding)) use_padding = padding
     IF (use_padding) THEN
        m_max_pad = 3*m_max/2
     ELSE
        WRITE(*,*) ' NO PADDING '
        m_max_pad = m_max
     END IF
     n_r_pad = 2*m_max_pad-1

     ALLOCATE(cu(m_max_pad,bloc_size))
     ALLOCATE(ru(n_r_pad,  bloc_size))
     ALLOCATE(prod_cu(m_max_pad,bloc_size))
     ALLOCATE(prod_ru(n_r_pad,  bloc_size))
     ALLOCATE(intermediate(bloc_size))
     ALLOCATE(    dist_field(m_max_c,2,np_tot))
     ALLOCATE(combined_field(m_max_c,2,np_tot))
     ALLOCATE(dist_prod_cu(bloc_size,m_max))
     ALLOCATE(combined_prod_cu(bloc_size,m_max))

     ! Set the parameters for dfftw and build both plans once; every block reuses
     ! them. The i* variables describe the real arrays, the o* variables the
     ! complex arrays.
     fft_dim=1; istride=1; ostride=1
     idist=n_r_pad;   inembed(1)=n_r_pad; dim(1)=n_r_pad
     odist=m_max_pad; onembed(1)=m_max_pad
     howmany = bloc_size

     t = mpi_wtime()
     CALL dfftw_plan_many_dft_c2r(fftw_plan_multi_c2r, fft_dim, dim, howmany, cu, &
          onembed, ostride, odist, ru, inembed, istride, idist, fftw_estimate)
     CALL dfftw_plan_many_dft_r2c(fftw_plan_multi_r2c, fft_dim, dim, howmany, prod_ru, &
          inembed, istride, idist, prod_cu, onembed, ostride, odist, fftw_estimate)
     IF (PRESENT(temps)) temps(2) = temps(2) + mpi_wtime() -t

     n_sup = 0
     DO nn = 1, nb_bloc

        ! Packing the (cosine, sine) pair of the input field into dist_field,
        ! where the dimension indexing the nodal points varies the least rapidly,
        ! so that after distributing the data to the processes, each one will
        ! obtain a part on nodal points.
        ! The transpose puts the mode index i first: it is the direction along
        ! which the FFT is performed.
        t = mpi_wtime()
        n_inf = n_sup + 1
        IF (nn == nb_bloc) THEN
           n_up  = np_glob - n_inf + 1 ! the last block takes what is left
        ELSE
           n_up  = np_alloc
        END IF
        n_sup = n_inf + n_up - 1

        DO i = 1, m_max_c
           dist_field(i,:,1:n_up) = TRANSPOSE(c_in(n_inf:n_sup,:,i))
        END DO
        IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

        ! Points added to make the block size a multiple of nb_procs carry a
        ! sentinel value
        IF (np/=np_tot) dist_field(:,:,np+1:np_tot) = 1.d100

        longueur_tranche=bloc_size*m_max_c*2

        t = mpi_wtime()
        mpid=mpi_double_precision
        CALL mpi_alltoall (dist_field,longueur_tranche, mpid, combined_field, longueur_tranche, &
             mpid, communicator, code)
        IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

        t = mpi_wtime()
        !JLG, FEB 4, 2011
        !Padding is done by initialization of cu: cu = 0
        !This is equivalent to cu(m_max+1:m_max_pad,:) = 0.d0
        cu = 0.d0
        DO n = 1, bloc_size
           DO nb = 1, nb_procs
              shiftc = (nb-1)*bloc_size
              shiftl = (nb-1)*m_max_c
              jindex = n + shiftc
              ! Put real and imaginary parts in a complex
              ! c_n = (a_n-i*b_n)/2
              cu(shiftl+1:shiftl+m_max_c,n) = CMPLX(combined_field(:,1,jindex),&
                   -combined_field(:,2,jindex),KIND=8)/2
           END DO
        END DO
        ! Mode zero: c_0 = a_0 + i*0 (undo the division by 2, drop the sine part)
        cu(1,:) = 2*CMPLX(REAL(cu(1,:),KIND=8),0.d0,KIND=8)
        IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

        t = mpi_wtime()
        CALL dfftw_execute(fftw_plan_multi_c2r)

        ! ALLEN CAHN FUNCTION
        prod_ru = ru*(ru**2-1)
        ! ALLEN CAHN FUNCTION

        CALL dfftw_execute(fftw_plan_multi_r2c)
        !JLG, FEB 4, 2011
        prod_cu = prod_cu/n_r_pad !Scaling
        !JLG, FEB 4, 2011
        IF (PRESENT(temps)) temps(2) = temps(2) + mpi_wtime() -t

        !Now we need to redistribute the Fourier coefficients on each processor
        !Only the first m_max modes are kept; the coefficients go back to the
        !cosine/sine convention (a_n + i*b_n = 2*conjg(c_n) for n>=2).
        t = mpi_wtime()
        combined_prod_cu(:,1)=prod_cu(1,:)
        DO n=2, m_max
           combined_prod_cu(:,n)=2*CONJG(prod_cu(n,:))
        END DO
        IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

        t = mpi_wtime()
        ! Complex data sent as pairs of double precision numbers, hence the 2
        longueur_tranche=bloc_size*m_max_c*2
        mpid=mpi_double_precision
        CALL mpi_alltoall (combined_prod_cu,longueur_tranche,mpid, dist_prod_cu,longueur_tranche, &
             mpid,communicator,code)
        IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

        t = mpi_wtime()
        DO i = 1, m_max_c
           DO nb = 1, nb_procs
              shiftc = (nb-1)*bloc_size
              shiftl = (nb-1)*m_max_c
              intermediate = dist_prod_cu(:,shiftl+i)
              DO n = 1, bloc_size
                 ! Skip the points that lie beyond the end of the global array
                 IF (n_inf-1+n+shiftc > np_glob ) CYCLE
                 c_out(n_inf-1+n+shiftc, 1, i) = REAL (intermediate(n),KIND=8)
                 c_out(n_inf-1+n+shiftc, 2, i) = AIMAG(intermediate(n))
              END DO
           END DO
        END DO
        IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     END DO

     ! Release the plans built above so that repeated calls do not leak them
     CALL dfftw_destroy_plan(fftw_plan_multi_c2r)
     CALL dfftw_destroy_plan(fftw_plan_multi_r2c)

     DEALLOCATE(cu, ru, prod_cu, prod_ru)
     DEALLOCATE(dist_field, combined_field, dist_prod_cu, combined_prod_cu, intermediate)

   END SUBROUTINE fft_par_allen_cahn


   SUBROUTINE ref(communicator,V1_in, V2_in, V_out, temps)
     IMPLICIT NONE

     include 'fftw3.f'
     !INCLUDE 'mpif.h'
     ! Format: V_1in(1:np,1:6,1:m_max_c)
     ! INPUT ARE COSINE AND SINE COEFFICIENTS
     ! THEY ARE PUT IN COMPLEX FORMAT: c_0 = a_0 + i*0 and c_n = (a_n-i*b_n)/2
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(IN)  :: V1_in, V2_in
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(OUT) :: V_out
     REAL(KIND=8), DIMENSION(:), OPTIONAL, INTENT(INOUT) :: temps

     ! Saved variables
     LOGICAL, SAVE   :: once=.true.
     INTEGER, SAVE   :: np, np_tot, bloc_size, nb_field, m_max, m_max_c, N_r, rang, nb_procs, MPID
     INTEGER(KIND=8), SAVE :: fftw_plan_multi_c2r, fftw_plan_multi_r2c
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:,:,:), SAVE :: cu, prod_cu
     REAL(KIND=8),    ALLOCATABLE, DIMENSION(:,:,:), SAVE :: ru, prod_ru
     ! End saved variables

     INTEGER :: i_field
     INTEGER :: nb, nf, shiftc, shiftl, jindex, longueur_tranche, i, n, code
     REAL(KIND=8) :: t
     REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:,:), SAVE :: dist_field, combined_field
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:,:,:), SAVE :: combined_prod_cu, dist_prod_cu, out_prod_cu

     !Vectors to speed up the format changes
     COMPLEX(KIND=8), ALLOCATABLE, DIMENSION(:,:)  :: intermediate
     !REAL(KIND=8), ALLOCATABLE, DIMENSION(:,:) :: V_int
     !Vectors to speed up the format changes

     ! FFTW parameters
     INTEGER   :: fft_dim, howmany, istride, ostride, idist, odist
     INTEGER, DIMENSION(1) :: dim, inembed, onembed
     ! Recall complexes must be rescaled
     ! End FFTW parameters
 #include "petsc/finclude/petsc.h"
     mpi_comm :: communicator

     !Temps(1) = Temps de communication
     !Temps(2) = Temps de calcul
     !Temps(3) = Temps de changement de format
     IF (PRESENT(temps)) temps = 0.d0

     IF (.NOT.once) THEN
        IF ((SIZE(v1_in,1).NE.np) .OR. (SIZE(v1_in,2).NE.nb_field) .OR. (SIZE(v1_in,3).NE.m_max_c)) THEN
           once = .true. !Something wrong happened, reinitialize
        END IF
     END IF

     IF (once) THEN
        CALL mpi_comm_size(communicator,nb_procs,code)
        CALL mpi_comm_rank(communicator,rang,code)

        np      = SIZE(v1_in,1) ! Number of points in the meridian plane
        nb_field= SIZE(v1_in,2) ! Number of fields
        m_max_c = SIZE(v1_in,3) ! Number of complex (cosines + sines) coefficients per point
        m_max = m_max_c*nb_procs! Number of comlex coefficients per point per processor
        n_r=2*m_max-1           ! Number of Real coefficients per point
        IF (mod(nb_field,2)/=0 .OR. m_max_c==0) THEN
           WRITE(*,*) ' BUG '
           stop
        END IF

        ! Bloc_size is the number of points that are handled by one processor
        ! once the Fourier modes are all collected
        ! Computation of bloc_size and np_tot
        IF (modulo(np,nb_procs)==0) THEN
           bloc_size = np/nb_procs
        ELSE
           bloc_size = np/nb_procs + 1
        END IF
        np_tot = nb_procs*bloc_size

        IF (ALLOCATED(ru)) DEALLOCATE(ru,cu,prod_ru,prod_cu)
        ALLOCATE(cu(m_max,nb_field,bloc_size))
        ALLOCATE(ru(n_r,  nb_field,bloc_size))
        ALLOCATE(prod_cu(m_max,nb_field/2,bloc_size))
        ALLOCATE(prod_ru(n_r,  nb_field/2,bloc_size))
        !ALLOCATE(V_int(nb_field,np))
        !ALLOCATE(V_int(np,nb_field))
     END IF

     ALLOCATE(intermediate(nb_field/2,bloc_size))
     ALLOCATE(    dist_field(m_max_c,2*nb_field,np_tot))
     ALLOCATE(combined_field(m_max_c,2*nb_field,np_tot))
     ALLOCATE(dist_prod_cu(nb_field/2,bloc_size,m_max), combined_prod_cu(nb_field/2,bloc_size,m_max))
     ALLOCATE(out_prod_cu(m_max_c,np_tot,nb_field/2))

     ! Packing all 3 complex components of both v1 and v2 input fields
     ! into dist_field, where the dimension indexing the nodal points varies the least rapidly,
     ! so that after distributing the data to the processes, each one will obtain a part
     ! on nodal points
     t = mpi_wtime()
     DO i = 1, m_max_c
        !V_int = V1_in(:,:,i)
        !dist_field(i,1:nb_field,:) = transpose(V_int)
        !V_int = V2_in(:,:,i)
        !dist_field(i,nb_field+1:2*nb_field,:) = transpose(V_int)
        dist_field(i,1:nb_field,1:np) = transpose(v1_in(:,:,i))
        dist_field(i,nb_field+1:2*nb_field,1:np) = transpose(v2_in(:,:,i))
        !DO nf = 1, nb_field/2
        !   dist_field(i,2*nf-1,1:np) = V1_in(:,2*nf-1,i)
        !   dist_field(i,2*nf  ,1:np) = V1_in(:,2*nf  ,i)
        !   dist_field(i,nb_field+2*nf-1,1:np) = V2_in(:,2*nf-1,i)
        !   dist_field(i,nb_field+2*nf  ,1:np) = V2_in(:,2*nf,i)
        !END DO
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     IF (np/=np_tot) dist_field(:,:,np+1:np_tot) = 1.d100

     longueur_tranche=bloc_size*m_max_c*nb_field*2

     t = mpi_wtime()
     mpid=mpi_double_precision
     CALL mpi_alltoall (dist_field,longueur_tranche, mpid, combined_field, longueur_tranche, &
          mpid, communicator, code)
     IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

     t = mpi_wtime()

     DO n = 1, bloc_size
        DO nb = 1, nb_procs
           shiftc = (nb-1)*bloc_size
           shiftl = (nb-1)*m_max_c
           jindex = n + shiftc
           DO nf = 1, nb_field
              ! Put real and imaginary parts in a complex
              ! for each field, nf=1,2,3,4,5,6
              ! nf=1,2,3 => V1_in
              ! nf=4,5,6 => V2_in
              ! cu(shiftl+1:shiftl+m_max_c,nf,n) = cmplx(combined_field(:,2*nf-1,jindex),combined_field(:,2*nf,jindex))
              ! INPUT ARE COSINE AND SINE COEFFICIENTS
              ! THEY ARE PUT IN COMPLEX FORMAT: c_0 = a_0 + i*0 and c_n = (a_n-i*b_n)/2
              cu(shiftl+1:shiftl+m_max_c,nf,n) = 0.5d0*cmplx(combined_field(:,2*nf-1,jindex),-combined_field(:,2*nf,jindex),kind=8)
           END DO
        END DO
     END DO
     cu(1,:,:) = 2.d0*cmplx(REAL(cu(1,:,:),KIND=8),0.d0,KIND=8)
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     ! Set the parameters for dfftw
     fft_dim=1; istride=1; ostride=1;
     idist=n_r;   inembed(1)=n_r; dim(1)=n_r
     odist=m_max; onembed(1)=m_max
     IF (rang==(nb_procs-1)) THEN
        howmany= (np - bloc_size*(nb_procs-1))*nb_field
     ELSE
        howmany=bloc_size*nb_field
     END IF

     t = mpi_wtime()
     IF (once) CALL dfftw_plan_many_dft_c2r(fftw_plan_multi_c2r, fft_dim, dim, howmany, cu, &
          onembed, ostride, odist, ru, inembed, istride, idist, fftw_estimate);
     CALL dfftw_execute(fftw_plan_multi_c2r)

     ! SIMPLE PRODUCT
     !DO nf = 1, nb_field/2
     !   ! ru(1:3) contains V1_in and ru(4:6) contains V2_in
     !   prod_ru(:,nf,:)  = ru(:,nf,:)*ru(:,nb_field/2+nf,:)
     !END DO
     ! END SIMPLE PRODUCT

     ! CROSS PRODDUCT
     IF (nb_field==6) THEN
        prod_ru(:,1,:) = ru(:,2,:)*ru(:,6,:) - ru(:,3,:)*ru(:,5,:)
        prod_ru(:,2,:) = ru(:,3,:)*ru(:,4,:) - ru(:,1,:)*ru(:,6,:)
        prod_ru(:,3,:) = ru(:,1,:)*ru(:,5,:) - ru(:,2,:)*ru(:,4,:)
     END IF
     ! CROSS PRODUCT

     howmany = howmany/2
     IF (once) CALL dfftw_plan_many_dft_r2c(fftw_plan_multi_r2c, fft_dim, dim, howmany, prod_ru, &
          inembed, istride, idist, prod_cu, onembed, ostride, odist, fftw_estimate);
     CALL dfftw_execute(fftw_plan_multi_r2c)

     prod_cu = prod_cu/n_r !Scaling
     !   prod_cu = prod_cu/REAL(N_r,KIND=8) !Scaling
     IF (PRESENT(temps)) temps(2) = temps(2) + mpi_wtime() -t

     !Now we need to redistribute the Fourier coefficients on each processor

     t = mpi_wtime()
     combined_prod_cu(:,:,1)=prod_cu(1,:,:)
     DO n=2, m_max
        !combined_prod_cu(:,:,n)=prod_cu(n,:,:)
        combined_prod_cu(:,:,n)=2.d0*conjg(prod_cu(n,:,:))
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     t = mpi_wtime()
     longueur_tranche=bloc_size*m_max_c*nb_field
     mpid=mpi_double_precision
     CALL mpi_alltoall (combined_prod_cu,longueur_tranche,mpid, dist_prod_cu,longueur_tranche, &
          mpid,communicator,code)
     IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

     ! dimensions:
     ! v_out(np, nb_field, m_max_c)
     t = mpi_wtime()

     IF (.false.) THEN
        DO i_field = 1, nb_field/2
           DO n = 1, bloc_size
              DO nb = 1, nb_procs
                 shiftc = (nb-1)*bloc_size
                 shiftl = (nb-1)*m_max_c
                 out_prod_cu(:,n+shiftc,i_field) = dist_prod_cu(i_field,n,shiftl+1:shiftl+m_max_c)
              END DO
           END DO
        END DO

        DO i_field = 1, nb_field/2
           DO i = 1, m_max_c
              v_out(:, i_field*2-1, i) = REAL(out_prod_cu(i, 1:np, i_field),KIND=8)
              v_out(:, i_field*2,   i) = aimag(out_prod_cu(i, 1:np, i_field))
           END DO
        END DO
     ELSE
        DO i = 1, m_max_c
           DO nb = 1, nb_procs
              shiftc = (nb-1)*bloc_size
              shiftl = (nb-1)*m_max_c
              intermediate = dist_prod_cu(:,:,shiftl+i)
              DO n = 1, bloc_size
                 IF (n+shiftc > np ) cycle
                 DO i_field = 1, nb_field/2
                    v_out(n+shiftc, i_field*2-1, i) = REAL (intermediate(i_field,n),KIND=8)
                    v_out(n+shiftc, i_field*2 , i)  = aimag(intermediate(i_field,n))
                    !v_out(n+shiftc, i_field*2-1, i) = REAL(dist_prod_cu(i_field,n,shiftl+i),KIND=8)
                    !v_out(n+shiftc, i_field*2 , i)  = AIMAG(dist_prod_cu(i_field,n,shiftl+i))
                 END DO
              END DO
           END DO
        END DO
     END IF

     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     DEALLOCATE(dist_field, combined_field, dist_prod_cu, combined_prod_cu, intermediate, out_prod_cu)
     IF (once) once = .false.
   END SUBROUTINE ref

   SUBROUTINE fft_heaviside_dcl(communicator,V1_in, V_out, nb_procs, bloc_size, m_max_pad, temps, padding)
     ! De-aliased (zero-padded) version, FEB 4, 2011, JLG.
     !
     ! Purpose: the cosine/sine Fourier coefficients of one scalar field are
     ! exchanged with mpi_alltoall so that each block of nodal points holds all
     ! the modes, transformed to physical space by a padded complex-to-real FFT,
     ! replaced point-wise by 1 where the value is > 0.5 and by 0 elsewhere,
     ! transformed back by a real-to-complex FFT, rescaled by 1/(2*m_max_pad-1),
     ! and exchanged again so that V_out receives cosine/sine coefficients.
     !
     ! Arguments:
     !   communicator : communicator handed to the two mpi_alltoall calls
     !   V1_in        : input, V1_in(1:np,1:2,1:m_max_c); (:,1,:) is used as the
     !                  cosine part and (:,2,:) as the sine part
     !   V_out        : output; only V_out(1:np,1:2,1:m_max_c) is written
     !   nb_procs     : number of blocks exchanged (presumably the size of
     !                  communicator -- to be confirmed against the callers)
     !   bloc_size    : number of nodal points per block after the exchange
     !   m_max_pad    : number of complex modes after zero padding; must be
     !                  at least m_max_c*nb_procs
     !   temps        : optional timers, reset to zero on entry; temps(1)
     !                  accumulates the alltoall time, temps(2) the FFT time,
     !                  temps(3) the packing/unpacking time
     !   padding      : optional, not referenced by this routine
     !
     ! The routine stops with the message ' BUG ' when nb_field /= 2,
     ! m_max_c == 0 or m_max_c*nb_procs > m_max_pad, because the work arrays
     ! below are sized for exactly one complex field of m_max_pad modes.
     IMPLICIT NONE
     include 'fftw3.f'
     ! Format: V_1in(1:np,1:2,1:m_max_c)
     ! INPUT ARE COSINE AND SINE COEFFICIENTS
     ! THEY ARE PUT IN COMPLEX FORMAT: c_0 = a_0 + i*0 and c_n = (a_n-i*b_n)/2
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(IN)  :: V1_in
     REAL(KIND=8), DIMENSION(:,:,:),  INTENT(OUT) :: V_out
     REAL(KIND=8), DIMENSION(:), OPTIONAL, INTENT(INOUT) :: temps
     LOGICAL,                    OPTIONAL, INTENT(IN)    :: padding
     INTEGER, INTENT(IN)                                 :: bloc_size, m_max_pad, nb_procs
     INTEGER          :: np, np_tot, nb_field, m_max, m_max_c, MPID, N_r_pad
     INTEGER(KIND=8)  :: fftw_plan_multi_c2r, fftw_plan_multi_r2c

     ! Work arrays: one complex field, m_max_pad modes <-> 2*m_max_pad-1 real samples
     COMPLEX(KIND=8), DIMENSION(m_max_pad, bloc_size)                 :: cu
     REAL(KIND=8), DIMENSION(2*m_max_pad-1, bloc_size)                :: ru
     COMPLEX(KIND=8), DIMENSION(m_max_pad,bloc_size)                  :: prod_cu
     REAL(KIND=8), DIMENSION(2*m_max_pad-1,bloc_size)                 :: prod_ru
     REAL(KIND=8), DIMENSION(SIZE(V1_in,3),SIZE(V1_in,2),bloc_size*nb_procs)      :: dist_field, combined_field
     COMPLEX(KIND=8), DIMENSION(bloc_size,SIZE(V1_in,3)*nb_procs)                 :: combined_prod_cu
     COMPLEX(KIND=8), DIMENSION(bloc_size,SIZE(V1_in,3)*nb_procs) :: dist_prod_cu

     INTEGER :: nb, shiftc, shiftl, jindex, longueur_tranche, i, n, n_loc, code
     REAL(KIND=8) :: t

     ! FFTW parameters
     INTEGER   :: fft_dim, howmany, istride, ostride, idist, odist
     INTEGER, DIMENSION(1) :: dim, inembed, onembed
     ! Recall complexes must be rescaled
     ! End FFTW parameters
 #include "petsc/finclude/petsc.h"
     mpi_comm :: communicator

     IF (PRESENT(temps)) temps = 0.d0

     np      = SIZE(V1_in,1)
     nb_field= SIZE(V1_in,2) ! Number of fields (real + imaginary parts)
     m_max_c = SIZE(V1_in,3) ! Number of complex (cosines + sines) coefficients per point
     m_max = m_max_c*nb_procs! Number of complex coefficients per point once all blocks are gathered
     N_r_pad=2*m_max_pad-1   ! Number of real samples of the padded transform
     np_tot = nb_procs*bloc_size

     ! cu/ru/prod_cu/prod_ru hold a single complex field of m_max_pad modes:
     ! anything else would make FFTW and mpi_alltoall run past these arrays.
     IF (nb_field/=2 .OR. m_max_c==0 .OR. m_max>m_max_pad) THEN
        WRITE(*,*) ' BUG '
        stop
     END IF

     ! bloc_size is the number of points that are handled by one block
     ! once the Fourier modes are all collected.

     ! Pack the input into dist_field, where the index over the nodal points
     ! varies the least rapidly, so that the exchange hands each block a
     ! contiguous set of nodal points.
     ! TRANSPOSE so that the mode index i comes first: it is the index along
     ! which the FFT is applied.
     t = mpi_wtime()
     DO i = 1, m_max_c
        dist_field(i,1:nb_field,1:np) = TRANSPOSE(V1_in(:,:,i))
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     ! Fill the points beyond np (padding of the last block) with a large marker value
     IF (np/=np_tot) dist_field(:,:,np+1:np_tot) = 1.d100

     longueur_tranche=bloc_size*m_max_c*nb_field

     t = mpi_wtime()
     MPID=mpi_double_precision
     CALL mpi_alltoall (dist_field,longueur_tranche, MPID, combined_field, longueur_tranche, &
          MPID, communicator, code)
     IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

     t = mpi_wtime()
     !JLG, FEB 4, 2011
     !Padding is done by initialization of cu: cu = 0
     !This is equivalent to cu(m_max+1:m_max_pad,:) = 0.d0
     cu = 0.d0
     !JLG, FEB 4, 2011
     DO n = 1, bloc_size
        DO nb = 1, nb_procs
           shiftc = (nb-1)*bloc_size
           shiftl = (nb-1)*m_max_c
           jindex = n + shiftc
           ! Put real and imaginary parts in a complex
           ! INPUT ARE COSINE AND SINE COEFFICIENTS
           ! THEY ARE PUT IN COMPLEX FORMAT: c_n = (a_n-i*b_n)/2
           cu(shiftl+1:shiftl+m_max_c,n) = CMPLX(combined_field(:,1,jindex),&
                -combined_field(:,2,jindex),KIND=8)/2
        END DO
     END DO
     ! Mode zero: c_0 = a_0 + i*0 (undo the factor 1/2 and drop the imaginary part)
     cu(1,:) = 2*CMPLX(REAL(cu(1,:),KIND=8),0.d0,KIND=8)
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     ! Set the parameters for dfftw
     fft_dim=1; istride=1; ostride=1;
     !JLG, FEB 4, 2011
     idist=N_r_pad;   inembed(1)=N_r_pad; dim(1)=N_r_pad
     odist=m_max_pad; onembed(1)=m_max_pad
     !JLG, FEB 4, 2011

     ! One transform per nodal point of the block (nb_field == 2 => one complex field)
     howmany=bloc_size

     t = mpi_wtime()
     CALL dfftw_plan_many_dft_c2r(fftw_plan_multi_c2r, fft_dim, dim, howmany, cu, &
          onembed, ostride, odist, ru, inembed, istride, idist, fftw_estimate)
     CALL dfftw_execute(fftw_plan_multi_c2r)
     ! The plan is rebuilt at every call: release it so that it does not leak
     CALL dfftw_destroy_plan(fftw_plan_multi_c2r)

     ! HEAVISIDE: 1 where the physical-space value exceeds 0.5, 0 elsewhere
     prod_ru = MERGE(1.d0, 0.d0, ru > 0.5)
     ! END HEAVISIDE

     CALL dfftw_plan_many_dft_r2c(fftw_plan_multi_r2c, fft_dim, dim, howmany, prod_ru, &
          inembed, istride, idist, prod_cu, onembed, ostride, odist, fftw_estimate)
     CALL dfftw_execute(fftw_plan_multi_r2c)
     CALL dfftw_destroy_plan(fftw_plan_multi_r2c)
     !JLG, FEB 4, 2011
     prod_cu = prod_cu/N_r_pad !Scaling
     !JLG, FEB 4, 2011
     IF (PRESENT(temps)) temps(2) = temps(2) + mpi_wtime() -t

     !Now we need to redistribute the Fourier coefficients on each processor

     t = mpi_wtime()
     ! Back to cosine/sine convention: a_0 = c_0 and a_n - i*b_n = 2*c_n, i.e.
     ! real part = a_n and imaginary part = b_n after conjugation.
     ! Only the first m_max modes are kept (the padded ones are dropped).
     combined_prod_cu(:,1)=prod_cu(1,:)
     DO n=2, m_max
        combined_prod_cu(:,n)=2*CONJG(prod_cu(n,:))
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

     t = mpi_wtime()
     ! The complex buffers are exchanged as double precision reals:
     ! bloc_size*m_max_c complex numbers per block = 2*bloc_size*m_max_c doubles
     longueur_tranche=2*bloc_size*m_max_c
     MPID=mpi_double_precision
     CALL mpi_alltoall (combined_prod_cu,longueur_tranche,MPID, dist_prod_cu,longueur_tranche, &
          MPID,communicator,code)
     IF (PRESENT(temps)) temps(1) = temps(1) + mpi_wtime() -t

     ! dimensions:
     ! V_out(np, 2, m_max_c)
     t = mpi_wtime()
     DO i = 1, m_max_c
        DO nb = 1, nb_procs
           shiftc = (nb-1)*bloc_size
           shiftl = (nb-1)*m_max_c
           ! Points beyond np are padding of the last block(s): skip them
           n_loc = MIN(bloc_size, np-shiftc)
           IF (n_loc <= 0) CYCLE
           V_out(shiftc+1:shiftc+n_loc, 1, i) = REAL (dist_prod_cu(1:n_loc,shiftl+i),KIND=8)
           V_out(shiftc+1:shiftc+n_loc, 2, i) = AIMAG(dist_prod_cu(1:n_loc,shiftl+i))
        END DO
     END DO
     IF (PRESENT(temps)) temps(3) = temps(3) + mpi_wtime() -t

   END SUBROUTINE fft_heaviside_dcl

 END MODULE sft_parallele_obsolete
gauss_points::l_g
integer, public l_g
Definition: associate_gauss.f90:9

sft_parallele_obsolete::fft_par_dot_prod_bis
subroutine fft_par_dot_prod_bis(communicator, V1_in, V2_in, V_out, nb_procs, bloc_size, m_max_pad, temps, padding)
Definition: fft_parallel_obsolete.f90:1040

sft_parallele_obsolete::fft_par_allen_cahn
subroutine, public fft_par_allen_cahn(communicator, c_in, c_out, temps, padding)
Definition: fft_parallel_obsolete.f90:2009

sft_parallele_obsolete::fft_par_cross_prod_bug
subroutine fft_par_cross_prod_bug(communicator, V1_in, V2_in, V_out, nb_procs, bloc_size, m_max_pad, temps)
Definition: fft_parallel_obsolete.f90:129

sft_parallele_obsolete::fft_par_dot_prod
subroutine, public fft_par_dot_prod(communicator, V1_in, V2_in, c_out, temps, padding)
Definition: fft_parallel_obsolete.f90:1488

my_util
Definition: my_util.f90:1

sft_parallele_obsolete::fft_par_dot_prod_dcl
subroutine, public fft_par_dot_prod_dcl(communicator, V1_in, V2_in, c_out, nb_procs, bloc_size, m_max_pad, temps)
Definition: fft_parallel_obsolete.f90:714

my_util::error_petsc
subroutine error_petsc(string)
Definition: my_util.f90:16

sft_parallele_obsolete::fft_heaviside_dcl
subroutine, public fft_heaviside_dcl(communicator, V1_in, V_out, nb_procs, bloc_size, m_max_pad, temps, padding)
Definition: fft_parallel_obsolete.f90:2504

sft_parallele_obsolete::ref
subroutine, public ref(communicator, V1_in, V2_in, V_out, temps)
Definition: fft_parallel_obsolete.f90:2260

sft_parallele_obsolete::fft_par_compressive_visc_dcl
subroutine, public fft_par_compressive_visc_dcl(communicator, V1_in, V2_in, V_out, pb, nb_procs, bloc_size, m_max_pad, l_G, opt_norm_out, opt_M_vel, opt_norm, temps, padding)
Definition: fft_parallel_obsolete.f90:493

sft_parallele_obsolete::fft_par_real
subroutine, public fft_par_real(communicator, V1_in, V_out, opt_nb_plane)
Definition: fft_parallel_obsolete.f90:14

sft_parallele_obsolete::fft_par_cross_prod_dcl
subroutine, public fft_par_cross_prod_dcl(communicator, V1_in, V2_in, V_out, nb_procs, bloc_size, m_max_pad, temps, padding)
Definition: fft_parallel_obsolete.f90:311

sft_parallele_obsolete::fft_par_prod_dcl
subroutine, public fft_par_prod_dcl(communicator, c1_in, c2_in, c_out, nb_procs, bloc_size, m_max_pad, temps)
Definition: fft_parallel_obsolete.f90:880

sft_parallele_obsolete::fft_par_prod
subroutine, public fft_par_prod(communicator, c1_in, c2_in, c_out, temps, padding)
Definition: fft_parallel_obsolete.f90:1752

sft_parallele_obsolete::fft_par_cross_prod
subroutine, public fft_par_cross_prod(communicator, V1_in, V2_in, V_out, temps, padding)
Definition: fft_parallel_obsolete.f90:1218

sft_parallele_obsolete
Definition: fft_parallel_obsolete.f90:4