From 4a0472f8ba4eccb880941d47176b2d79c71215f9 Mon Sep 17 00:00:00 2001 From: Ron Caplan Date: Thu, 5 Dec 2024 17:46:46 -0800 Subject: [PATCH] Updated stdpar and nompigpu src subfolder versions for freeform. Updated code to use global seam buffers for less gpu (de)allocations. --- src/no_gpu_mpi/pot3d.F | 7062 ------------------------------------- src/no_gpu_mpi/pot3d.F90 | 7217 ++++++++++++++++++++++++++++++++++++++ src/pot3d.F90 | 405 ++- src/stdpar/pot3d.F | 6970 ------------------------------------ src/stdpar/pot3d.F90 | 7126 +++++++++++++++++++++++++++++++++++++ 5 files changed, 14621 insertions(+), 14159 deletions(-) delete mode 100644 src/no_gpu_mpi/pot3d.F create mode 100644 src/no_gpu_mpi/pot3d.F90 delete mode 100644 src/stdpar/pot3d.F create mode 100644 src/stdpar/pot3d.F90 diff --git a/src/no_gpu_mpi/pot3d.F b/src/no_gpu_mpi/pot3d.F deleted file mode 100644 index 216e5e1..0000000 --- a/src/no_gpu_mpi/pot3d.F +++ /dev/null @@ -1,7062 +0,0 @@ -c####################################################################### -c _____ ____ _______ ____ _____ -c | __ \ / __ \__ __|___ \| __ \ -c | |__) | | | | | | __) | | | | -c | ___/| | | | | | |__ <| | | | -c | | | |__| | | | ___) | |__| | -c |_| \____/ |_| |____/|_____/ -c -c ****** POT3D: 3D potential magnetic field outside a sphere. -c -c ****** This program can find the classical potential field, the -c ****** fully open field, the source-surface field, and the -c ****** source-surface plus current-sheet field. -c -c Authors: Zoran Mikic -c Ronald M. Caplan -c Jon A. Linker -c Roberto Lionello -c Miko Stulajter -c -c Predictive Science Inc. -c www.predsci.com -c San Diego, California, USA 92121 -c -c####################################################################### -c Copyright 2021 Predictive Science Inc. -c -c Licensed under the Apache License, Version 2.0 (the "License"); -c you may not use this file except in compliance with the License. -c You may obtain a copy of the License at -c -c http://www.apache.org/licenses/LICENSE-2.0 -c -c Unless required by applicable law or agreed to in writing, software -c distributed under the License is distributed on an "AS IS" BASIS, -c WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -c implied. -c See the License for the specific language governing permissions and -c limitations under the License. -c####################################################################### -c -c####################################################################### -c - module ident -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c ****** Code name. -c----------------------------------------------------------------------- -c - character(*), parameter :: idcode='POT3D' - character(*), parameter :: vers ='4.2.2_nogpumpi' - character(*), parameter :: update='12/05/2024' -c - end module -c####################################################################### - module number_types -c -c----------------------------------------------------------------------- -c ****** Basic number types. -c ****** This module is used to set the default precision for REALs. 
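Changing r_typ below is the single switch that sets the working precision of the whole code; arrays and literals are all declared against it. A minimal free-form sketch of the same pattern (standalone illustration; the module and program names here are invented, not POT3D's):

      module number_types_demo
        use iso_fortran_env
        implicit none
        ! Pick the default REAL kind once; all code declares real(r_typ).
        integer, parameter :: r_typ = REAL64
      end module number_types_demo

      program precision_demo
        use number_types_demo
        implicit none
        real(r_typ) :: x
        ! The _r_typ suffix keeps the literal in the selected kind.
        x = 1.0_r_typ/3.0_r_typ
        print *, 'kind = ', kind(x), '  x = ', x
      end program precision_demo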
-c----------------------------------------------------------------------- -c - use iso_fortran_env -c -c----------------------------------------------------------------------- -c - implicit none -c - integer, parameter :: KIND_REAL_4=REAL32 - integer, parameter :: KIND_REAL_8=REAL64 - integer, parameter :: KIND_REAL_16=max(REAL128,REAL64) -c - integer, parameter :: r_typ=KIND_REAL_8 -c - end module -c####################################################################### - module number_types_pc -c -c----------------------------------------------------------------------- -c - use number_types - use iso_fortran_env -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer, parameter :: r_typ_pc=REAL32 -c - end module -c####################################################################### - module constants -c -c----------------------------------------------------------------------- -c - use number_types -c -c----------------------------------------------------------------------- -c - implicit none -c - real(r_typ), parameter :: pi=3.1415926535897932_r_typ -c - end module -c####################################################################### - module global_dims -c -c----------------------------------------------------------------------- -c ****** Global number of mesh points. -c----------------------------------------------------------------------- -c - implicit none -c -c ****** Global mesh size. -c - integer :: nr_g,nrm1_g - integer :: nt_g,ntm1_g - integer :: np_g,npm1_g -c -c ****** Flag to indicate an axisymmetric run. -c - logical :: axisymmetric=.false. -c - end module -c####################################################################### - module local_dims_r -c -c----------------------------------------------------------------------- -c ****** Local number of mesh points and indices in the r direction -c ****** for the radial mesh. -c----------------------------------------------------------------------- -c - implicit none -c -c ****** Local mesh size. -c - integer :: nr,nrm1 -c -c ****** Dimensions of arrays on the "main" mesh. -c - integer :: nrm -c -c ****** Indices of start and end points in the global mesh -c ****** belonging to this processor. -c - integer :: i0_g,i1_g -c -c ****** Flags to indicate whether this processor has points -c ****** on the physical boundaries. -c - logical :: rb0,rb1 -c - end module -c####################################################################### - module local_dims_tp -c -c----------------------------------------------------------------------- -c ****** Local number of mesh points and indices in the theta -c ****** and phi dimensions. -c----------------------------------------------------------------------- -c - implicit none -c -c ****** Local mesh size. -c - integer :: nt,ntm1 - integer :: np,npm1 -c -c ****** Dimensions of arrays on the "main" mesh. -c - integer :: ntm - integer :: npm -c -c ****** Indices of start and end points in the global mesh -c ****** belonging to this processor. -c - integer :: j0_g,j1_g - integer :: k0_g,k1_g -c -c ****** Flags to indicate whether this processor has points -c ****** on the physical boundaries. -c - logical :: tb0,tb1 -c - end module -c####################################################################### - module global_mesh -c -c----------------------------------------------------------------------- -c ****** Global mesh. 
-c----------------------------------------------------------------------- -c - use number_types - use constants -c - implicit none -c - real(r_typ), dimension(:), allocatable :: r_g,rh_g,dr_g,drh_g - real(r_typ), dimension(:), allocatable :: t_g,th_g,dt_g,dth_g - real(r_typ), dimension(:), allocatable :: p_g,ph_g,dp_g,dph_g -c - real(r_typ), dimension(:), allocatable :: st_g,ct_g,sth_g,cth_g - real(r_typ), dimension(:), allocatable :: sp_g,cp_g,sph_g,cph_g -c -c ****** Physical mesh size. -c - real(r_typ), parameter, private :: one=1._r_typ - real(r_typ), parameter, private :: two=2._r_typ -c - real(r_typ) :: r0=1._r_typ - real(r_typ) :: r1=30._r_typ - real(r_typ), parameter :: t0=0. - real(r_typ), parameter :: t1=pi - real(r_typ), parameter :: p0=0. - real(r_typ), parameter :: p1=two*pi -c - real(r_typ), parameter :: pl=p1-p0 - real(r_typ), parameter :: pl_i=one/pl -c - end module -c####################################################################### - module local_mesh_r -c -c----------------------------------------------------------------------- -c ****** Local mesh for the r dimension. -c----------------------------------------------------------------------- -c - use number_types -c - implicit none -c - real(r_typ), dimension(:), allocatable :: r,r2,rh,dr,drh - real(r_typ) :: dr1,drn -c -c ****** Inverse quantities (for efficiency). -c - real(r_typ), dimension(:), allocatable :: r_i,rh_i - real(r_typ), dimension(:), allocatable :: dr_i,drh_i -c - end module -c####################################################################### - module local_mesh_tp -c -c----------------------------------------------------------------------- -c ****** Local mesh for the theta and phi dimensions. -c----------------------------------------------------------------------- -c - use number_types -c - implicit none -c - real(r_typ), dimension(:), allocatable :: t,th,dt,dth - real(r_typ), dimension(:), allocatable :: p,ph,dp,dph -c - real(r_typ), dimension(:), allocatable :: st,ct,sth,cth - real(r_typ), dimension(:), allocatable :: sp,cp,sph,cph -c -c ****** Inverse quantities (for efficiency). -c - real(r_typ), dimension(:), allocatable :: dt_i,dth_i - real(r_typ), dimension(:), allocatable :: st_i,sth_i - real(r_typ), dimension(:), allocatable :: dp_i,dph_i -c - end module -c####################################################################### - module mpidefs -c -c----------------------------------------------------------------------- -c ****** MPI variables, processor topology, and processor information. -c----------------------------------------------------------------------- -c - use mpi -c - implicit none -c -c ****** Total number of processors. -c - integer :: nproc -c -c ****** Total number of processors per node. -c - integer :: nprocsh -c -c ****** Processor rank of this process in communicator -c ****** MPI_COMM_WORLD. -c - integer :: iprocw -c -c ****** Processor rank of this process in communicator -c ****** comm_shared. -c - integer :: iprocsh -c -c ****** Flag to designate that this is the processor with -c ****** rank 0 in communicator MPI_COMM_WORLD. -c - logical :: iamp0 -c -c ****** Communicator over all processors in the Cartesian topology. -c - integer :: comm_all -c -c ****** Processor rank of this process in communicator -c ****** COMM_ALL. -c - integer :: iproc -c -c ****** Processor rank in communicator COMM_ALL for the -c ****** processor that has rank 0 in MPI_COMM_WORLD. -c - integer :: iproc0 -c -c ****** Communicators over all processors in the phi dimension. 
-c
-      integer :: comm_phi
-c
-c ****** Communicator over all shared processors on the node.
-c
-      integer :: comm_shared
-c
-c ****** Communicators over all processors in the r dimension.
-c
-      integer :: comm_r
-c
-c ****** Processor coordinate indices of this process
-c ****** in the Cartesian topology.
-c
-      integer :: iproc_r,iproc_t,iproc_p
-c
-c ****** Processor coordinate indices of the neighboring
-c ****** processors in the Cartesian topology.
-c
-      integer :: iproc_rm,iproc_rp
-      integer :: iproc_tm,iproc_tp
-      integer :: iproc_pm,iproc_pp
-c
-c ****** Number of processors along r, theta, and phi.
-c
-      integer :: nproc_r,nproc_t,nproc_p
-c
-c ****** Number type for REALs to be used in MPI calls.
-c
-      integer :: ntype_real
-c
-c ****** Total number of GPUs/node (DEPRECATED).
-c
-      integer :: gpn=0
-c
-      end module
-c#######################################################################
-      module decomposition_params
-c
-c-----------------------------------------------------------------------
-c ****** Input parameters that define the domain decomposition
-c ****** among processors.
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c ****** Number of processors per dimension.
-c
-      integer, dimension(3) :: nprocs=(/-1,-1,-1/)
-c
-      end module
-c#######################################################################
-      module decomposition
-c
-c-----------------------------------------------------------------------
-c ****** Information defining the domain decomposition.
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c ****** Define the structure type for mapping local arrays
-c ****** into global arrays.
-c
-      type :: map_struct
-        integer :: n
-        integer :: i0
-        integer :: i1
-        integer :: offset
-      end type
-c
-c ****** Mapping structures for the different mesh types.
-c
-      type(map_struct), dimension(:), pointer :: map_rh
-      type(map_struct), dimension(:), pointer :: map_rm
-      type(map_struct), dimension(:), pointer :: map_th
-      type(map_struct), dimension(:), pointer :: map_tm
-      type(map_struct), dimension(:), pointer :: map_ph
-      type(map_struct), dimension(:), pointer :: map_pm
-c
-      end module
-c#######################################################################
-      module meshdef
-c
-c-----------------------------------------------------------------------
-c ****** Variables that define the mesh-point distributions.
-c-----------------------------------------------------------------------
-c
-      use number_types
-c
-      implicit none
-c
-      integer, parameter :: nmseg=30
-c
-      real(r_typ), dimension(nmseg) :: drratio=0.
-      real(r_typ), dimension(nmseg) :: dtratio=0.
-      real(r_typ), dimension(nmseg) :: dpratio=0.
-      real(r_typ), dimension(nmseg) :: rfrac=0.
-      real(r_typ), dimension(nmseg) :: tfrac=0.
-      real(r_typ), dimension(nmseg) :: pfrac=0.
-c
-      integer :: nfrmesh=0
-      integer :: nftmesh=0
-      integer :: nfpmesh=0
-c
-      real(r_typ) :: phishift=0.
-c
-      end module
-c#######################################################################
-      module fields
-c
-c-----------------------------------------------------------------------
-c ****** Local field arrays.
-c-----------------------------------------------------------------------
-c
-      use number_types
-c
-      implicit none
-c
-c ****** Potential solution array and cg temp array.
-c
-      real(r_typ), dimension(:,:,:), allocatable :: phi
-      real(r_typ), dimension(:,:,:), allocatable :: x_ax
-c
-c ****** Boundary radial magnetic field arrays.
-c
-      real(r_typ), dimension(:,:), allocatable :: br0
-      real(r_typ), dimension(:,:), allocatable :: br1
-c
-c ****** Arrays used in polar boundary conditions.
-c
-      real(r_typ), dimension(:), allocatable :: sum0,sum1
-c
-c ****** Arrays used for final magnetic field.
-c
-      real(r_typ), dimension(:,:,:), allocatable :: br,bt,bp
-c
-      end module
-c#######################################################################
-      module cgcom
-c
-      use number_types
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c ****** Number of equations to solve in the CG solver.
-c-----------------------------------------------------------------------
-c
-      integer :: ncgeq
-c
-c-----------------------------------------------------------------------
-c ****** CG field solver parameters.
-c-----------------------------------------------------------------------
-c
-      integer :: ifprec=1
-      integer :: ncgmax=1000000
-      integer :: ncghist=100
-      real(r_typ) :: epscg=1.e-9
-c
-c-----------------------------------------------------------------------
-c ****** CG field solver variables.
-c-----------------------------------------------------------------------
-c
-      integer :: ncg
-      real(r_typ) :: epsn
-c
-      end module
-c#######################################################################
-      module vars
-c
-c-----------------------------------------------------------------------
-c ****** Miscellaneous input variables.
-c-----------------------------------------------------------------------
-c
-      use number_types
-c
-      implicit none
-c
-      character(256) :: outfile='pot3d.out'
-      character(256) :: phifile='default'
-      character(256) :: br0file='default'
-      character(256) :: brfile='default'
-      character(256) :: btfile='default'
-      character(256) :: bpfile='default'
-      character(256) :: br_photo_file='default'
-      character(256) :: br_photo_original_file='default'
-c
-c ****** Type of field solution.
-c ****** Select between 'potential', 'open', and 'ss' (source surface).
-c
-      character(16) :: option='potential'
-c
-c ****** Interval at which to dump diagnostics during the
-c ****** iteration for the source-surface plus current-sheet
-c ****** solution.
-c
-      integer :: ndump=0
-c
-c ****** Flag to skip the balancing of the flux (for PFSS and
-c ****** OPEN field options only).
-
-      logical :: do_not_balance_flux=.false.
-c
-c ****** Set format for output files.
-c
-      character(3) :: fmt='h5'
-c
-      logical :: hdf32=.true.
-c
-c ***** Validation run (tilted dipole).
-c
-      logical :: validation_run=.false.
-c
-      real(r_typ) :: dipole_angle=0.7853981633974483_r_typ
-c
-      end module
-c#######################################################################
-      module solve_params
-c
-c-----------------------------------------------------------------------
-c ****** Parameters used in the solver.
-c-----------------------------------------------------------------------
-c
-      use number_types
-c
-      implicit none
-c
-c ****** Boundary condition switch at r=R1.
-c
-      real(r_typ) :: pm_r1
-c
-      end module
-c#######################################################################
-      module timer
-c
-c-----------------------------------------------------------------------
-c ****** Timer stack.
-c-----------------------------------------------------------------------
-c
-      use number_types
-c
-      implicit none
-c
-      integer, parameter :: nstack=10
-      integer :: istack=0
-      real(r_typ), dimension(nstack) :: tstart=0.
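The stack above allows timing regions to nest: timer_on pushes a start time, and timer_off pops it, adding the elapsed interval to its argument (this pairing is visible around the PCG solve in the main program). A sketch of what such a pair can look like on top of these declarations, using MPI_Wtime; illustrative only, since the actual timer_on/timer_off bodies lie outside this excerpt:

      subroutine timer_on
        use number_types
        use timer
        use mpi
        implicit none
        ! Push: record the start time of a new (possibly nested) region.
        istack = istack + 1
        tstart(istack) = MPI_Wtime()
      end subroutine

      subroutine timer_off (t)
        use number_types
        use timer
        use mpi
        implicit none
        real(r_typ) :: t
        ! Pop: accumulate the elapsed interval into the caller's counter.
        t = t + (MPI_Wtime() - tstart(istack))
        istack = istack - 1
      end subroutine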
-c - end module -c####################################################################### - module timing -c -c----------------------------------------------------------------------- -c ****** Timing variables. -c----------------------------------------------------------------------- -c - use number_types -c - implicit none -c - real(r_typ) :: t_startup=0. - real(r_typ) :: t_solve=0. - real(r_typ) :: t_pc_load=0. - real(r_typ) :: t_pc=0. - real(r_typ) :: t_ax=0. - real(r_typ) :: t_io=0. - real(r_typ) :: c_seam=0. - real(r_typ) :: c_cgdot=0. - real(r_typ) :: c_sumphi=0. - real(r_typ) :: t_wall=0. -c - end module -c####################################################################### - module debug -c -c----------------------------------------------------------------------- -c ****** Debugging level. -c----------------------------------------------------------------------- -c - implicit none -c - integer :: idebug=0 -c - end module -c####################################################################### - module assemble_array_interface - interface - subroutine assemble_array (map_r,map_t,map_p,a,a_g) - use number_types - use decomposition - use mpidefs - implicit none - type(map_struct), dimension(0:nproc-1) :: map_r,map_t,map_p - real(r_typ), dimension(:,:,:) :: a,a_g - end subroutine - end interface - end module -c####################################################################### - module cusparse_interface -c - use, intrinsic :: iso_c_binding -c -#ifdef CUSPARSE - interface - subroutine load_lusol_cusparse(CSR_A,CSR_AI,CSR_AJ,N,M) - & BIND(C, name="load_lusol_cusparse") - use, intrinsic :: iso_c_binding - implicit none - integer(C_INT), value :: N,M - type(C_PTR), value :: CSR_A,CSR_AI,CSR_AJ - end subroutine load_lusol_cusparse -c - subroutine lusol_cusparse(x) - & BIND(C, name="lusol_cusparse") - use, intrinsic :: iso_c_binding - implicit none - type(C_PTR), value :: x - end subroutine lusol_cusparse -c - subroutine unload_lusol_cusparse() - & BIND(C, name="unload_lusol_cusparse") - end subroutine unload_lusol_cusparse - end interface -#endif -c - integer(c_int) :: cN,cM -c - end module -c####################################################################### - module matrix_storage_pot3d_solve -c -c----------------------------------------------------------------------- -c ****** Storage for the matrix/preconditioners of the solve. 
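The a_csr/a_csr_ia/a_csr_ja triplet below appears to be the standard compressed sparse row (CSR) layout expected by the cuSPARSE interface above: nonzero values stored row by row, the column index of each value, and a pointer to where each row starts. A small illustration with invented values:

      ! 3x3 matrix:   | 1 0 2 |
      !               | 0 3 0 |
      !               | 4 0 5 |
      ! CSR (1-based):
      !   a_csr    = (/ 1., 2., 3., 4., 5. /)   ! nonzeros, row by row
      !   a_csr_ja = (/ 1,  3,  2,  1,  3  /)   ! column index of each value
      !   a_csr_ia = (/ 1, 3, 4, 6 /)           ! row i spans ia(i)..ia(i+1)-1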
-c----------------------------------------------------------------------- -c - use number_types - use number_types_pc -c - implicit none -c - real(r_typ), dimension(:,:,:,:), allocatable :: a - real(r_typ_pc), dimension(:), allocatable :: a_i -c - integer, dimension(7) :: a_offsets - - integer :: N,M - real(r_typ_pc), dimension(:), allocatable, target :: a_csr - real(r_typ_pc), dimension(:), allocatable :: lu_csr - real(r_typ_pc), dimension(:), allocatable :: a_csr_d - integer, dimension(:), allocatable :: lu_csr_ja - integer, dimension(:), allocatable, target :: a_csr_ia - integer, dimension(:), allocatable, target :: a_csr_ja - integer, dimension(:), allocatable :: a_N1 - integer, dimension(:), allocatable :: a_N2 - integer, dimension(:), allocatable :: a_csr_dptr -c - end module -c####################################################################### - program POT3D -c -c----------------------------------------------------------------------- -c - use ident - use mpidefs - use vars - use solve_params - use timing -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: ierr -c -c----------------------------------------------------------------------- -c -c ****** Initialize MPI. -c - call init_mpi -c -c ****** Start the wall-clock timer. -c - call timer_on -c -c ****** Write the code name and version. -c - if (iamp0) then - write (*,*) - write (*,*) idcode,' Version ',vers,', updated on ',update - end if -c - call timer_on -c -c ****** Read the input file. -c - call read_input_file -c -c ****** Create the output file. -c - if (iamp0) then - call ffopen (9,outfile,'rw',ierr) - if (ierr.ne.0) then - write (*,*) - write (*,*) '### ERROR in POT3D:' - write (*,*) '### Could not create the output file.' - write (*,*) 'File name: ',trim(outfile) - end if - end if - call check_error_on_p0 (ierr) -c -c ****** Check the input parameters. -c - call check_input -c -c ****** Check the processor topology. -c - call check_proc_topology -c -c ****** Decompose the domain. -c - call decompose_domain -c -c ****** Allocate global arrays. -c - call allocate_global_arrays -c -c ****** Set the global meshes. -c - call set_global_mesh -c -c ****** Decompose the mesh. -c - call decompose_mesh_r - call decompose_mesh_tp -c -c ****** Allocate local arrays. -c - call allocate_local_arrays_r - call allocate_local_arrays_tp -c -c ****** Set the local meshes. -c - call set_local_mesh_r - call set_local_mesh_tp -c -c ****** Print decomposition diagnostics. -c - call decomp_diags -c -c ****** Initialize the flux and if validating, write analytic solution. -c - if (validation_run) then - if (iamp0) then - write (*,*) - write (*,*) '### COMMENT from POT3D:' - write (*,*) '### Validation run requested.' - write (*,*) '### Ignoring br input file,' - write (*,*) '### setting HDF32=.FALSE.,' - write (*,*) '### and overriding output filenames.' - write (9,*) - write (9,*) '### COMMENT from POT3D:' - write (9,*) '### Validation run requested.' - write (9,*) '### Ignoring br input file' - write (9,*) '### setting HDF32=.FALSE.,' - write (9,*) '### and overriding output filenames.' - end if - hdf32=.false. - brfile='br_solved.'//trim(fmt) - btfile='bt_solved.'//trim(fmt) - bpfile='bp_solved.'//trim(fmt) - phifile='phi_solved.'//trim(fmt) - call set_validation_flux - call write_validation_solution - else - call set_flux - end if -c - call timer_off (t_startup) -c -c ****** Find the solution. 
-c - if (iamp0) then - write (*,*) - write (*,*) '### COMMENT from POT3D:' - write (*,*) '### Starting PCG solve.' - call FLUSH(OUTPUT_UNIT) - write (9,*) - write (9,*) '### COMMENT from POT3D:' - write (9,*) '### Starting PCG solve.' - call FLUSH(9) - end if -c - call timer_on -c - call potfld -c - call timer_off (t_solve) -c -c ****** Compute B. -c - call getb -c -c ****** Write solution to file. -c - call write_solution -c -c ****** Magnetic energy diagnostics. -c - call magnetic_energy -c - call MPI_Barrier(MPI_COMM_WORLD,ierr) - call timer_off (t_wall) -c - call write_timing -c - call endrun (.false.) -c - end -c####################################################################### - subroutine read_input_file -c -c----------------------------------------------------------------------- -c -c ****** Read the input file. -c -c----------------------------------------------------------------------- -c - use global_dims - use global_mesh - use mpidefs - use meshdef - use cgcom - use debug - use vars - use decomposition_params -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c -c ****** Values for the global mesh size. -c ****** Since these names conflict with those in LOCAL_DIMS*, it is -c ****** important not to use these modules here. -c - integer :: nr=0 - integer :: nt=0 - integer :: np=0 -c -c----------------------------------------------------------------------- -c - namelist /topology/ nprocs,nr,nt,np,gpn -c -c----------------------------------------------------------------------- -c - namelist /inputvars/ r0,r1,fmt, - & drratio,dtratio,dpratio, - & rfrac,tfrac,pfrac, - & nfrmesh,nftmesh,nfpmesh, - & phishift, - & ifprec,ncgmax,ncghist,epscg, - & idebug,br0file,phifile, - & brfile,btfile,bpfile,br_photo_file, - & br_photo_original_file, - & option, - & do_not_balance_flux,hdf32, - & validation_run,dipole_angle -c -c----------------------------------------------------------------------- -c - integer :: ierr - character(80) :: infile='pot3d.dat' -c -c----------------------------------------------------------------------- -c -c ****** Read the input file. -c - call ffopen (8,infile,'r',ierr) -c - if (ierr.ne.0) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in READ_INPUT_FILE:' - write (*,*) '### Could not open the input file.' - write (*,*) 'File name: ',trim(infile) - end if - call endrun (.true.) - end if -c - read (8,topology) -c - read (8,inputvars) -c - close (8) -c - if (trim(fmt).ne.'h5') then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in READ_INPUT_FILE:' - write (*,*) '### "fmt" must be "h5".' - write (*,*) 'fmt: ',trim(fmt) - end if - call endrun (.true.) - end if -c -c ****** Check if output names were overwritten. -c ****** If not, set default names with format fmt. 
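For example, with fmt='h5' and phifile left at 'default', the code below sets phifile to 'phi.h5'.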
-c - if (trim(phifile).eq.'default') then - phifile='phi.'//trim(fmt) - end if - if (trim(br0file).eq.'default') then - br0file='br0.'//trim(fmt) - end if - if (trim(brfile).eq.'default') then - brfile='br.'//trim(fmt) - end if - if (trim(btfile).eq.'default') then - btfile='bt.'//trim(fmt) - end if - if (trim(bpfile).eq.'default') then - bpfile='bp.'//trim(fmt) - end if - if (trim(br_photo_file).eq.'default') then - br_photo_file='br_photo.'//trim(fmt) - end if - if (trim(br_photo_original_file).eq.'default') then - br_photo_original_file='br_photo_original.'//trim(fmt) - end if -c - nr_g=nr - nt_g=nt - np_g=np -c -c ****** Check if the specified mesh dimensions are valid. -c - call check_mesh_dimensions (nr_g,nt_g,np_g) -c - nrm1_g=nr_g-1 - ntm1_g=nt_g-1 - npm1_g=np_g-1 -c - end subroutine -c####################################################################### - subroutine check_error_on_p0 (ierr0) -c -c----------------------------------------------------------------------- -c -c ****** Check if the error flag IERR0 on processor 0 in -c ****** MPI_COMM_WORLD (i.e., processor IPROC0 in COMM_ALL) -c ****** indicates that the code should exit. -c -c ****** If IERR0 is non-zero, all the processors are directed -c ****** to call ENDRUN to terminate the code. -c -c----------------------------------------------------------------------- -c - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: ierr0 -c -c----------------------------------------------------------------------- -c -c ****** MPI error return. -c - integer :: ierr -c -c----------------------------------------------------------------------- -c -c ****** Broadcast IERR0 to all processors. -c - call MPI_Bcast (ierr0,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr) -c -c ****** Call ENDRUN if IERR0 is non-zero. -c - if (ierr0.ne.0) then - call endrun (.true.) - end if -c - return - end -c####################################################################### - subroutine endrun (ifstop) -c -c----------------------------------------------------------------------- -c -c ****** End the run and exit the code. -c -c----------------------------------------------------------------------- -c - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - logical :: ifstop -c -c----------------------------------------------------------------------- -c -c ****** MPI error return. -c - integer :: ierr -c -c----------------------------------------------------------------------- -c -c ****** Close the output file. -c - if (iamp0) then - close (9) - end if -c -c ****** Exit MPI gracefully. -c - call MPI_Finalize (ierr) -c -c ****** Call the STOP statement if requested. -c - if (ifstop) then - stop - end if -c - return - end -c####################################################################### - subroutine init_mpi -c -c----------------------------------------------------------------------- -c -c ****** Initialize MPI. -c -c----------------------------------------------------------------------- -c - use number_types - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c -c ****** MPI error return. 
-c
-      integer :: ierr,tcheck
-c
-c-----------------------------------------------------------------------
-c
-c ****** Real number to determine the KIND of REALs.
-c
-      real(r_typ) :: def_real
-c
-c-----------------------------------------------------------------------
-c
-      call MPI_Init_thread (MPI_THREAD_FUNNELED,tcheck,ierr)
-c
-c ****** Get the total number of processors.
-c
-      call MPI_Comm_size (MPI_COMM_WORLD,nproc,ierr)
-c
-c ****** Get the index (rank) of the local processor in
-c ****** communicator MPI_COMM_WORLD in variable IPROCW.
-c
-      call MPI_Comm_rank (MPI_COMM_WORLD,iprocw,ierr)
-c
-c ****** Create a shared communicator for all ranks in the node.
-c
-      call MPI_Comm_split_type (MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,0,
-     &                          MPI_INFO_NULL,comm_shared,ierr)
-c
-c ****** Get the total number of processors in node.
-c
-      call MPI_Comm_size (comm_shared,nprocsh,ierr)
-c
-c ****** Get the index (rank) of the local processor in the local node.
-c
-      call MPI_Comm_rank (comm_shared,iprocsh,ierr)
-c
-c ****** Set the flag to designate whether this processor
-c ****** has rank 0 in communicator MPI_COMM_WORLD.
-c
-      if (iprocw.eq.0) then
-        iamp0=.true.
-      else
-        iamp0=.false.
-      end if
-c
-c ****** Set the number type for communicating REAL
-c ****** numbers in MPI calls.
-c
-      if (kind(def_real).eq.KIND_REAL_4) then
-        ntype_real=MPI_REAL4
-      else if (kind(def_real).eq.KIND_REAL_8) then
-        ntype_real=MPI_REAL8
-      else
-        if (iamp0) then
-          write (*,*)
-          write (*,*) '### ERROR in INIT_MPI:'
-          write (*,*) '### Unrecognized default REAL number KIND:'
-          write (*,*) 'KIND(default_real) = ',kind(def_real)
-          write (*,*) 'This is a fatal error.'
-        end if
-        call endrun (.true.)
-      end if
-c
-c ****** Set the GPU device number based on local rank.
-c ****** NOTE! This assumes that #GPUs per node = #MPI ranks per node.
-c
-!$ call omp_set_default_device (iprocsh)
-!$acc set device_num(iprocsh)
-c
-      end subroutine
-c#######################################################################
-      subroutine check_input
-c
-c-----------------------------------------------------------------------
-c
-c ****** Check the validity of the input parameters.
-c
-c-----------------------------------------------------------------------
-c
-      use number_types
-      use vars
-      use solve_params
-      use mpidefs
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      real(r_typ), parameter :: one=1._r_typ
-c
-c-----------------------------------------------------------------------
-c
-c ****** Check that OPTION is valid and set the boundary condition
-c ****** switches accordingly.
-c
-      if (option.eq.'potential') then
-c
-c ****** For a potential field, set d(phi)/dr to zero at r=R1
-c ****** (i.e., the field is tangential to the boundary).
-c
-        pm_r1=one
-c
-      else if (option.eq.'open') then
-c
-c ****** For an open field, set phi to zero at r=R1
-c ****** (i.e., the field is radial there).
-c
-        pm_r1=-one
-c
-      else if (option.eq.'ss') then
-c
-c ****** For a source surface field, set phi to zero at r=R1
-c ****** (i.e., the field is radial there).
-c
-        pm_r1=-one
-c
-      else
-        if (iamp0) then
-          write (*,*)
-          write (*,*) '### ERROR in CHECK_INPUT:'
-          write (*,*) '### Invalid OPTION:'
-          write (*,*)
-          write (*,*) 'OPTION = ',trim(option)
-          write (*,*)
-          write (*,*) 'The options allowed are:'
-          write (*,*) '''potential'''
-          write (*,*) '''open'''
-          write (*,*) '''ss'''
-        end if
-        call endrun (.true.)
- end if -c - if (iamp0) then - write (*,*) - write (*,*) '### COMMENT from CHECK_INPUT:' - write (*,*) '### Field solve type:' - write (*,*) - write (*,*) 'OPTION = ',option - write (9,*) - write (9,*) '### COMMENT from CHECK_INPUT:' - write (9,*) '### Field solve type:' - write (9,*) - write (9,*) 'OPTION = ',option - end if -c - return - end -c####################################################################### - subroutine set_proc_topology -c -c----------------------------------------------------------------------- -c -c ****** Set the optimal values of the MPI rank topology -c ****** in dimensions not set by user. -c -c----------------------------------------------------------------------- -c - use mpidefs - use decomposition_params - use number_types - use global_dims -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: one=1.0_r_typ - real(r_typ), parameter :: zero=0.0_r_typ - real(r_typ), parameter :: bigval=HUGE(1.0_r_typ) -c -c----------------------------------------------------------------------- -c - integer, dimension(:), allocatable :: factors - integer, dimension(:,:), allocatable :: rank_factors - real(r_typ), dimension(:,:), allocatable :: nperrank - real(r_typ), dimension(:), allocatable :: penalty -c - integer :: i,j,k,num_fac,num_rank_fac,best_idx - integer :: fr=1,ft=1,fp=1 - real(r_typ) :: a12,a13,a23 -c -c----------------------------------------------------------------------- -c -c ****** Extract nproc values. A value of -1 indicates the dimension -c ****** should be autoset. -c - nproc_r=nprocs(1) - nproc_t=nprocs(2) - nproc_p=nprocs(3) -c -c ****** If no dimensions are to be autoset, return. -c - if(nproc_r.ne.-1.and.nproc_t.ne.-1.and.nproc_p.ne.-1) return -c -c ****** Get all factors of nproc and store them in factors array. -c - i=1 - num_fac=0 - do while(i.le.nproc) - if (MOD(nproc,i).eq.0) then - num_fac=num_fac+1 - endif - i=i+1 - enddo - allocate (factors(num_fac)) - i=1 - num_fac=0 - do while(i.le.nproc) - if (MOD(nproc,i).eq.0) then - num_fac=num_fac+1 - factors(num_fac)=i - endif - i=i+1 - enddo -c -c ****** Set penalty function parameters and any fixed dimensions -c ****** based on which dimensions are to be autoset. -c - a12=one - a13=one - a23=one -c - if (nproc_r.ne.-1) then - fr=nproc_r - a12=zero - a13=zero - end if - if (nproc_t.ne.-1) then - ft=nproc_t - a12=zero - a23=zero - end if - if (nproc_p.ne.-1) then - fp=nproc_p - a13=zero - a23=zero - end if -c -c ****** Loop over all combinations of factors and save those that -c ****** yield the correct number of MPI ranks into rank_factors array. -c - num_rank_fac=0 - do k=1,num_fac - do j=1,num_fac - do i=1,num_fac - if(nproc_r.eq.-1) fr=factors(i) - if(nproc_t.eq.-1) ft=factors(j) - if(nproc_p.eq.-1) fp=factors(k) - if (fr*ft*fp.eq.nproc) then - num_rank_fac=num_rank_fac+1 - end if - enddo - enddo - enddo -c - if (num_rank_fac.eq.0) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in SET_PROC_TOPOLOGY:' - write (*,*) '### Processor topology specification error.' - write (*,*) 'No valid topologies found for selected options.' - write (*,*) 'Number of MPI ranks = ',nproc - write (*,*) 'NPROC_R = ',nproc_r - write (*,*) 'NPROC_T = ',nproc_t - write (*,*) 'NPROC_P = ',nproc_p - end if - call endrun (.true.) 
- end if -c - allocate(rank_factors(num_rank_fac,3)) - allocate(nperrank(num_rank_fac,3)) - allocate(penalty(num_rank_fac)) -c - rank_factors(:,:)=-1 - penalty(:)=bigval -c - num_rank_fac=0 - do k=1,num_fac - do j=1,num_fac - do i=1,num_fac - if(nproc_r.eq.-1) fr=factors(i) - if(nproc_t.eq.-1) ft=factors(j) - if(nproc_p.eq.-1) fp=factors(k) - if (fr*ft*fp.eq.nproc) then - num_rank_fac=num_rank_fac+1 - rank_factors(num_rank_fac,1)=fr - rank_factors(num_rank_fac,2)=ft - rank_factors(num_rank_fac,3)=fp - end if - enddo - enddo - enddo -c -c ****** Get number of grid points per rank for each dimension. -c - nperrank(:,1)=real(nr_g)/rank_factors(:,1) - nperrank(:,2)=real(nt_g)/rank_factors(:,2) - nperrank(:,3)=real(np_g)/rank_factors(:,3) -c -c ****** Compute penalty function. -c - penalty(:)=a12*(nperrank(:,1)-nperrank(:,2))**2 - & +a23*(nperrank(:,2)-nperrank(:,3))**2 - & +a13*(nperrank(:,3)-nperrank(:,1))**2 -c -c ****** Eliminate any choices that yield less than a minimum number -c ****** of grid points per rank. -c - do i=1,num_rank_fac - if (nperrank(i,1).lt.4) penalty(i)=bigval - if (nperrank(i,2).lt.4) penalty(i)=bigval - if (nperrank(i,3).lt.3) penalty(i)=bigval - enddo -c -c ****** Find optimal topology. -c - best_idx=MINLOC(penalty,1) -c - if (penalty(best_idx).eq.bigval) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in SET_PROC_TOPOLOGY:' - write (*,*) '### Processor topology specification error.' - write (*,*) 'No valid topologies found for selected options' - write (*,*) 'with selected grid. ' - write (*,*) 'It is likely you are using too many MPI ranks.' - write (*,*) 'Number of MPI ranks = ',nproc - write (*,*) 'NPROC_R = ',nproc_r - write (*,*) 'NPROC_T = ',nproc_t - write (*,*) 'NPROC_P = ',nproc_p - write (*,*) 'NR = ',nr_g - write (*,*) 'NT = ',nt_g - write (*,*) 'NP = ',np_g - end if - call endrun (.true.) - end if -c -c ****** Set optimal topology. -c - nprocs(1)=rank_factors(best_idx,1) - nprocs(2)=rank_factors(best_idx,2) - nprocs(3)=rank_factors(best_idx,3) -c - deallocate(factors) - deallocate(rank_factors) - deallocate(nperrank) - deallocate(penalty) -c - end subroutine -c####################################################################### - subroutine check_proc_topology -c -c----------------------------------------------------------------------- -c -c ****** Check the validity of the requested processor topology. -c -c----------------------------------------------------------------------- -c - use mpidefs - use decomposition_params -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: i,nreq -c -c----------------------------------------------------------------------- -c -c ****** Check the processor topology. -c - do i=1,3 - if (nprocs(i).lt.1.and.nprocs(i).ne.-1) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in CHECK_PROC_TOPOLOGY:' - write (*,*) '### Processor topology specification error.' - write (*,*) 'Invalid number of processors requested.' - write (*,*) 'Dimension = ',i - write (*,*) 'Number of processors requested = ', - & nprocs(i) - end if - call endrun (.true.) - end if - enddo -c -c ****** Set the optimal values of the topology if requested. -c - call set_proc_topology -c -c ****** Check that the number of processors available -c ****** matches the number requested. 
-c
-      nreq=nprocs(1)*nprocs(2)*nprocs(3)
-c
-      if (nreq.ne.nproc) then
-        if (iamp0) then
-          write (*,*)
-          write (*,*) '### ERROR in CHECK_PROC_TOPOLOGY:'
-          write (*,*) '### Processor topology specification error.'
-          write (*,*) 'The number of processors requested does not'//
-     &                ' equal the number available.'
-          write (*,*) 'Number of processors requested = ',nreq
-          write (*,*) 'Number of processors available = ',nproc
-        end if
-        call endrun (.true.)
-      end if
-c
-      return
-      end
-c#######################################################################
-      subroutine decompose_domain
-c
-c-----------------------------------------------------------------------
-c
-c ****** Decompose the domain into a Cartesian MPI topology.
-c
-c-----------------------------------------------------------------------
-c
-      use mpidefs
-      use decomposition_params
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      integer :: ierr
-c
-c-----------------------------------------------------------------------
-c
-      integer, parameter :: ndim=3
-      integer, dimension(ndim) :: coords
-      logical, dimension(ndim) :: periodic
-      logical :: reorder
-      logical, dimension(ndim) :: keep_dim
-c
-c-----------------------------------------------------------------------
-c
-c ****** Create a communicator over all processors, COMM_ALL,
-c ****** that has a Cartesian topology.
-c
-c ****** Specify the periodicity of the coordinate system.
-c
-      periodic(1)=.false.
-      periodic(2)=.false.
-      periodic(3)=.true.
-c
-c ****** Allow re-ordering in the Cartesian topology.
-c
-      reorder=.true.
-c
-      call MPI_Cart_create (MPI_COMM_WORLD,ndim,nprocs,
-     &                      periodic,reorder,comm_all,ierr)
-c
-c ****** Get the index (rank) of the local processor in
-c ****** communicator COMM_ALL in variable IPROC.
-c
-c ****** IMPORTANT NOTE:
-c ****** If re-ordering was allowed in the Cartesian topology
-c ****** creation (above), then the rank of the local processor
-c ****** in communicator COMM_ALL may be different from its rank
-c ****** in communicator MPI_COMM_WORLD.
-c
-      call MPI_Comm_rank (comm_all,iproc,ierr)
-c
-c ****** Set the processor rank IPROC0 in communicator COMM_ALL
-c ****** for the processor that has rank 0 in MPI_COMM_WORLD.
-c ****** This value is broadcast to all the processors.
-c
-      if (iamp0) then
-        iproc0=iproc
-      end if
-      call MPI_Bcast (iproc0,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)
-c
-c ****** Get the coordinate indices of this processor in the
-c ****** Cartesian MPI topology.
-c
-      call MPI_Cart_coords (comm_all,iproc,ndim,coords,ierr)
-c
-      iproc_r=coords(1)
-      iproc_t=coords(2)
-      iproc_p=coords(3)
-c
-      nproc_r=nprocs(1)
-      nproc_t=nprocs(2)
-      nproc_p=nprocs(3)
-c
-c ****** Get the rank of the neighboring processors in the
-c ****** Cartesian MPI topology.
-c
-      call MPI_Cart_shift (comm_all,0,1,iproc_rm,iproc_rp,ierr)
-      call MPI_Cart_shift (comm_all,1,1,iproc_tm,iproc_tp,ierr)
-      call MPI_Cart_shift (comm_all,2,1,iproc_pm,iproc_pp,ierr)
-c
-c ****** Create communicators for operations involving all
-c ****** processors in the phi dimension. These communicators
-c ****** are stored in COMM_PHI (and generally represent different
-c ****** communicators on different processors).
-c
-      keep_dim(1)=.false.
-      keep_dim(2)=.false.
-      keep_dim(3)=.true.
-c
-      call MPI_Cart_sub (comm_all,keep_dim,comm_phi,ierr)
-c
-c ****** Create communicators for operations involving
-c ****** all processors in the r dimension.
-c ****** These communicators are stored in COMM_R -c ****** (and generally represent different communicators on -c ****** different processors). -c - keep_dim(1)=.true. - keep_dim(2)=.false. - keep_dim(3)=.false. -c - call MPI_Cart_sub (comm_all,keep_dim,comm_r,ierr) -c - return - end -c####################################################################### - subroutine decompose_mesh_r -c -c----------------------------------------------------------------------- -c -c ****** Decompose the r mesh between processors. -c -c----------------------------------------------------------------------- -c - use global_dims - use local_dims_r - use decomposition - use solve_params - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: ierr,i,npts - integer :: i0_h,i1_h,i0_m,i1_m - integer, dimension(nproc_r) :: mp_r -c -c----------------------------------------------------------------------- -c -c ****** Decompose the r dimension. -c - npts=nr_g -c - call decompose_dimension (npts,nproc_r,mp_r,ierr) - if (ierr.ne.0) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in DECOMPOSE_MESH:' - write (*,*) '### Anomaly in decomposing the mesh'// - & ' between processors.' - write (*,*) '### Could not decompose the r mesh.' - write (*,*) 'Number of mesh points in r = ',npts - write (*,*) 'Number of processors along r = ',nproc_r - end if - call endrun (.true.) - end if -c -c ****** Check that the resulting mesh topology is valid. -c - call check_mesh_topology (nproc_r,mp_r,1,'r') -c -c ****** Compute the mapping between the processor decomposition -c ****** and the global mesh. -c -c ****** Note that there is a two-point overlap in the mesh -c ****** between adjacent processors in r. -c - i0_g=1 - do i=1,iproc_r - i0_g=i0_g+mp_r(i) - enddo - nr=mp_r(iproc_r+1)+2 - i1_g=i0_g+nr-1 -c - nrm1=nr-1 -c -c ****** Set the flags to indicate whether this processor has -c ****** points on the physical boundaries. -c - if (iproc_r.eq.0) then - rb0=.true. - else - rb0=.false. - end if -c - if (iproc_r.eq.nproc_r-1) then - rb1=.true. - else - rb1=.false. - end if -c -c ****** Set the dimensions of arrays for the "main" meshes -c ****** (i.e., the "m" mesh) for which normal derivatives are -c ****** needed (e.g., v). These vary on different processors, -c ****** depending if they are left-boundary, internal, or -c ****** right-boundary processors. -c - if (rb1) then - nrm=nrm1 - else - nrm=nr - end if -c -c ****** Store the mapping structure (for this processor). -c - allocate (map_rh(0:nproc-1)) - allocate (map_rm(0:nproc-1)) -c - if (rb0) then - i0_h=1 - else - i0_h=2 - end if - if (rb1) then - i1_h=nr - else - i1_h=nrm1 - end if -c - if (rb0) then - i0_m=1 - else - i0_m=2 - end if - i1_m=nrm1 -c - map_rh(iproc)%i0=i0_h - map_rh(iproc)%i1=i1_h -c - map_rm(iproc)%i0=i0_m - map_rm(iproc)%i1=i1_m -c - map_rh(iproc)%offset=i0_g+map_rh(iproc)%i0-1 - map_rh(iproc)%n=map_rh(iproc)%i1-map_rh(iproc)%i0+1 -c - map_rm(iproc)%offset=i0_g+map_rm(iproc)%i0-1 - map_rm(iproc)%n=map_rm(iproc)%i1-map_rm(iproc)%i0+1 -c -c ****** Gather the mapping information by communicating among -c ****** all processors. 
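To make the two-point overlap concrete (illustrative numbers only): with nr_g = 10 and nproc_r = 2, decompose_dimension returns mp_r = (/4,4/), so rank 0 gets i0_g = 1, nr = 6, i1_g = 6 and rank 1 gets i0_g = 5, nr = 6, i1_g = 10; global points 5 and 6 are held by both ranks as the shared guard points.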
-c - call gather_mapping_info (map_rh) - call gather_mapping_info (map_rm) -c - return - end -c####################################################################### - subroutine decompose_mesh_tp -c -c----------------------------------------------------------------------- -c -c ****** Decompose the theta and phi mesh between processors. -c -c----------------------------------------------------------------------- -c - use global_dims - use local_dims_tp - use decomposition - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: ierr,j,k - integer :: j0_h,j1_h,j0_m,j1_m - integer :: k0_h,k1_h,k0_m,k1_m - integer, dimension(nproc_t) :: mp_t - integer, dimension(nproc_p) :: mp_p -c -c----------------------------------------------------------------------- -c -c ****** Decompose the t dimension. -c - call decompose_dimension (nt_g,nproc_t,mp_t,ierr) - if (ierr.ne.0) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in DECOMPOSE_MESH_TP:' - write (*,*) '### Anomaly in decomposing the mesh'// - & ' between processors.' - write (*,*) '### Could not decompose the theta mesh.' - write (*,*) 'Number of mesh points in theta = ',nt_g - write (*,*) 'Number of processors along theta = ',nproc_t - end if - call endrun (.true.) - end if -c -c ****** Decompose the p dimension. -c - call decompose_dimension (np_g,nproc_p,mp_p,ierr) - if (ierr.ne.0) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in DECOMPOSE_MESH_TP:' - write (*,*) '### Anomaly in decomposing the mesh'// - & ' between processors.' - write (*,*) '### Could not decompose the phi mesh.' - write (*,*) 'Number of mesh points in phi = ',np_g - write (*,*) 'Number of processors along phi = ',nproc_p - end if - call endrun (.true.) - end if -c -c ****** Check that the resulting mesh topology is valid. -c - call check_mesh_topology (nproc_t,mp_t,1,'theta') - call check_mesh_topology (nproc_p,mp_p,1,'phi') -c -c ****** Set the flag for an axisymmetric run (requested by -c ****** setting NP_G = 3). -c - if (np_g.eq.3) then - axisymmetric=.true. - else - axisymmetric=.false. - end if -c -c ****** Compute the mapping between the processor decomposition -c ****** and the global mesh. -c -c ****** Note that there is a two-point overlap in the mesh -c ****** between adjacent processors in theta and phi. -c - j0_g=1 - do j=1,iproc_t - j0_g=j0_g+mp_t(j) - enddo - nt=mp_t(iproc_t+1)+2 - j1_g=j0_g+nt-1 -c - k0_g=1 - do k=1,iproc_p - k0_g=k0_g+mp_p(k) - enddo - np=mp_p(iproc_p+1)+2 - k1_g=k0_g+np-1 -c - ntm1=nt-1 - npm1=np-1 -c -c ****** Set the flags to indicate whether this processor has -c ****** points on the physical boundaries. -c - if (iproc_t.eq.0) then - tb0=.true. - else - tb0=.false. - end if -c - if (iproc_t.eq.nproc_t-1) then - tb1=.true. - else - tb1=.false. - end if -c -c ****** Set the dimensions of arrays for the "main" meshes -c ****** (i.e., the "m" mesh) for which normal derivatives are -c ****** needed (e.g., v). These vary on different processors, -c ****** depending if they are left-boundary, internal, or -c ****** right-boundary processors. -c - if (tb1) then - ntm=ntm1 - else - ntm=nt - end if -c -c ****** Since the phi dimension is periodic, all processors -c ****** have the same mesh limits. -c - npm=np -c -c ****** Store the mapping structure (for this processor). 
-c - allocate (map_th(0:nproc-1)) - allocate (map_tm(0:nproc-1)) - allocate (map_ph(0:nproc-1)) - allocate (map_pm(0:nproc-1)) -c - if (tb0) then - j0_h=1 - else - j0_h=2 - end if - if (tb1) then - j1_h=nt - else - j1_h=ntm1 - end if -c - if (tb0) then - j0_m=1 - else - j0_m=2 - end if - j1_m=ntm1 -c - if (iproc_p.eq.0) then - k0_m=1 - else - k0_m=2 - end if - k1_m=npm1 -c - if (iproc_p.eq.0) then - k0_h=1 - else - k0_h=2 - end if - if (iproc_p.eq.nproc_p-1) then - k1_h=np - else - k1_h=npm1 - end if -c - map_th(iproc)%i0=j0_h - map_th(iproc)%i1=j1_h -c - map_tm(iproc)%i0=j0_m - map_tm(iproc)%i1=j1_m -c - map_ph(iproc)%i0=k0_h - map_ph(iproc)%i1=k1_h -c - map_pm(iproc)%i0=k0_m - map_pm(iproc)%i1=k1_m -c - map_th(iproc)%offset=j0_g+map_th(iproc)%i0-1 - map_th(iproc)%n=map_th(iproc)%i1-map_th(iproc)%i0+1 -c - map_tm(iproc)%offset=j0_g+map_tm(iproc)%i0-1 - map_tm(iproc)%n=map_tm(iproc)%i1-map_tm(iproc)%i0+1 -c - map_ph(iproc)%offset=k0_g+map_ph(iproc)%i0-1 - map_ph(iproc)%n=map_ph(iproc)%i1-map_ph(iproc)%i0+1 -c - map_pm(iproc)%offset=k0_g+map_pm(iproc)%i0-1 - map_pm(iproc)%n=map_pm(iproc)%i1-map_pm(iproc)%i0+1 -c -c ****** Gather the mapping information by communicating among -c ****** all processors. -c - call gather_mapping_info (map_th) - call gather_mapping_info (map_tm) - call gather_mapping_info (map_ph) - call gather_mapping_info (map_pm) -c - return - end -c####################################################################### - subroutine check_mesh_dimensions (nr_g,nt_g,np_g) -c -c----------------------------------------------------------------------- -c -c ****** Check that the requested (global) mesh dimensions are -c ****** valid. -c -c----------------------------------------------------------------------- -c - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: nr_g,nt_g,np_g -c -c----------------------------------------------------------------------- -c - if (nr_g.lt.4) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in CHECK_MESH_DIMENSIONS:' - write (*,*) '### Invalid number of r mesh points'// - & ' requested.' - write (*,*) '### The minimum number of mesh points is 4.' - write (*,*) - write (*,*) 'Number of mesh points requested = ',nr_g - end if - call endrun (.true.) - end if -c - if (nt_g.lt.4) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in CHECK_MESH_DIMENSIONS:' - write (*,*) '### Invalid number of theta mesh points'// - & ' requested.' - write (*,*) '### The minimum number of mesh points is 4.' - write (*,*) - write (*,*) 'Number of mesh points requested = ',nt_g - end if - call endrun (.true.) - end if -c - if (np_g.lt.3) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in CHECK_MESH_DIMENSIONS:' - write (*,*) '### Invalid number of phi mesh points'// - & ' requested.' - write (*,*) '### The minimum number of mesh points is 3.' - write (*,*) - write (*,*) 'Number of mesh points requested = ',np_g - end if - call endrun (.true.) - end if -c - return - end -c####################################################################### - subroutine decompose_dimension (nx,np,mp,ierr) -c -c----------------------------------------------------------------------- -c -c ****** Decompose the mesh points NX along NP processors. -c -c ****** The decomposed mesh points are returned in array MP. 
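For example (illustrative numbers): NX = 13 and NP = 4 give NXM2 = 11, MPAV = 11/4 = 2, and NREM = 11 - 2*4 = 3, so the first three processors each receive one extra point and MP = (/3,3,3,2/).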
-c -c----------------------------------------------------------------------- -c -c ****** This routine attempts to assign the mesh points as equally -c ****** as possible between the processors. -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: nx - integer :: np - integer, dimension(np) :: mp - integer :: ierr -c -c----------------------------------------------------------------------- -c - integer :: nxm2,mpav,nrem -c -c----------------------------------------------------------------------- -c - ierr=0 -c - nxm2=nx-2 -c - if (nxm2.le.0) then - ierr=1 - return - end if -c - if (np.le.0) then - ierr=2 - return - end if -c - mpav=nxm2/np -c - mp(:)=mpav -c - nrem=nxm2-mpav*np -c - mp(1:nrem)=mp(1:nrem)+1 -c - return - end -c####################################################################### - subroutine check_mesh_topology (np,mp,min_pts,coord) -c -c----------------------------------------------------------------------- -c -c ****** Check the validity of the requested mesh topology. -c -c----------------------------------------------------------------------- -c - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: np - integer, dimension(np) :: mp - integer :: min_pts - character(*) :: coord -c -c----------------------------------------------------------------------- -c - integer :: i -c -c----------------------------------------------------------------------- -c -c ****** Check that the number of mesh points on each processor -c ****** is valid. -c - do i=1,np - if (mp(i).lt.min_pts) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in CHECK_MESH_TOPOLOGY:' - write (*,*) '### Mesh topology specification error.' - write (*,*) 'Invalid number of ',coord, - & ' mesh points requested.' - write (*,*) 'Processor index = ',i - write (*,*) 'Number of mesh points requested = ',mp(i) - write (*,*) 'Minimum number of mesh points allowed = ', - & min_pts - end if - call endrun (.true.) - end if - enddo -c - return - end -c####################################################################### - subroutine gather_mapping_info (map) -c -c----------------------------------------------------------------------- -c -c ****** Gather a mapping information array by communicating -c ****** among all processors. -c -c----------------------------------------------------------------------- -c - use mpidefs - use decomposition -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - type(map_struct), dimension(0:nproc-1) :: map -c -c----------------------------------------------------------------------- -c -c ****** Buffers for packing the data. -c - integer, parameter :: lbuf=4 - integer, dimension(lbuf) :: sbuf - integer, dimension(lbuf,0:nproc-1) :: rbuf -c -c----------------------------------------------------------------------- -c - integer :: ierr,irank -c -c----------------------------------------------------------------------- -c -c ****** Put the local section of the mapping information -c ****** array into a buffer. -c - sbuf(1)=map(iproc)%n - sbuf(2)=map(iproc)%i0 - sbuf(3)=map(iproc)%i1 - sbuf(4)=map(iproc)%offset -c -c ****** Communicate among all processors. 
After this call, all -c ****** processors have the complete mapping information. -c - call MPI_Allgather (sbuf,lbuf,MPI_INTEGER, - & rbuf,lbuf,MPI_INTEGER,comm_all,ierr) -c -c ****** Extract the mapping information from the buffer. -c - do irank=0,nproc-1 - map(irank)%n =rbuf(1,irank) - map(irank)%i0 =rbuf(2,irank) - map(irank)%i1 =rbuf(3,irank) - map(irank)%offset=rbuf(4,irank) - enddo -c - return - end -c####################################################################### - subroutine decomp_diags -c -c----------------------------------------------------------------------- -c -c ****** Print diagnostics about the mesh decomposition. -c -c----------------------------------------------------------------------- -c - use global_dims - use global_mesh - use local_dims_r - use local_mesh_r - use local_dims_tp - use local_mesh_tp - use mpidefs - use solve_params - use debug - use decomposition -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: ierr - integer :: irank - real(r_typ) :: n_per_grid_min,n_per_grid_max -c -c----------------------------------------------------------------------- -c - if (iamp0) then -c - n_per_grid_min=floor(real(nr_g)/nproc_r) - & *floor(real(nt_g)/nproc_t) - & *floor(real(np_g)/nproc_p) -c - n_per_grid_max=ceiling(real(nr_g)/nproc_r) - & *ceiling(real(nt_g)/nproc_t) - & *ceiling(real(np_g)/nproc_p) -c - write (*,*) - write (*,*) 'Total number of MPI ranks = ',nproc - write (*,*) - write (*,*) 'Number of MPI ranks in r = ',nproc_r - write (*,*) 'Number of MPI ranks in t = ',nproc_t - write (*,*) 'Number of MPI ranks in p = ',nproc_p - write (*,*) - write (*,*) 'Global mesh dimension in r = ',nr_g - write (*,*) 'Global mesh dimension in t = ',nt_g - write (*,*) 'Global mesh dimension in p = ',np_g - write (*,*) - write (*,'(A,F6.1)') ' Average # of r mesh pts per rank = ', - & real(nr_g)/nproc_r - write (*,'(A,F6.1)') ' Average # of t mesh pts per rank = ', - & real(nt_g)/nproc_t - write (*,'(A,F6.1)') ' Average # of p mesh pts per rank = ', - & real(np_g)/nproc_p - write (*,*) - write (*,'(A,F6.2,A)') ' Estimated load imbalance = ', - & 100.0*(1.0-real(n_per_grid_min)/real(n_per_grid_max)),' %' -c - write (9,*) - write (9,*) 'Total number of MPI ranks = ',nproc - write (9,*) - write (9,*) 'Number of MPI ranks in r = ',nproc_r - write (9,*) 'Number of MPI ranks in t = ',nproc_t - write (9,*) 'Number of MPI ranks in p = ',nproc_p - write (9,*) - write (9,*) 'Global mesh dimension in r = ',nr_g - write (9,*) 'Global mesh dimension in t = ',nt_g - write (9,*) 'Global mesh dimension in p = ',np_g - write (9,*) - write (9,'(A,F6.1)') ' Average # of r mesh pts per rank = ', - & real(nr_g)/nproc_r - write (9,'(A,F6.1)') ' Average # of t mesh pts per rank = ', - & real(nt_g)/nproc_t - write (9,'(A,F6.1)') ' Average # of p mesh pts per rank = ', - & real(np_g)/nproc_p - write (9,*) - write (9,'(A,F6.2,A)') ' Estimated load imbalance = ', - & 100.0*(1.0-real(n_per_grid_min)/real(n_per_grid_max)),' %' -c - end if -c - if (idebug.le.1) return -c - do irank=0,nproc-1 - call MPI_Barrier (comm_all,ierr) - if (irank.eq.iproc) then - write (*,*) - write (*,*) - write (*,*) 'Rank id = ',iproc - write (*,*) 'nr = ',nr - write (*,*) 'nt = ',nt - write (*,*) 'np = ',np - write (*,*) 'i0_g = ',i0_g - write (*,*) 'i1_g = ',i1_g - write (*,*) 'j0_g = ',j0_g - write (*,*) 'j1_g = ',j1_g - write (*,*) 'k0_g = ',k0_g - write (*,*) 'k1_g = ',k1_g - write 
(*,*) 'Rank index in r = ',iproc_r - write (*,*) 'Rank index in t = ',iproc_t - write (*,*) 'Rank index in p = ',iproc_p - write (*,*) 'Rank to left in r = ',iproc_rm - write (*,*) 'Rank to right in r = ',iproc_rp - write (*,*) 'Rank to left in t = ',iproc_tm - write (*,*) 'Rank to right in t = ',iproc_tp - write (*,*) 'Rank to left in p = ',iproc_pm - write (*,*) 'Rank to right in p = ',iproc_pp - write (*,*) - write (*,*) 'Rank in MPI_COMM_WORLD = ',iprocw - write (*,*) 'Rank in COMM_ALL = ',iproc - if (idebug.gt.2) then - write (*,*) - write (*,*) 'r mesh:' - write (*,*) r - write (*,*) - write (*,*) 'theta mesh:' - write (*,*) t - write (*,*) - write (*,*) 'phi mesh:' - write (*,*) p - end if - end if - enddo -c - return - end -c####################################################################### - subroutine allocate_global_arrays -c -c----------------------------------------------------------------------- -c -c ****** Allocate global arrays. -c -c----------------------------------------------------------------------- -c - use global_dims - use global_mesh -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c -c ****** Allocate global mesh arrays. -c - allocate (r_g (nrm1_g)) - allocate (dr_g(nrm1_g)) -c - allocate (rh_g (nr_g)) - allocate (drh_g(nr_g)) -c - allocate (t_g (ntm1_g)) - allocate (dt_g(ntm1_g)) -c - allocate (th_g (nt_g)) - allocate (dth_g(nt_g)) -c - allocate (p_g (np_g)) - allocate (dp_g (np_g)) - allocate (ph_g (np_g)) - allocate (dph_g(np_g)) -c - allocate (st_g(ntm1_g)) - allocate (ct_g(ntm1_g)) -c - allocate (sth_g(nt_g)) - allocate (cth_g(nt_g)) -c - allocate (sp_g (np_g)) - allocate (cp_g (np_g)) - allocate (sph_g(np_g)) - allocate (cph_g(np_g)) -c - return - end -c####################################################################### - subroutine allocate_local_arrays_r -c -c----------------------------------------------------------------------- -c -c ****** Allocate local arrays for the r dimension and 3D arrays. -c -c----------------------------------------------------------------------- -c - use local_dims_r - use local_mesh_r - use local_dims_tp - use fields -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - allocate (r (nrm)) - allocate (r2 (nrm)) - allocate (dr(nrm)) -c - allocate (rh (nr)) - allocate (drh(nr)) -c -c ****** Allocate inverse quantities. -c - allocate (r_i (nrm)) - allocate (dr_i(nrm)) -c - allocate (rh_i (nr)) - allocate (drh_i(nr)) -c -c ****** Allocate the potential array and cg scratch array. -c - allocate (phi(nr,nt,np)) - allocate (x_ax(nr,nt,np)) - phi(:,:,:)=0. - x_ax(:,:,:)=0. -!$omp target enter data map(to:phi,x_ax) -c -c ****** Allocate polar boundary arrays. -c - allocate (sum0(nr)) - allocate (sum1(nr)) - sum0(:)=0. - sum1(:)=0. -!$omp target enter data map(to:sum0,sum1) -c -c ****** Allocate the local magnetic field arrays. -c - allocate (br(nrm,nt,np)) - allocate (bt(nr,ntm,np)) - allocate (bp(nr,nt,npm)) - br(:,:,:)=0. - bt(:,:,:)=0. - bp(:,:,:)=0. -c - end subroutine -c####################################################################### - subroutine allocate_local_arrays_tp -c -c----------------------------------------------------------------------- -c -c ****** Allocate local arrays for the theta and phi dimensions. 
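The allocation routines above all follow the same pattern: allocate and zero an array on the host, then map it to the device once with TARGET ENTER DATA so later device kernels reuse the resident copy instead of re-allocating. A minimal self-contained sketch of that pattern, with illustrative names (it also runs correctly, on the host, when OpenMP offload is disabled):

program device_map_demo
  implicit none
  real(8), allocatable :: phi(:,:,:)
  integer :: i
  allocate (phi(8,8,8))
  phi = 0.d0
!$omp target enter data map(to:phi)       ! one-time device mapping
!$omp target teams distribute parallel do
  do i = 1, 8
    phi(i,1,1) = real(i,8)                ! kernels reuse the resident copy
  end do
!$omp target exit data map(from:phi)      ! copy back and release once
  print *, phi(3,1,1)                     ! prints 3.0
  deallocate (phi)
end program device_map_demo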
-c -c----------------------------------------------------------------------- -c - use local_dims_tp - use local_mesh_tp - use fields - use vars - use solve_params -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - allocate (t (ntm)) - allocate (dt(ntm)) -c - allocate (th (nt)) - allocate (dth(nt)) -c - allocate (p (np)) - allocate (dp(np)) -c - allocate (ph (np)) - allocate (dph(np)) -c - allocate (st(ntm)) - allocate (ct(ntm)) -c - allocate (sth(nt)) - allocate (cth(nt)) -c - allocate (sp (np)) - allocate (cp (np)) - allocate (sph(np)) - allocate (cph(np)) -c -c ****** Allocate inverse quantities. -c - allocate (dt_i(ntm)) - allocate (st_i(ntm)) -c - allocate (dth_i(nt)) - allocate (sth_i(nt)) -c - allocate (dp_i (np)) - allocate (dph_i(np)) -c -c ****** Allocate the boundary radial magnetic field array(s). -c - allocate (br0(nt,np)) - br0(:,:)=0. -!$omp target enter data map(to:br0) -c - if (validation_run) then - allocate (br1(nt,np)) - br1(:,:)=0. -!$omp target enter data map(to:br1) - end if -c - return - end -c####################################################################### - subroutine set_global_mesh -c -c----------------------------------------------------------------------- -c -c ****** Define the global mesh arrays. -c -c----------------------------------------------------------------------- -c - use number_types - use global_dims - use global_mesh - use meshdef - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: zero=0._r_typ - real(r_typ), parameter :: one=1._r_typ - real(r_typ), parameter :: half=.5_r_typ -c -c----------------------------------------------------------------------- -c - integer :: i,j,k -c -c----------------------------------------------------------------------- -c -c ****** Define the radial mesh. -c - call genmesh (9,'r',nrm1_g,r0,r1,nmseg,rfrac,drratio, - & nfrmesh,.false.,zero,r_g) -c - do i=2,nrm1_g - rh_g(i)=.5*(r_g(i)+r_g(i-1)) - drh_g(i)=r_g(i)-r_g(i-1) - enddo - rh_g(1)=rh_g(2)-drh_g(2) - rh_g(nr_g)=rh_g(nrm1_g)+drh_g(nrm1_g) - drh_g(1)=drh_g(2) - drh_g(nr_g)=drh_g(nrm1_g) -c - do i=1,nrm1_g - dr_g(i)=rh_g(i+1)-rh_g(i) - enddo -c -c ****** Define the theta mesh. -c - call genmesh (9,'t',ntm1_g,t0,t1,nmseg,tfrac,dtratio, - & nftmesh,.false.,zero,t_g) -c - do j=2,ntm1_g - th_g(j)=.5*(t_g(j)+t_g(j-1)) - dth_g(j)=t_g(j)-t_g(j-1) - enddo - th_g(1)=th_g(2)-dth_g(2) - th_g(nt_g)=th_g(ntm1_g)+dth_g(ntm1_g) - dth_g(1)=dth_g(2) - dth_g(nt_g)=dth_g(ntm1_g) -c - do j=1,ntm1_g - dt_g(j)=th_g(j+1)-th_g(j) - enddo -c -c ****** Define the periodic phi mesh. -c - call genmesh (9,'p',npm1_g,p0,p1,nmseg,pfrac,dpratio, - & nfpmesh,.true.,phishift,p_g) - p_g(np_g)=p_g(2)+pl -c - do k=2,np_g - ph_g(k)=half*(p_g(k)+p_g(k-1)) - dph_g(k)=p_g(k)-p_g(k-1) - enddo - ph_g(1)=ph_g(npm1_g)-pl - dph_g(1)=dph_g(npm1_g) -c - do k=1,npm1_g - dp_g(k)=ph_g(k+1)-ph_g(k) - enddo - dp_g(np_g)=dp_g(2) -c -c ****** Enforce exact periodicity to protect symmetry properties -c ****** from round-off errors (especially for axisymmetric cases). -c - dph_g(np_g)=dph_g(2) - dp_g(1)=dp_g(npm1_g) -c -c ****** Define global auxiliary mesh-related arrays. 
-c - st_g(:)=sin(t_g(:)) - ct_g(:)=cos(t_g(:)) - sth_g(:)=sin(th_g(:)) - cth_g(:)=cos(th_g(:)) -c - sp_g(:)=sin(p_g(:)) - cp_g(:)=cos(p_g(:)) - sph_g(:)=sin(ph_g(:)) - cph_g(:)=cos(ph_g(:)) -c -c ****** For an axisymmetric case, set the exact values of -c ****** sin(phi) and cos(phi) to preserve symmetry properties -c ****** in the presence of round-off errors. -c - if (axisymmetric) then - sp_g(2)=0. - cp_g(2)=one - sph_g(2)=0. - cph_g(2)=-one - end if -c -c ****** Enforce exact periodicity to protect symmetry properties -c ****** from round-off errors (especially for axisymmetric cases). -c - sph_g(1)=sph_g(npm1_g) - sph_g(np_g)=sph_g(2) - cph_g(1)=cph_g(npm1_g) - cph_g(np_g)=cph_g(2) - sp_g(1)=sp_g(npm1_g) - sp_g(np_g)=sp_g(2) - cp_g(1)=cp_g(npm1_g) - cp_g(np_g)=cp_g(2) -c - return - end -c####################################################################### - subroutine set_local_mesh_r -c -c----------------------------------------------------------------------- -c -c ****** Define the local r mesh arrays. -c -c----------------------------------------------------------------------- -c - use number_types - use global_dims - use global_mesh - use local_dims_r - use local_mesh_r -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: one=1._r_typ -c -c----------------------------------------------------------------------- -c - integer :: i -c -c----------------------------------------------------------------------- -c -c ****** Define the local meshes. -c - do i=1,nrm - r(i)=r_g(i0_g+i-1) - dr(i)=dr_g(i0_g+i-1) - enddo -c - dr1=dr(1) - drn=dr(nrm) -c - do i=1,nr - rh(i)=rh_g(i0_g+i-1) - drh(i)=drh_g(i0_g+i-1) - enddo -c -c ****** Define local auxiliary mesh-related arrays. -c - r2(:)=r(:)**2 - r_i(:)=one/r(:) - dr_i(:)=one/dr(:) - rh_i(:)=one/rh(:) - drh_i(:)=one/drh(:) -c -!$omp target enter data map(to:r,r2,r_i,dr,dr_i,dr1,drn,rh,rh_i,drh, -!$omp& drh_i) - return - end -c####################################################################### - subroutine set_local_mesh_tp -c -c----------------------------------------------------------------------- -c -c ****** Define the local theta and phi mesh arrays. -c -c----------------------------------------------------------------------- -c - use number_types - use global_dims - use global_mesh - use local_dims_tp - use local_mesh_tp -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: one=1._r_typ -c -c----------------------------------------------------------------------- -c - integer :: j,k,j0,j1 -c -c----------------------------------------------------------------------- -c -c ****** Define the local meshes. -c - do j=1,ntm - t(j)=t_g(j0_g+j-1) - dt(j)=dt_g(j0_g+j-1) - enddo -c - do j=1,nt - th(j)=th_g(j0_g+j-1) - dth(j)=dth_g(j0_g+j-1) - enddo -c - do k=1,npm - p(k)=p_g(k0_g+k-1) - dp(k)=dp_g(k0_g+k-1) - enddo -c - do k=1,np - ph(k)=ph_g(k0_g+k-1) - dph(k)=dph_g(k0_g+k-1) - enddo -c -c ****** Define local auxiliary mesh-related arrays. 
-c - do j=1,ntm - st(j)=st_g(j0_g+j-1) - ct(j)=ct_g(j0_g+j-1) - enddo -c - do j=1,nt - sth(j)=sth_g(j0_g+j-1) - cth(j)=cth_g(j0_g+j-1) - enddo -c - do k=1,npm - sp(k)=sp_g(k0_g+k-1) - cp(k)=cp_g(k0_g+k-1) - enddo -c - do k=1,np - sph(k)=sph_g(k0_g+k-1) - cph(k)=cph_g(k0_g+k-1) - enddo -c - dt_i(:)=one/dt(:) - dth_i(:)=one/dth(:) - sth_i(:)=one/sth(:) - dp_i(:)=one/dp(:) - dph_i(:)=one/dph(:) -c -c ****** Prevent division by zero at the poles for sin(theta). -c - if (tb0) then - j0=2 - else - j0=1 - end if - if (tb1) then - j1=ntm1-1 - else - j1=ntm1 - end if -c - st_i(:)=0. - do j=j0,j1 - st_i(j)=one/st(j) - enddo -c -!$omp target enter data map(to:t,th,dt,dth,p,ph,dp,dph,st,ct,sth,cth, -!$omp& sp,cp,sph,cph,dt_i,dth_i,st_i,sth_i,dp_i,dph_i) - return - end -c####################################################################### - subroutine genmesh (io,label,nc,c0,c1,nseg,frac,dratio, - & nfilt,periodic,c_shift,c) -c -c----------------------------------------------------------------------- -c -c ****** Generate a one-dimensional mesh. -c -c----------------------------------------------------------------------- -c -c ****** Input arguments: -c -c IO : [integer] -c Fortran file unit number to which to write -c mesh diagnostics. Set IO=0 if diagnostics -c are not of interest. It is assumed that -c unit IO has been connected to a file prior -c to calling this routine. -c -c LABEL : [character(*)] -c Name for the mesh coordinate (example: 'x'). -c -c NC : [integer] -c Number of mesh points to load. -c -c C0 : [real] -c The starting location for the coordinate. -c -c C1 : [real] -c The ending location for the coordinate. -c It is required that C1.gt.C0. -c -c NSEG : [integer] -c Maximum number of mesh segments. -c The mesh spacing in each segment varies -c exponentially with a uniform amplification -c factor. The actual number of mesh segments -c used is NSEG or less. It is obtained from the -c information in array FRAC. -c -c FRAC : [real array, dimension NSEG] -c The normalized positions of the mesh segment -c boundaries (as a fraction of the size of the -c domain). For a non-periodic mesh, the first -c value of FRAC specified must equal 0. and the -c last value must equal 1. For a periodic mesh, -c FRAC must not contain both 0. and 1., since -c these represent the same point. -c -c DRATIO : [real array, dimension NSEG] -c The ratio of the mesh spacing at the end of a -c segment to that at the beginning. -c -c NFILT : [integer] -c The number of times to filter the mesh-point -c distribution array. Set NFILT=0 if filtering -c is not desired. Filtering can reduce -c discontinuities in the derivative of the mesh -c spacing. -c -c PERIODIC: [logical] -c A flag to indicate whether the mesh to be -c generated represents a periodic coordinate. -c If the coordinate is specified as periodic, -c the range [C0,C1] should be the whole periodic -c interval; the first mesh point is set at C0 -c and the last mesh point, C(NC), is set at C1. -c -c C_SHIFT : [real] -c Amount by which to shift the periodic coordinate. -c C_SHIFT is only used when PERIODIC=.true., -c and is ignored otherwise. A positive C_SHIFT -c moves the mesh points to the right. -c -c ****** Output arguments: -c -c C : [real array, dimension NC] -c The locations of the mesh points. -c -c----------------------------------------------------------------------- -c -c ****** The arrays DRATIO and FRAC define the mesh as follows. -c -c ****** For example, suppose that a (non-periodic) mesh with three -c ****** segments is desired. 
Suppose the domain size is c=[0:2]. -c ****** In the first segment (with c between 0 and .5) the mesh -c ****** spacing is decreasing with c, such that DC at c=.5 is half -c ****** DC at c=0. From c=.5 to c=1, the mesh is uniform. From c=1 -c ****** to c=2, the mesh spacing is increasing with c such that DC at -c ****** c=2 is 10 times DC at c=1. This mesh would be specified by: -c ****** -c ****** FRAC=0.,.25,.5,1. -c ****** DRATIO=.5,1.,10. -c ****** -c ****** The variable C_SHIFT can be used to shift the mesh point -c ****** distribution for a periodic coordinate. For example, -c ****** suppose C represents mesh points in the interval [0,2*pi]. -c ****** C_SHIFT=.5*pi would move the distribution of mesh points -c ****** so that the original mesh point with C(1)=0. would be -c ****** close to .5*pi in the new mesh. -c -c----------------------------------------------------------------------- -c - use number_types - use mpidefs - use debug -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer, intent(in) :: io - character(*), intent(in) :: label - integer, intent(in) :: nc - real(r_typ), intent(in) :: c0,c1 - integer, intent(in) :: nseg - real(r_typ), dimension(nseg), intent(in) :: frac,dratio - integer, intent(in) :: nfilt - logical, intent(in) :: periodic - real(r_typ), intent(in) :: c_shift - real(r_typ), dimension(nc), intent(out) :: c -c -c----------------------------------------------------------------------- -c -c ****** Storage for the coordinate transformation. -c - integer :: ns - real(r_typ), dimension(:), allocatable :: xi,cs,a,r,f -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: zero=0._r_typ - real(r_typ), parameter :: one=1._r_typ - real(r_typ), parameter :: half=.5_r_typ - real(r_typ), parameter :: eps=1.e-5_r_typ -c -c----------------------------------------------------------------------- -c - integer :: i,j,nf,nr,ll,j0 - real(r_typ) :: alpha,dr,fac,d,dxi,xiv,cshft,xi_shift - real(r_typ), dimension(:), allocatable :: dc,rdc -c -c----------------------------------------------------------------------- -c -c ****** Check that the number of mesh points is valid. -c - if (nc.lt.2) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in GENMESH:' - write (*,*) '### Invalid number of mesh points specified.' - write (*,*) '### There must be at least two mesh points.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'Number of mesh points specified = ',nc - end if - call endrun (.true.) - end if -c -c ****** Check that a positive mesh interval has been specified. -c - if (c0.ge.c1) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in GENMESH:' - write (*,*) '### Invalid mesh interval specified.' - write (*,*) '### C1 must be greater than C0.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'C0 = ',c0 - write (*,*) 'C1 = ',c1 - end if - call endrun (.true.) - end if -c -c ****** Find the number of values of FRAC specified. -c - do nf=1,nseg-1 - if (frac(nf+1).eq.zero) exit - enddo -c -c ****** When no values have been specified (NF=1, the default), -c ****** a uniform mesh is produced. -c - if (nf.eq.1.and.frac(1).eq.zero) then - ns=1 - allocate (cs(ns+1)) - allocate (r(ns)) - cs(1)=c0 - cs(2)=c1 - r(1)=one - go to 100 - end if -c -c ****** Check that the specified values of FRAC are monotonically -c ****** increasing. 
-c - do i=2,nf - if (frac(i).lt.frac(i-1)) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in GENMESH:' - write (*,*) '### Invalid mesh specification.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'The values in FRAC must increase'// - & ' monotonically.' - write (*,*) 'FRAC = ',frac(1:nf) - end if - call endrun (.true.) - end if - enddo -c -c ****** Check the specified values of FRAC. -c - if (periodic) then -c -c ****** A periodic mesh requires the specified values of FRAC -c ****** to be in the range 0. to 1. -c - if (frac(1).lt.zero.or.frac(nf).gt.one) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in GENMESH:' - write (*,*) '### Invalid mesh specification.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'For a periodic coordinate, the values in'// - & ' FRAC must be between 0. and 1.' - write (*,*) 'FRAC = ',frac(1:nf) - end if - call endrun (.true.) - end if -c -c ****** A periodic mesh cannot contain both 0. and 1. in FRAC, -c ****** since these represent the same point. -c - if (frac(1).eq.zero.and.frac(nf).eq.one) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in GENMESH:' - write (*,*) '### Invalid mesh specification.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'For a periodic coordinate, FRAC must not'// - & ' contain both 0. and 1.' - write (*,*) 'FRAC = ',frac(1:nf) - end if - call endrun (.true.) - end if -c - else -c -c ****** A non-periodic mesh requires the first specified value -c ****** of FRAC to be 0., and the last value to equal 1. -c - if (frac(1).ne.zero) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in GENMESH:' - write (*,*) '### Invalid mesh specification.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'For a non-periodic coordinate, the first'// - & ' value of FRAC must equal 0.' - write (*,*) 'FRAC = ',frac(1:nf) - end if - call endrun (.true.) - end if -c - if (frac(nf).ne.one) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in GENMESH:' - write (*,*) '### Invalid mesh specification.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'For a non-periodic coordinate, the last'// - & ' value of FRAC must equal 1.' - write (*,*) 'FRAC = ',frac(1:nf) - end if - call endrun (.true.) - end if -c - end if -c -c ****** Check that the required values of DRATIO have been set, -c ****** and are positive. -c - if (periodic) then - nr=nf - else - nr=nf-1 - end if -c - do i=1,nr - if (dratio(i).le.zero) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in GENMESH:' - write (*,*) '### Invalid mesh specification.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'A required value in DRATIO has not been'// - & ' set or is not positive.' - write (*,*) 'DRATIO = ',dratio(1:nr) - end if - call endrun (.true.) - end if - enddo -c -c ****** Check that an inherently discontinuous mesh has not been -c ****** specified inadvertently. -c - if (periodic.and.nr.eq.1.and.dratio(1).ne.one) then - if (iamp0) then - write (*,*) - write (*,*) '### WARNING from GENMESH:' - write (*,*) '### Discontinuous mesh specification.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'An inherently discontinuous mesh has been'// - & ' specified.' - write (*,*) 'FRAC = ',frac(1:nf) - write (*,*) 'DRATIO = ',dratio(1:nr) - end if - end if -c -c ****** Set the number of segments. -c - ns=nf-1 -c -c ****** For a periodic coordinate, add points at XI=0. and XI=1. -c ****** if they are not already present. 
-c - if (periodic) then - if (frac(1).ne.zero) ns=ns+1 - if (frac(nf).ne.one) ns=ns+1 - end if -c - allocate (cs(ns+1)) - allocate (r(ns)) -c -c ****** Set up the coordinate limits of the segments. -c - if (periodic) then - if (frac(1).ne.zero) then - cs(1)=c0 - cs(2:nf+1)=c0+(c1-c0)*frac(1:nf) - if (frac(nf).ne.one) then - alpha=(one-frac(nf))/(frac(1)+one-frac(nf)) - r(1)=dratio(nr)/(one+alpha*(dratio(nr)-one)) - r(2:nr+1)=dratio(1:nr) - cs(ns+1)=c1 - r(ns)=one+alpha*(dratio(nr)-one) - else - r(1)=dratio(nr) - r(2:nr)=dratio(1:nr-1) - end if - else - cs(1:nf)=c0+(c1-c0)*frac(1:nf) - r(1:nr)=dratio(1:nr) - cs(ns+1)=c1 - end if - else - cs(1:nf)=c0+(c1-c0)*frac(1:nf) - r(1:nr)=dratio(1:nr) - end if -c - 100 continue -c - allocate (xi(ns+1)) - allocate (a(ns)) - allocate (f(ns)) -c -c ****** Compute the XI values at the segment limits. -c - do i=1,ns - dr=r(i)-one - if (abs(dr).lt.eps) then - f(i)=(cs(i+1)-cs(i))*(one+half*dr) - else - f(i)=(cs(i+1)-cs(i))*log(r(i))/dr - end if - enddo -c - fac=zero - do i=ns,1,-1 - fac=fac/r(i)+f(i) - enddo -c - d=f(1)/fac - xi(1)=zero - do i=2,ns - xi(i)=xi(i-1)+d - if (i.lt.ns) d=d*f(i)/(f(i-1)*r(i-1)) - enddo - xi(ns+1)=one -c -c ****** Set the amplification factor for each segment. -c - do i=1,ns - a(i)=log(r(i))/(xi(i+1)-xi(i)) - enddo -c -c ****** For a periodic coordinate, find the XI shift corresponding -c ****** to a shift of C_SHIFT in the coordinate. -c ****** Note that a positive value of C_SHIFT moves the mesh -c ****** points to the right. -c - if (periodic) then - cshft=-c_shift - call map_c_to_xi (periodic,ns,xi,cs,a,r,cshft,xi_shift) - else - xi_shift=0. - end if -c -c ****** Compute the location of the mesh points in array C -c ****** by mapping from the XI values. -c - dxi=one/(nc-one) -c - c(1)=c0 - do j=2,nc-1 - xiv=(j-1)*dxi - call map_xi_to_c (periodic,ns,xi,cs,a,r, - & cshft,xi_shift,xiv,c(j)) - enddo - c(nc)=c1 -c -c ****** Filter the mesh if requested. -c - if (nfilt.gt.0) then - do i=1,nfilt - if (periodic) then - call filter_coord_periodic (c1-c0,nc,c) - else - call filter_coord (nc,c) - end if - enddo - end if -c -c ****** Write out the mesh information. 
-c - if (io.gt.0.and.iamp0) then -c - write (io,*) - write (io,*) '### COMMENT from GENMESH:' - write (io,*) '### Mesh information for coordinate ',label,':' -c - if (idebug.gt.0) then - write (io,*) - write (io,*) 'Flag to indicate a periodic mesh: ',periodic - write (io,*) 'Number of mesh points = ',nc - write (io,*) 'Lower mesh limit = ',c0 - write (io,*) 'Upper mesh limit = ',c1 - write (io,*) 'Number of times to filter the mesh = ',nfilt - if (periodic) then - write (io,*) 'Amount to shift the mesh = ',c_shift - end if - end if -c - write (io,*) - write (io,*) 'Number of mesh segments = ',ns -c - ll=len_trim(label) -c - write (io,900) 'Segment xi0 xi1'// - & repeat (' ',16-ll)//label//'0'// - & repeat (' ',16-ll)//label//'1'// - & ' ratio' - do i=1,ns - write (io,910) i,xi(i),xi(i+1),cs(i),cs(i+1),r(i) - enddo -c - allocate (dc(nc)) - allocate (rdc(nc)) -c - dc=c-cshift(c,-1) - if (periodic) dc(1)=dc(nc) - rdc=dc/cshift(dc,-1) - if (periodic) rdc(1)=rdc(nc) -c - write (io,*) - write (io,920) 'Mesh-point locations:' - write (io,920) ' i'// - & repeat (' ',18-ll)//label// - & repeat (' ',17-ll)//'d'//label// - & ' ratio' -c - if (periodic) then - j0=1 - else - j0=3 - write (io,930) 1,c(1) - write (io,930) 2,c(2),dc(2) - end if - do j=j0,nc - write (io,930) j,c(j),dc(j),rdc(j) - enddo -c - deallocate (dc) - deallocate (rdc) -c - end if -c - 900 format (/,tr1,a) - 910 format (tr1,i4,2x,2f10.6,4f17.8) - 920 format (tr1,a) - 930 format (tr1,i6,3f18.8) -c - deallocate (cs) - deallocate (r) - deallocate (xi) - deallocate (a) - deallocate (f) -c - return - end -c####################################################################### - subroutine map_xi_to_c (periodic,ns,xi,cs,a,r, - & cshft,xi_shift,xiv,cv) -c -c----------------------------------------------------------------------- -c -c ****** Get the mesh coordinate value CV for the specified -c ****** xi value XIV. -c -c ****** Set PERIODIC=.true. for a periodic coordinate. -c ****** NS is the number of segments in the mesh definition. -c ****** The arrays XI, CS, A, and R define the mesh mapping. -c -c ****** CSHFT represents the amount to shift a periodic coordinate. -c ****** XI_SHIFT represents the corresponding amount to shift xi. -c -c ****** This is a utility routine for GENMESH. -c -c----------------------------------------------------------------------- -c - use number_types - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - logical, intent(in) :: periodic - integer, intent(in) :: ns - real(r_typ), dimension(ns+1), intent(in) :: xi,cs - real(r_typ), dimension(ns), intent(in) :: a,r - real(r_typ), intent(in) :: cshft,xi_shift - real(r_typ), intent(in) :: xiv - real(r_typ), intent(out) :: cv -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: zero=0._r_typ - real(r_typ), parameter :: one=1._r_typ - real(r_typ), parameter :: half=.5_r_typ - real(r_typ), parameter :: eps=1.e-5_r_typ -c -c----------------------------------------------------------------------- -c - integer :: i - real(r_typ) :: xiv_p,d,d1,da,da1,fac -c -c----------------------------------------------------------------------- -c - real(r_typ), external :: fold -c -c----------------------------------------------------------------------- -c -c ****** Find the index of the segment to which XIV belongs. -c - if (periodic) then -c -c ****** Shift XIV by XI_SHIFT. 
-c - xiv_p=xiv+xi_shift -c -c ****** Fold XIV_P into the main interval. -c - xiv_p=fold(zero,one,xiv_p) -c - else -c - xiv_p=xiv -c - end if -c - do i=1,ns - if (xiv_p.ge.xi(i).and.xiv_p.le.xi(i+1)) exit - enddo -c - if (i.gt.ns) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in MAP_XI_TO_C:' - write (*,*) '### Error in finding the XI segment.' - write (*,*) '### Could not find XIV in the XI table.' - write (*,*) '[Utility routine for GENMESH.]' - write (*,*) '[This is an internal error.]' - write (*,*) 'XI = ',xi - write (*,*) 'XIV = ',xiv - write (*,*) 'XIV_P = ',xiv_p - end if - call endrun (.true.) - end if -c - d =xiv_p -xi(i) - d1=xi(i+1)-xi(i) -c - da =a(i)*d - da1=a(i)*d1 -c -c ****** Interpolate the mapping function at XIV_P. -c - if (abs(da1).lt.eps) then - fac=(d*(one+half*da))/(d1*(one+half*da1)) - else - fac=(exp(da)-one)/(r(i)-one) - end if -c - cv=cs(i)+(cs(i+1)-cs(i))*fac -c - if (periodic) then -c -c ****** Shift CV by the amount CSHFT. -c - cv=cv-cshft -c -c ****** Fold CV into the main interval. -c - cv=fold(cs(1),cs(ns+1),cv) -c - end if -c - return - end -c####################################################################### - subroutine map_c_to_xi (periodic,ns,xi,cs,a,r,cv,xiv) -c -c----------------------------------------------------------------------- -c -c ****** Get the xi value XIV for the specified coordinate value CV. -c -c ****** Set PERIODIC=.true. for a periodic coordinate. -c ****** NS is the number of segments in the mesh definition. -c ****** The arrays XI, CS, A, and R define the mesh mapping. -c -c ****** This is a utility routine for GENMESH. -c -c----------------------------------------------------------------------- -c - use number_types - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - logical, intent(in) :: periodic - integer, intent(in) :: ns - real(r_typ), dimension(ns+1), intent(in) :: xi,cs - real(r_typ), dimension(ns), intent(in) :: a,r - real(r_typ), intent(in) :: cv - real(r_typ), intent(out) :: xiv -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: one=1._r_typ - real(r_typ), parameter :: eps=1.e-5_r_typ -c -c----------------------------------------------------------------------- -c - integer :: i - real(r_typ) :: cv_p,d,da,fac -c -c----------------------------------------------------------------------- -c - real(r_typ), external :: fold -c -c----------------------------------------------------------------------- -c -c ****** Find the index of the segment to which CV belongs. -c - if (periodic) then -c -c ****** Fold CV_P into the main interval. -c - cv_p=fold(cs(1),cs(ns+1),cv) -c - else -c - cv_p=cv -c - end if -c - do i=1,ns - if (cv_p.ge.cs(i).and.cv_p.le.cs(i+1)) exit - enddo -c - if (i.gt.ns) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in MAP_C_TO_XI:' - write (*,*) '### Error in finding the CS segment.' - write (*,*) '### Could not find CV in the CS table.' - write (*,*) '[Utility routine for GENMESH.]' - write (*,*) '[This is an internal error.]' - write (*,*) 'CS = ',cs - write (*,*) 'CV = ',cv - write (*,*) 'CV_P = ',cv_p - end if - call endrun (.true.) - end if -c - d=(cv_p-cs(i))/(cs(i+1)-cs(i)) - da=(r(i)-one)*d -c -c ****** Interpolate the mapping function at XIV_P. 
-c - if (abs(da).lt.eps) then - fac=d*(xi(i+1)-xi(i)) - else - fac=log(da+one)/a(i) - end if -c - xiv=xi(i)+fac -c - return - end -c####################################################################### - subroutine filter_coord (n,f) -c -c----------------------------------------------------------------------- -c -c ****** Apply a "(1,2,1)/4" low-pass digital filter to a -c ****** 1D coordinate. -c -c ****** The end-points F(1) and F(N) are not changed. -c -c----------------------------------------------------------------------- -c - use number_types -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: n - real(r_typ), dimension(n) :: f -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: two=2._r_typ - real(r_typ), parameter :: quarter=.25_r_typ -c -c----------------------------------------------------------------------- -c - real(r_typ), dimension(n) :: ff -c -c----------------------------------------------------------------------- -c - integer :: i -c -c----------------------------------------------------------------------- -c -c ****** Make a copy of the function. -c - ff=f -c -c ****** Apply the filter. -c - do i=2,n-1 - f(i)=quarter*(ff(i-1)+two*ff(i)+ff(i+1)) - enddo -c - return - end -c####################################################################### - subroutine filter_coord_periodic (xl,n,f) -c -c----------------------------------------------------------------------- -c -c ****** Apply a "(1,2,1)/4" low-pass digital filter to a -c ****** periodic 1D coordinate. -c -c----------------------------------------------------------------------- -c -c ****** XL is the periodic interval for the coordinate. -c -c ****** The filtered coordinate is translated so that F(1) -c ****** is preserved. -c -c----------------------------------------------------------------------- -c - use number_types -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ) :: xl - integer :: n - real(r_typ), dimension(n) :: f -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: two=2._r_typ - real(r_typ), parameter :: quarter=.25_r_typ -c -c----------------------------------------------------------------------- -c - real(r_typ), dimension(0:n+1) :: ff -c -c----------------------------------------------------------------------- -c - integer :: i - real(r_typ) :: f1old,f1new -c -c----------------------------------------------------------------------- -c -c ****** Save the value of F(1). -c - f1old=f(1) -c -c ****** Make a periodic copy of the function. -c - ff(1:n)=f(:) -c - ff(0)=f(n-1)-xl - ff(n+1)=f(2)+xl -c -c ****** Apply the filter. -c - do i=1,n - f(i)=quarter*(ff(i-1)+two*ff(i)+ff(i+1)) - enddo -c -c ****** Translate F so that F(1) is preserved. -c - f1new=f(1) - do i=1,n - f(i)=f(i)-f1new+f1old - enddo -c - return - end -c####################################################################### - function fold (x0,x1,x) -c -c----------------------------------------------------------------------- -c -c ****** "Fold" X into the periodic interval [X0,X1]. -c -c ****** On return, X is such that X0.le.X.lt.X1. 
-c -c----------------------------------------------------------------------- -c -c ****** It is assumed that X0 does not equal X1, as is physically -c ****** necessary. If X0 and X1 are equal, the routine just -c ****** returns with FOLD=X. -c -c----------------------------------------------------------------------- -c - use number_types -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ) :: fold - real(r_typ) :: x0,x1,x -c -c----------------------------------------------------------------------- -c - real(r_typ) :: xl -c -c----------------------------------------------------------------------- -c - fold=x -c - if (x0.eq.x1) return -c - xl=x1-x0 -c - fold=mod(x-x0,xl)+x0 -c - if (fold.lt.x0) fold=fold+xl - if (fold.ge.x1) fold=fold-xl -c - return - end -c####################################################################### - subroutine set_flux -c -c----------------------------------------------------------------------- -c -c ****** Set the radial magnetic field at the photosphere. -c -c----------------------------------------------------------------------- -c - use number_types - use global_dims - use global_mesh - use local_dims_r - use local_mesh_r - use local_dims_tp - use local_mesh_tp - use fields - use vars - use mpidefs - use timing -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c -c ****** Global flux array. -c - real(r_typ), dimension(:,:), allocatable :: br0_g -c -c----------------------------------------------------------------------- -c - integer :: j,k,ierr -c -c----------------------------------------------------------------------- -c - call timer_on -c - allocate (br0_g(nt_g,np_g)) -c -c ****** Define the global flux array. -c -c ****** Read the flux from file BR0FILE (only on processor IPROC0). -c - if (iamp0) then - call readbr (br0file,br0_g,ierr) - end if - call check_error_on_p0 (ierr) -c -c ****** Broadcast BR0_G to all the processors. -c - call MPI_Bcast (br0_g,nt_g*np_g,ntype_real,0,comm_all,ierr) -c -c ****** For a fully open field, reverse negative Br -c ****** (i.e., use the monopole trick). -c - if (option.eq.'open') then -c -c ****** Write the boundary flux (before the sign flip) to a file -c ****** if requested. -c - if (iamp0) then - if (br_photo_original_file.ne.'') then - write (*,*) - write (*,*) '### COMMENT from SET_FLUX:' - write (*,*) - write (*,*) 'Writing BR0 (before sign flip) to file: ', - & trim(br_photo_original_file) - write (9,*) - write (9,*) '### COMMENT from SET_FLUX:' - write (9,*) - write (9,*) 'Writing BR0 (before sign flip) to file: ', - & trim(br_photo_original_file) - call wrhdf_2d (br_photo_original_file, - & .true.,nt_g,np_g, - & br0_g,th_g,ph_g,hdf32,ierr) - end if - end if -c -c ****** Reverse Br. -c - br0_g(:,:)=abs(br0_g(:,:)) -c - end if -c -c ****** Write the boundary flux to a file if requested. -c - if (iamp0) then - if (br_photo_file.ne.' 
') then - write (*,*) - write (*,*) '### COMMENT from SET_FLUX:' - write (*,*) - write (*,*) 'Writing BR0 to file: ',trim(br_photo_file) - write (9,*) - write (9,*) '### COMMENT from SET_FLUX:' - write (9,*) - write (9,*) 'Writing BR0 to file: ',trim(br_photo_file) - call wrhdf_2d (br_photo_file,.true.,nt_g,np_g, - & br0_g,th_g,ph_g,hdf32,ierr) - end if - end if -c - do j=1,nt - do k=1,np - br0(j,k)=br0_g(j0_g+j-1,k0_g+k-1) - enddo - enddo -!$omp target update to(br0) -c - deallocate(br0_g) -c - call timer_off (t_io) -c - end subroutine -c####################################################################### - subroutine potfld -c -c----------------------------------------------------------------------- -c -c ****** Find the potential field solution. -c -c----------------------------------------------------------------------- -c - use number_types - use local_dims_r - use local_mesh_r - use local_dims_tp - use local_mesh_tp - use fields - use cgcom - use solve_params - use mpidefs - use debug - use timing - use matrix_storage_pot3d_solve -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: one=1._r_typ -c -c----------------------------------------------------------------------- -c - integer :: ierr,nrm2,ntm2,npm2,i -c - real(r_typ), dimension(:), allocatable :: rhs_cg,x_cg -c -c----------------------------------------------------------------------- -c -c ****** Load matrix and preconditioner. -c - nrm2=nrm1-1 - ntm2=ntm1-1 - npm2=npm1-1 -c - a_offsets(1)=-nrm2*ntm2 - a_offsets(2)=-nrm2 - a_offsets(3)=-1 - a_offsets(4)= 0 - a_offsets(5)= 1 - a_offsets(6)= nrm2 - a_offsets(7)= nrm2*ntm2 -c -c ****** Allocate cg 1D vectors. -c - N=nrm2*ntm2*npm2 -c -c ****** Prepare the guess, and rhs for the solve. -c - allocate(rhs_cg(N)) - allocate(x_cg(N)) - rhs_cg(:)=0. - x_cg(:)=0. -!$omp target enter data map(to:rhs_cg,x_cg) -c - call getM (N,a_offsets,M) - call alloc_pot3d_matrix_coefs - call load_matrix_pot3d_solve -!$omp target enter data map(to:a) - call load_preconditioner_pot3d_solve -!$omp target enter data map(to:a_i) -c -c ****** Use a trick to accumulate the contribution of the -c ****** boundary conditions (i.e., the inhomogenous part). -c - call set_boundary_points (x_ax,one) - call seam (x_ax,nr,nt,np) - call delsq (x_ax,rhs_cg) -c -c ****** Original rhs is zero so just use negative of boundary -c trick contributions: -c - do concurrent (i=1:N) - rhs_cg(i)=-rhs_cg(i) - enddo -c -c ****** Solve for the potential. -c - if (idebug.gt.0.and.iamp0) then - write (*,*) - write (*,*) '### COMMENT from POTFLD:' - write (*,*) '### Doing a solution:' - end if -c - call solve (x_cg,rhs_cg,N,ierr) -c - if (ierr.ne.0) then - call endrun (.true.) - end if -c - call unpack_scalar (phi,x_cg) -c - call set_boundary_points (phi,one) - call seam (phi,nr,nt,np) -c -!$omp target exit data map(delete:rhs_cg,x_cg,a,a_i) - call dealloc_pot3d_matrix_coefs - deallocate(rhs_cg) - deallocate(x_cg) -c - return - end -c####################################################################### - subroutine set_validation_flux -c -c----------------------------------------------------------------------- -c -c ****** Set the radial magnetic field at the photosphere. -c ****** This uses the tilted dipole analytic solution for validation. 
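The analytic boundary value used here is the radial field of a dipole tilted by DIPOLE_ANGLE, Br = (2/r**3)*(cos(theta)*cos(alpha) + sin(theta)*cos(phi)*sin(alpha)), as set in the loops below. A standalone check at a single illustrative point:

program dipole_br_demo
  implicit none
  real(8), parameter :: pi = 4.d0*atan(1.d0)
  real(8) :: alpha, r, theta, phi, br
  alpha = 30.d0*pi/180.d0          ! illustrative dipole tilt
  r = 1.d0; theta = pi/2.d0; phi = 0.d0
  br = (2.d0/r**3)*(cos(theta)*cos(alpha) + sin(theta)*cos(phi)*sin(alpha))
  print *, br                      ! = 2*sin(alpha) = 1.0 at the equator, phi=0
end program dipole_br_demo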
-c -c----------------------------------------------------------------------- -c - use number_types - use global_dims - use global_mesh - use local_dims_r - use local_mesh_r - use local_dims_tp - use local_mesh_tp - use fields - use vars - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: two=2.0_r_typ -c -c----------------------------------------------------------------------- -c -c ****** Global Br boundary arrays. -c - real(r_typ), dimension(:,:), allocatable :: br0_g - real(r_typ), dimension(:,:), allocatable :: br1_g -c -c----------------------------------------------------------------------- -c - integer :: j,k - real(r_typ) :: sum00,sum11 -c -c----------------------------------------------------------------------- -c -c ****** Define the global boundary condition arrays. -c - allocate (br0_g(nt_g,np_g)) - allocate (br1_g(nt_g,np_g)) -c -c ****** Set the tilted dipole analytic Br. -c - do j=1,nt_g - do k=1,np_g - br0_g(j,k)=(two/r0**3)*(cth_g(j)*cos(dipole_angle) + - & sth_g(j)*cph_g(k)*sin(dipole_angle)) - br1_g(j,k)=(two/r1**3)*(cth_g(j)*cos(dipole_angle) + - & sth_g(j)*cph_g(k)*sin(dipole_angle)) - enddo - enddo -c -c ****** Set Br to be periodic (should not be nesessary?). -c - br0_g(:,1)=br0_g(:,npm1_g) - br0_g(:,np_g)=br0_g(:,2) - br1_g(:,1)=br1_g(:,npm1_g) - br1_g(:,np_g)=br1_g(:,2) -c -c ****** Set BCs at the poles. -c - sum00=sum(br0_g( 2,2:npm1_g)*dph_g(2:npm1_g))*pl_i - sum11=sum(br0_g(ntm1_g,2:npm1_g)*dph_g(2:npm1_g))*pl_i -c - br0_g(1 ,:)=two*sum00-br0_g( 2,:) - br0_g(nt_g,:)=two*sum11-br0_g(ntm1_g,:) -c - sum00=sum(br1_g( 2,2:npm1_g)*dph_g(2:npm1_g))*pl_i - sum11=sum(br1_g(ntm1_g,2:npm1_g)*dph_g(2:npm1_g))*pl_i -c - br1_g(1 ,:)=two*sum00-br1_g( 2,:) - br1_g(nt_g,:)=two*sum11-br1_g(ntm1_g,:) -c - do j=1,nt - do k=1,np - br0(j,k)=br0_g(j0_g+j-1,k0_g+k-1) - br1(j,k)=br1_g(j0_g+j-1,k0_g+k-1) - enddo - enddo -!$omp target update to(br0,br1) -c - deallocate(br0_g) - deallocate(br1_g) -c - end subroutine -c####################################################################### - subroutine write_validation_solution -c -c----------------------------------------------------------------------- -c -c ****** Write out the tilted dipole analytic solution for validation. -c ****** This writes out the phi, B, and B computed from phi. -c -c----------------------------------------------------------------------- -c - use number_types - use global_dims - use global_mesh - use local_dims_r - use local_mesh_r - use local_dims_tp - use local_mesh_tp - use fields - use vars - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: one=1.0_r_typ - real(r_typ), parameter :: two=2.0_r_typ -c -c----------------------------------------------------------------------- -c - integer :: i,j,k - character(256) :: fnamephi,fnamebr,fnamebt,fnamebp -c -c----------------------------------------------------------------------- -c -c ****** Set phi. -c - do concurrent (k=1:np, j=1:nt, i=1:nr) - phi(i,j,k)=(-one/rh(i)**2)*(cth(j)*cos(dipole_angle)+ - & sth(j)*cph(k)*sin(dipole_angle)) - enddo -c - call set_boundary_points (phi,one) - call seam (phi,nr,nt,np) -c -!$omp target enter data map(alloc:br,bt,bp) -c -c ****** Set Br. 
-c - do concurrent (k=1:np, j=1:nt, i=1:nrm1) - br(i,j,k)=(two/r(i)**3)*(cth(j)*cos(dipole_angle) + - & sth(j)*cph(k)*sin(dipole_angle)) - enddo -c -c ****** Set Bt. -c - do concurrent (k=1:np, j=1:ntm1, i=1:nr) - bt(i,j,k)=(one/rh(i)**3)*(st(j)*cos(dipole_angle) - - & ct(j)*cph(k)*sin(dipole_angle)) - enddo -c -c ****** Set Bp. -c - do concurrent (k=1:npm1, j=1:nt, i=1:nr) - bp(i,j,k)=(one/rh(i)**3)*(sph(k)*sin(dipole_angle)) - enddo -c -c ****** Due to half mesh trig issues on periodic domain, we need -c ****** to seam these even though seaming is not needed in getb(). -c - call seam (br,nrm,nt,np) - call seam (bt,nr,ntm,np) - call seam (bp,nr,nt,npm) -c -c ****** Backup fnames and set new ones so we can reuse write routine. -c - fnamephi=phifile - phifile='phi_exact.'//trim(fmt) - fnamebr=brfile - brfile='br_exact.'//trim(fmt) - fnamebt=btfile - btfile='bt_exact.'//trim(fmt) - fnamebp=bpfile - bpfile='bp_exact.'//trim(fmt) -c - call write_solution -c -c ****** Restore original output names. -c - phifile=fnamephi - brfile=fnamebr - btfile=fnamebt - bpfile=fnamebp -c -c ****** Reset phi. -c - phi(:,:,:)=0. -!$omp target update to(phi) -!$omp target exit data map(delete:br,bt,bp) -c - end subroutine -c####################################################################### - subroutine solve (x,rhs,N,ierr) -c -c----------------------------------------------------------------------- -c -c ****** Solve the implicit equations iteratively. -c -c----------------------------------------------------------------------- -c -c ****** Return IERR=0 if the iteration converges; otherwise, -c ****** IERR is set to a nonzero value. -c -c ****** X is the initial guess at the solution. -c ****** RHS is the right-hand side. -c -c----------------------------------------------------------------------- -c - use number_types - use cgcom - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: N - real(r_typ), dimension(N) :: x,rhs - integer :: ierr -c -c----------------------------------------------------------------------- -c -c ****** Solve the equations using the CG method. -c - call cgsolve (x,rhs,N,ierr) -c -c ****** Check for convergence. -c - if (ierr.ne.0) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in SOLVE:' - write (*,*) '### The field solver did not converge.' - write (*,*) 'IERR = ',ierr - write (*,100) ncg,epsn - 100 format (1x,'N = ',i6,' EPSN = ',1pe13.6) - end if - else - if (iamp0) then - write (*,*) - write (*,*) '### COMMENT from SOLVE:' - write (*,*) '### The field solver converged.' - write (*,*) 'Number of iterations = ',ncg - write (9,*) - write (9,*) '### COMMENT from SOLVE:' - write (9,*) '### The field solver converged.' - write (9,*) 'Number of iterations = ',ncg - end if - end if -c - return - end -c####################################################################### - subroutine cgsolve (x,r,N,ierr) -c -c----------------------------------------------------------------------- -c -c ****** Solve the linear system: -c -c A * x = b -c -c ****** using the classical Conjugate Gradient method for symmetric -c ****** and positive-definite matrices. -c -c----------------------------------------------------------------------- -c -c ****** On input, X(N) contains a guess at the solution, and -c ****** R(N) contains the right-hand side, b. 
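A compact, self-contained sketch of the preconditioned CG structure that CGSOLVE implements, here with a Jacobi (diagonal) preconditioner on a small SPD tridiagonal system; all names and sizes are illustrative, not from the patch:

program cg_demo
  implicit none
  integer, parameter :: n = 50, itmax = 200
  real(8), parameter :: tol = 1.d-10
  real(8) :: x(n), r(n), p(n), ap(n), z(n), d(n)
  real(8) :: rz, rz_old, alpha, beta, bnorm
  integer :: it
  ! SPD system: A = tridiag(-1,2,-1), b = 1, initial guess x0 = 0.
  d = 2.d0                          ! diagonal of A, also the preconditioner
  x = 0.d0
  r = 1.d0                          ! r = b - A*x0 = b
  bnorm = sqrt(sum(r*r))
  z = r/d                           ! z = M^{-1} r
  p = z
  rz = dot_product(r,z)
  do it = 1, itmax
    call amul (p, ap)
    alpha = rz/dot_product(p,ap)
    x = x + alpha*p
    r = r - alpha*ap
    if (sqrt(sum(r*r))/bnorm < tol) exit
    z = r/d
    rz_old = rz
    rz = dot_product(r,z)
    beta = rz/rz_old
    p = z + beta*p
  end do
  print *, 'iterations:', it, ' relative residual:', sqrt(sum(r*r))/bnorm
contains
  subroutine amul (v, av)           ! av = A*v for A = tridiag(-1,2,-1)
    real(8), intent(in) :: v(n)
    real(8), intent(out) :: av(n)
    integer :: i
    do i = 1, n
      av(i) = 2.d0*v(i)
      if (i > 1) av(i) = av(i) - v(i-1)
      if (i < n) av(i) = av(i) - v(i+1)
    end do
  end subroutine
end program cg_demo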
-c -c ****** On exit, X contains an estimate to the solution, and -c ****** R contains the residual (b-Ax). -c -c ****** IERR=0 indicates that the solution converged to the -c ****** requested accuracy. Other values indicate that the -c ****** iteration did not converge for the given maximum number -c ****** of iterations. -c -c----------------------------------------------------------------------- -c - use number_types - use cgcom -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: N - real(r_typ), dimension(N) :: x,r - integer :: ierr,i -c -c----------------------------------------------------------------------- -c -c ****** Scratch space for the CG iteration vectors. -c - real(r_typ), dimension(N), target :: p,ap -c -c----------------------------------------------------------------------- -c - real(r_typ), external :: cgdot -c -c----------------------------------------------------------------------- -c - real(r_typ) :: bdotb,rdotr,pdotap,alphai,rdotr_old,betai -c -c----------------------------------------------------------------------- -c - ncg=0 -!$omp target enter data map(alloc:p,ap) -c -c ****** Get the norm of the RHS. -c - do concurrent (i=1:N) - p(i)=r(i) - enddo -c - call prec_inv (p) - bdotb=cgdot(r,p,N) -c -c ****** If the RHS is zero, return with a zero solution. -c - if (bdotb.eq.0.) then - do concurrent (i=1:N) - x(i)=0. - enddo - epsn=0. - ierr=0 - return - end if -c -c----------------------------------------------------------------------- -c ****** Initialization. -c----------------------------------------------------------------------- -c - call ax (x,ap,N) -c - do concurrent (i=1:N) - r(i)=r(i)-ap(i) - p(i)=r(i) - enddo -c -c ****** Find the initial error norm. -c - call prec_inv (p) - rdotr=cgdot(r,p,N) -c - call ernorm (bdotb,rdotr,ierr) - if (ierr.ge.0) return -c -c----------------------------------------------------------------------- -c ****** Main iteration loop. -c----------------------------------------------------------------------- -c - do - ncg=ncg+1 -c - call ax (p,ap,N) -c - pdotap=cgdot(p,ap,N) - alphai=rdotr/pdotap -c - do concurrent (i=1:N) - x(i)=x(i)+alphai*p(i) - r(i)=r(i)-alphai*ap(i) - ap(i)=r(i) - enddo -c - call prec_inv (ap) - rdotr_old=rdotr - rdotr=cgdot(r,ap,N) -c -c ****** Check for convergence. -c - call ernorm (bdotb,rdotr,ierr) - if (ierr.ge.0) exit -c - betai=rdotr/rdotr_old -c - do concurrent (i=1:N) - p(i)=betai*p(i)+ap(i) - enddo -c - enddo -c -!$omp target exit data map(delete:p,ap) - end subroutine -c####################################################################### - subroutine ernorm (bdotb,rdotr,ierr) -c -c----------------------------------------------------------------------- -c -c ****** This subroutine checks if the iterative solver has -c ****** converged or if the maximum allowed number of iterations, -c ****** NCGMAX, has been exceeded. -c -c----------------------------------------------------------------------- -c -c ****** Convergence is deemed to have occurred when: -c ****** -c ****** ||R||/||B|| .lt. EPSCG -c ****** -c ****** where ||R|| is the norm of the (preconditioned) -c ****** residual, ||B|| is the norm of the (preconditioned) -c ****** RHS, and EPSCG is the specified convergence criterion. -c -c ****** Set IERR=0 if the error is below the error criterion -c ****** (i.e., the solution has converged). 
-c ****** Set IERR=-1 if the error does not yet meet the error -c ****** criterion and the number of iterations is less than NCGMAX. -c ****** Set IERR=1 if the maximum number of iterations has -c ****** been exceeded without convergence. -c -c----------------------------------------------------------------------- -c -c ****** On input, BDOTB has the dot product of the RHS vector -c ****** with itself, weighted by the preconditioning matrix. -c ****** Similarly, RDOTR has the dot product of the residual vector -c ****** with itself, weighted by the preconditioning matrix. -c ****** This is used to normalize the error estimate. -c -c----------------------------------------------------------------------- -c - use number_types - use cgcom - use mpidefs - use vars -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ) :: bdotb,rdotr - integer :: ierr -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: one=1._r_typ -c -c----------------------------------------------------------------------- -c - real(r_typ) :: epssq -c -c----------------------------------------------------------------------- -c - ierr=-1 -c - epssq=rdotr/bdotb - epsn=sign(one,epssq)*sqrt(abs(epssq)) -c - if (ncghist.gt.0) then -c - if (mod(ncg,ncghist).eq.0) then - if (iamp0) then - if (ncg.eq.0) then - write (9,*) - write (9,*) '### Convergence information:' - write (*,*) - end if - write (*,100) ncg,epsn - FLUSH(OUTPUT_UNIT) - write (9,100) ncg,epsn - FLUSH(9) - 100 format (1x,'Iteration: ',i8,' Residual: ',1pe23.15) - end if - end if - end if -c -c ****** Check for convergence. -c - if (epsn.lt.epscg) then - if (ncghist.gt.0) then - if (iamp0) then - write (*,100) ncg,epsn - FLUSH(OUTPUT_UNIT) - write (9,*) - write (9,*) '### The CG solver has converged.' - write (9,100) ncg,epsn - FLUSH(9) - end if - end if - ierr=0 - else if (ncg.ge.ncgmax) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in ERNORM:' - write (*,*) '### Exceeded maximum number of iterations.' - write (*,*) 'NCGMAX = ',ncgmax - write (*,*) 'EPSN = ',epsn - FLUSH(OUTPUT_UNIT) - write (9,*) - write (9,*) '### ERROR in ERNORM:' - write (9,*) '### Exceeded maximum number of iterations.' - write (9,*) 'NCGMAX = ',ncgmax - write (9,*) 'EPSN = ',epsn - FLUSH(9) - end if - ierr=1 - end if -c - return - end -c####################################################################### - subroutine alloc_pot3d_matrix_coefs -c -c----------------------------------------------------------------------- -c -c ****** Allocate the arrays in which the matrix coefficients -c ****** for the pot3d solve are stored. -c -c----------------------------------------------------------------------- -c - use matrix_storage_pot3d_solve - use cgcom - use local_dims_r - use local_dims_tp -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - allocate (a(2:nrm1,2:ntm1,2:npm1,7)) - a=0. - allocate (a_i(N)) - a_i=0. 
-c - if (ifprec.eq.2) then - allocate (a_csr(M)) - allocate (lu_csr(M)) - allocate (lu_csr_ja(M)) - allocate (a_csr_ja(M)) - allocate (a_csr_ia(1+N)) - allocate (a_N1(N)) - allocate (a_N2(N)) - allocate (a_csr_d(N)) - allocate (a_csr_dptr(N)) - endif -c - return - end -c####################################################################### - subroutine dealloc_pot3d_matrix_coefs -c -c----------------------------------------------------------------------- -c -c ****** Deallocate the arrays in which the matrix coefficients -c ****** for the pot3d solve are stored. -c -c----------------------------------------------------------------------- -c - use matrix_storage_pot3d_solve - use cgcom - use cusparse_interface -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - deallocate (a) - deallocate (a_i) -!$omp target exit data map(delete:a,a_i) -c - if (ifprec.eq.2) then -#ifdef CUSPARSE - call unload_lusol_cusparse -!$omp target exit data map(delete:a_csr,a_csr_ja,a_csr_ia) -#endif - deallocate (a_csr) - deallocate (lu_csr) - deallocate (lu_csr_ja) - deallocate (a_csr_ia) - deallocate (a_csr_ja) - deallocate (a_csr_d) - deallocate (a_N1) - deallocate (a_N2) - deallocate (a_csr_dptr) - endif -c - end subroutine -c####################################################################### - subroutine load_matrix_pot3d_solve -c -c----------------------------------------------------------------------- -c -c ****** Load the matrix coefficients for the pot3d solve. -c -c----------------------------------------------------------------------- -c - use number_types - use matrix_storage_pot3d_solve - use local_dims_r - use local_mesh_r - use local_dims_tp - use local_mesh_tp -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: i,j,k -c -c----------------------------------------------------------------------- -c -c ****** Set matrix coefs -c - do k=2,npm1 - do j=2,ntm1 - do i=2,nrm1 -c a*ps(i,j,k-1): - a(i,j,k,1)=-drh(i)*dth(j)*sth_i(j)*dp_i(k-1) -c a*ps(i,j-1,k): - a(i,j,k,2)=-drh(i)*dph(k)*st(j-1)*dt_i(j-1) -c a*ps(i-1,j,k): - a(i,j,k,3)=-sth(j)*dth(j)*dph(k)*r2(i-1)*dr_i(i-1) -c a*ps(i+1,j,k): - a(i,j,k,5)=-sth(j)*dth(j)*dph(k)*r2(i )*dr_i(i ) -c a*ps(i,j+1,k): - a(i,j,k,6)=-drh(i)*dph(k)*st(j )*dt_i(j ) -c a*ps(i,j,k+1): - a(i,j,k,7)=-drh(i)*dth(j)*sth_i(j)*dp_i(k ) -c -c a*ps(i,j,k): - a(i,j,k,4)=-(a(i,j,k,1)+a(i,j,k,2)+a(i,j,k,3)+ - & a(i,j,k,5)+a(i,j,k,6)+a(i,j,k,7)) - enddo - enddo - enddo -c - return - end -c####################################################################### - subroutine load_preconditioner_pot3d_solve -c -c----------------------------------------------------------------------- -c -c ****** Load the preconditioner for the pot3d solve. 
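-c
-c ****** For IFPREC=1, this stores the inverse of the matrix diagonal
-c ****** (point-Jacobi scaling). For IFPREC=2, the matrix is
-c ****** converted to CSR format and factored with ILU(0); when the
-c ****** code is built with the CUSPARSE preprocessor definition the
-c ****** factorization is loaded into the cuSPARSE solver instead.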
-c
-c-----------------------------------------------------------------------
-c
- use number_types
- use matrix_storage_pot3d_solve
- use cgcom
- use local_dims_r
- use local_dims_tp
- use, intrinsic :: iso_c_binding
- use cusparse_interface
- use timing
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- real(r_typ), parameter :: one=1._r_typ
-c
-c-----------------------------------------------------------------------
-c
- integer :: i,j,k,icode,ii
-c
-c-----------------------------------------------------------------------
-c
- call timer_on
-c
- if (ifprec.eq.0) return
-c
- if (ifprec.eq.1) then
-c
-c ****** Diagonal scaling:
-c
- ii=0
- do k=2,npm1
- do j=2,ntm1
- do i=2,nrm1
- ii=ii+1
- a_i(ii)=real(one/a(i,j,k,4),r_typ_pc)
- enddo
- enddo
- enddo
-c
- elseif (ifprec.eq.2) then
-c
-c ****** Convert A matrix into CSR format:
-c
- call diacsr (N,M,a,a_offsets,a_csr,a_csr_ja,
- & a_csr_ia,a_csr_dptr)
-#ifdef CUSPARSE
- cN=N
- cM=M
-!$omp target enter data map(to:a_csr,a_csr_ja,a_csr_ia)
-!$omp target data use_device_ptr(a_csr,a_csr_ja,a_csr_ia)
- call load_lusol_cusparse (C_LOC(a_csr(1)),
- & C_LOC(a_csr_ia(1)),
- & C_LOC(a_csr_ja(1)),cN,cM)
-!$omp end target data
-#else
-c
-c ****** Overwrite CSR A with preconditioner L and U matrices:
-c
-c ****** Incomplete LU (ILU)
-c
- icode=0
- call ilu0 (N,M,a_csr,a_csr_ja,a_csr_ia,a_csr_dptr,icode)
-c
- if (icode.ne.0) then
- print*, '### ERROR IN ILU FORMATION'
- endif
-c
-c ****** Convert LU stored in A to LU matrix in optimized layout.
-c
- call lu2luopt (N,M,lu_csr,a_csr,a_csr_ia,a_csr_ja,lu_csr_ja,
- & a_csr_dptr,a_N1,a_N2)
-c
-c ****** Store inverse of diagonal of LU matrix.
-c
- do i=1,N
- a_csr_d(i)=one/a_csr(a_csr_dptr(i))
- enddo
-c
-#endif
- endif
-c
- call timer_off (t_pc_load)
-c
- end subroutine
-c#######################################################################
- subroutine ilu0 (N,M,A,JA,IA,A_da,icode)
-c
-c-----------------------------------------------------------
-c
-c Set-up routine for ILU(0) preconditioner. This routine
-c computes the L and U factors of the ILU(0) factorization
-c of a general sparse matrix A stored in CSR format with
-c 1-based indexing. Since L is unit triangular, the L and U
-c factors can be stored as a single matrix which occupies
-c the same storage as A. New ja and ia arrays are not needed
-c for the LU matrix since the pattern of the LU matrix is
-c identical with that of A.
-c
-c Original Author: Yousef Saad
-c Iterative Methods for Sparse Linear Systems 2nd Ed. pg. 309
-c Modified by R.M. Caplan
-c
-c-----------------------------------------------------------
-c INPUT:
-c N : Dimension of matrix
-c A, JA, IA : Sparse matrix in CSR sparse storage format
-c A_da : Pointers to the diagonal elements in the CSR
-c data structure A
-c
-c OUTPUT:
-c A : L/U matrices stored together. On return A,
-c JA, and IA are the combined CSR data structure for
-c the L and U factors.
-c icode : Integer indicating error code on return:
-c (0): Normal return.
-c (k): Encountered a zero pivot at step k.
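-c
-c NOTE: ILU(0) produces approximate factors restricted to the
-c sparsity pattern of A (no fill-in is allowed), i.e. A ~= L*U
-c with L and U nonzero only where A is, which is why the
-c factorization fits in A's existing storage.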
-c------------------------------------------------------------
-c
- use number_types
- use number_types_pc
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- integer :: JA(M),IA(N+1),A_da(N),iw(N)
- integer :: icode,N,M
- real(r_typ_pc) :: A(M)
-c
-c-----------------------------------------------------------------------
-c
- integer :: i,ik,kj,k,ij,IA_i,IA_ip1m1
- real(r_typ_pc) :: Aik
-c
-c-----------------------------------------------------------------------
-c
- icode=0
-c Initialize scratch index array:
- iw(:)=0
-c
- do i=2,N
-c Store index of (i,j) in A in scratch array of iw(j=1:N)
-c This allows lookup given a column index (j) in row (k)
-c to see if the column is in row (i).
- IA_i =IA(i)
- IA_ip1m1=IA(i+1)-1
-c
- do ij=IA_i,IA_ip1m1
- iw(JA(ij))=ij
- enddo
-c
-c Loop from first element in row i to 1 less than diagonal elem:
- do ik=IA_i,A_da(i)-1 !IA(i+1) !ik is index of (i,k) in A[]
- k =JA(ik) !Actual column index in matrix (k)
- Aik =A(ik)/A(A_da(k)) !Save Aik for next loop as an optim.
- A(ik)=Aik
-c
-c Loop from 1 more than diag elem to last elem in row k:
- do kj=A_da(k)+1,IA(k+1)-1 !kj is index of (k,j) in A[]
-c Get ij location from scratch array (if 0, no ij present)
- ij=iw(JA(kj))
- if (ij .ne. 0) then
- A(ij)=A(ij)-Aik*A(kj)
- endif
- enddo
- enddo
-c
- if (A(ik).eq.0) then
- icode=i
- exit
- endif
-c
-c Reset scratch index array:
- do ij=IA_i,IA_ip1m1
- iw(JA(ij))=0
- enddo
- enddo
-c
- end subroutine
-c#######################################################################
- subroutine lu2luopt (N,M,LU,A,IA,JA,LUJA,A_da,N1,N2)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Re-order elements of LU matrix in CSR format into custom,
-c ****** optimized format for use with lusol().
-c ****** (Eventually, this could be merged with the ilu0 and/or diacsr)
-c
-c-----------------------------------------------------------------------
-c
- use number_types
- use number_types_pc
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- integer :: N,M
- integer :: JA(M),LUJA(M),IA(N+1),A_da(N)
- integer :: N1(N),N2(N)
- real(r_typ_pc) :: A(M),LU(M)
-c
-c-----------------------------------------------------------------------
-c
- integer :: i,k,ii
-c
-c-----------------------------------------------------------------------
-c
- ii=0
-c
- do i=1,N
- do k=IA(i),A_da(i)-1
- ii=ii+1
- LU(ii)=A(k)
- LUJA(ii)=JA(k)
- enddo
-c
-c Store k1 and k2 ranges for lusolve:
-c
- N1(i)=A_da(i)-1-IA(i)
- N2(i)=IA(i+1)-2-A_da(i)
- enddo
-c
- do i=N,1,-1
- do k=A_da(i)+1,IA(i+1)-1
- ii=ii+1
- LU(ii)=A(k)
- LUJA(ii)=JA(k)
- enddo
- enddo
-c
- end subroutine
-c#######################################################################
- subroutine diacsr (N,M,Adia,ioff,Acsr,JA,IA,Adptr)
-c
-c-----------------------------------------------------------------------
-c
-c *** DIACSR converts a solver matrix in a MAS-style
-c diagonal format to standard compressed sparse row (CSR)
-c including periodic coefficients when nproc_p=1.
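-c
-c For reference, an illustrative 3x3 example of CSR storage:
-c
-c | 1 0 2 | Acsr = (1,2,3,4,5)
-c | 0 3 0 | --> JA = (1,3,2,1,3)
-c | 4 0 5 | IA = (1,3,4,6)
-c
-c where JA holds the column index of each stored value and
-c IA(i) points to the start of row i in Acsr (IA(N+1)=M+1).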
-c
-c Author of original diacsr: Youcef Saad
-c Modifications for MAS: RM Caplan
-c
-c Input:
-c N: Size of the matrix (NxN)
-c M: Number of non-zero entries in matrix
-c (computed with getM())
-c Adia(N,IDIAG): The matrix in modified "DIA" format
-c ioff(IDIAG): Offsets of the diagonals in A.
-c
-c Output:
-c Acsr(M), JA(M), IA(N+1): The matrix A in CSR.
-c Adptr(N): Pointers to diag elements in A,
-c [e.g. A(i,i) == A(Adptr(i))]
-c
-c-----------------------------------------------------------------------
-c
- use number_types
- use number_types_pc
- use local_dims_r
- use local_dims_tp
- use mpidefs
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- integer, parameter :: IDIAG=7
-c
-c-----------------------------------------------------------------------
-c
- real (r_typ_pc) :: Acsr(M)
- real (r_typ) :: Adia(N,IDIAG)
- integer :: N,M
- integer :: Adptr(N)
- integer :: IA(N+1)
- integer :: JA(M)
- integer :: ioff(IDIAG)
-c
-c-----------------------------------------------------------------------
-c
- integer :: i,j,jj,mi,mj,mk,ko,x
- integer :: ioffok(IDIAG)
-c
-c-----------------------------------------------------------------------
-c
- x=0
-c
- IA(1)=1
- ko=1
- i=0
-c
- do mk=2,npm1
- do mj=2,ntm1
- do mi=2,nrm1
-c ********* Set index of value and column indices array:
- i=i+1
-c
-c ********* Do not add coefs that multiply boundaries:
-c For each boundary, there is a sub-set of coefs in the
-c matrix row that should not be added.
-c This makes "local" matrices have no bc info
-c
-c ********* Reset "i-offset-ok-to-use-coef-jj" array:
-c
- ioffok(:)=1
-c
- if (mi.eq.2) then
- ioffok(3)=0;
- endif
-c
- if (mi.eq.nrm1) then
- ioffok(5)=0;
- endif
-c
- if (mj.eq.2) then
- ioffok(2)=0;
- endif
-c
- if (mj.eq.ntm1) then
- ioffok(6)=0;
- endif
-c
-c ********* Eliminate periodic coefs in the case nproc_p>1
-c
- if (nproc_p.gt.1) then
- if (mk.eq.2) then
- ioffok(1)=0
- endif
- if (mk.eq.npm1) then
- ioffok(7)=0
- endif
- endif
-c
-c ********* To handle periodicity of phi in nproc_p=1 case:
-c We want CSR matrix to be in order so
-c have to sweep three times to avoid sorting:
-c
-c ********* Add periodic coefs of "right side":
-c
- do jj=1,IDIAG
- if (ioffok(jj).eq.1) then
- j=i+ioff(jj)-x
- if (j.gt.N-x) then
- j=j-N
- Acsr(ko)=real(Adia(i,jj),r_typ_pc)
- JA(ko)=j
- ko=ko+1
- endif
- endif
- enddo
-c
-c ********* Now do non-periodic coefs:
-c
- do jj=1,IDIAG
- if (ioffok(jj).eq.1) then
- j=i+ioff(jj)-x
- if (j.ge.1.and.j.le.N-x) then
-c Store pointer to diagonal elements in A:
- if (jj.eq.4) Adptr(i)=ko
- Acsr(ko)=real(Adia(i,jj),r_typ_pc)
- JA(ko)=j
- ko=ko+1
- endif
- endif
- enddo
-c
-c ********* Now do periodic coefs of "left side":
-c
- do jj=1,IDIAG
- if (ioffok(jj).eq.1) then
- j=i+ioff(jj)-x
- if (j.lt.1) then
- j=N+j
- Acsr(ko)=real(Adia(i,jj),r_typ_pc)
- JA(ko)=j
- ko=ko+1
- endif
- endif
- enddo
-c
-c ********* Set row offset:
-c
- IA(i+1)=ko-x
- enddo
- enddo
- enddo
-c
- return
- end
-c#######################################################################
- subroutine getM (N, ioff, M)
-c
-c-----------------------------------------------------------------------
-c
-c *** This routine computes the number of non-zeros in the
-c solver matrix for use with allocating the matrices.
-c See diacsr() for description of inputs.
-c
-c Output: M # of nonzeros.
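-c
-c This routine mirrors the three sweeps of diacsr(), counting
-c the entries that would be stored instead of storing them,
-c so M can be used to allocate the CSR arrays beforehand.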
-c
-c-----------------------------------------------------------------------
-c
- use mpidefs
- use local_dims_r
- use local_dims_tp
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- integer, parameter :: IDIAG=7
- integer :: N,M,i,j,jj,ko,mi,mj,mk,x
- integer :: ioff(IDIAG)
- integer :: ioffok(IDIAG)
-c
- x=0
-c
- ko=1
- i=0
-c
- do mk=2,npm1
- do mj=2,ntm1
- do mi=2,nrm1
-c
-c Advance the row index (mirrors diacsr()):
- i=i+1
-c
- ioffok(:)=1
-c
- if (mi.eq.2) then
- ioffok(3)=0;
- endif
-c
- if (mi.eq.nrm1) then
- ioffok(5)=0;
- endif
-c
- if (mj.eq.2) then
- ioffok(2)=0;
- endif
-c
- if (mj.eq.ntm1) then
- ioffok(6)=0;
- endif
-c
-c ********* Eliminate periodic coefs in the case nproc_p>1
-c
- if (nproc_p.gt.1) then
- if (mk.eq.2) then
- ioffok(1)=0
- endif
- if (mk.eq.npm1) then
- ioffok(7)=0
- endif
- endif
-c
- do jj=1,IDIAG
- if (ioffok(jj).eq.1) then
- j=i+ioff(jj)-x
- if (j.gt.N-x) then
- ko=ko+1
- endif
- endif
- enddo
-c
- do jj=1,IDIAG
- if (ioffok(jj).eq.1) then
- j=i+ioff(jj)-x
- if (j.ge.1.and.j.le.N-x) then
- ko=ko+1
- endif
- endif
- enddo
-c
- do jj=1,IDIAG
- if (ioffok(jj).eq.1) then
- j=i+ioff(jj)-x
- if (j.lt.1) then
- ko=ko+1
- endif
- endif
- enddo
- enddo
- enddo
- enddo
-c
-c *** Save number of non-zeros of matrix:
-c
- M=ko-1
-c
- return
- end
-c#######################################################################
- subroutine ax (x,y,N)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Set y = A * x.
-c
-c-----------------------------------------------------------------------
-c
- use number_types
- use local_dims_r
- use local_dims_tp
- use fields, ONLY : x_ax
- use timing
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- real(r_typ), parameter :: zero=0._r_typ
-c
-c-----------------------------------------------------------------------
-c
- integer :: N
- real(r_typ), dimension(N) :: x,y
-c
-c-----------------------------------------------------------------------
-c
-c ****** Expand X array to allow for boundary and seam values.
-c
- call unpack_scalar (x_ax,x)
-c
-c ****** Set the boundary values of X.
-c
- call set_boundary_points (x_ax,zero)
-c
-c ****** Seam along edges between processors.
-c
- call seam (x_ax,nr,nt,np)
-c
-c ****** Get the matrix-vector product.
-c
- call timer_on
-c
- call delsq (x_ax,y)
-c
- call timer_off (t_ax)
-c
- end subroutine
-c#######################################################################
- subroutine prec_inv (x)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Apply preconditioner: x := M(inv) * x.
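-c
-c ****** For IFPREC=1 this is a pointwise multiply by the stored
-c ****** inverse diagonal; for IFPREC=2 it is the ILU(0)
-c ****** forward/backward substitution (via lusol(), or cuSPARSE
-c ****** when built with the CUSPARSE preprocessor definition).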
-c
-c-----------------------------------------------------------------------
-c
- use number_types
- use number_types_pc
- use cgcom
- use solve_params
- use matrix_storage_pot3d_solve
- use, intrinsic :: iso_c_binding
- use cusparse_interface
- use timing
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- real(r_typ), dimension(N) :: x
- real(r_typ_pc), dimension(N) :: x_32
- integer :: i
-c
-c-----------------------------------------------------------------------
-c
- if (ifprec.eq.0) return
-c
- call timer_on
-c
- if (ifprec.eq.1) then
-c
-c ****** Point-Jacobi (diagonal scaling):
-c
- do concurrent (i=1:N)
- x(i)=a_i(i)*x(i)
- enddo
-c
- elseif (ifprec.eq.2) then
-c
-c ****** ILU0 Partial-Block-Jacobi:
-c
-#ifdef CUSPARSE
-!$omp target data use_device_ptr(x)
- call lusol_cusparse(C_LOC(x(1)))
-!$omp end target data
-c
-#else
-!$omp target update from(x)
-c
-c ****** Convert input array to single precision.
-c
- do i=1,N
- x_32(i) = real(x(i),r_typ_pc)
- enddo
-c
- call lusol (N,M,x_32,lu_csr,lu_csr_ja,a_N1,a_N2,a_csr_d)
-c
-c ****** Convert result back to double precision.
-c
- do i=1,N
- x(i) = real(x_32(i),r_typ)
- enddo
-c
-!$omp target update to(x)
-#endif
- endif
-c
- call timer_off (t_pc)
-c
- end subroutine
-c#######################################################################
- subroutine lusol (N,M,x,LU,LU_ja,N1,N2,LUd_i)
-c
-c-----------------------------------------------------------
-c
-c Performs a forward and a backward solve for the sparse system
-c (LU) x=y where LU is in an optimized custom CSR format
-c (see lu2luopt())
-c
-c For use where LU is an ILU or SSOR/SGS factorization.
-c
-c Author of original lusol: Yousef Saad
-c Iterative Methods for Sparse Linear Systems 2nd Ed. pg. 299
-c
-c Modified by RM Caplan to include optimized memory access
-c as described in
-c B. Smith, H. Zhang Inter. J. of High Perf. Comp. Appl.
-c Vol. 25 #4 pg. 386-391 (2011)
-c
-c-----------------------------------------------------------
-c PARAMETERS:
-c N : Dimension of problem
-c x : At input, x is rhs (y), at output x is the solution.
-c LU : Values of the LU matrix. L and U are stored together in
-c order of access in this routine.
-c LU_ja : Column indices of elements in LU.
-c N1 : Row-start indices in original CSR LU.
-c N2 : Indices of diagonal elements in orig CSR LU
-c LUd_i : Inverse diagonal elements of U
-c------------------------------------------------------------
-c
- use number_types
- use number_types_pc
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- real(r_typ_pc) :: x(N),LUd_i(N),LU(M)
- integer :: N1(N),N2(N),LU_ja(M)
- integer :: N,M
-c
-c-----------------------------------------------------------------------
-c
- integer :: i,k,k1,k2
-c
-c-----------------------------------------------------------------------
-c
-c ****** FORWARD SOLVE: Solve L x'=y
-c
- k2=0
- do i=1,N
-c Compute x(i) := x(i) - sum L(i,j) * x(j)
- k1=k2+1
- k2=k1+N1(i)
- do k=k1,k2
- x(i)=x(i)-LU(k)*x(LU_ja(k))
- enddo
-c Diagonal is always 1 for L so no division here is necessary.
- enddo -c -c ****** BACKWARD SOLVE: Solve U x=x' -c - do i=N,1,-1 -c Compute x(i) := x(i) - sum U(i,j) * x(j) - k1=k2+1 - k2=k1+N2(i) - do k=k1,k2 - x(i)=x(i)-LU(k)*x(LU_ja(k)) - enddo -c Compute x(i) := x(i) / U(i,i) - x(i)=x(i)*LUd_i(i) - enddo -c - end subroutine -c####################################################################### - subroutine unpack_scalar (s,x) -c -c----------------------------------------------------------------------- -c -c ****** Unpack the scalar x into -c ****** three-dimensional array s leaving room for boundaries. -c -c----------------------------------------------------------------------- -c - use number_types - use local_dims_r - use local_dims_tp -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), dimension(nr,nt,np) :: s - real(r_typ), dimension(2:nrm1,2:ntm1,2:npm1) :: x -c -c----------------------------------------------------------------------- -c - integer :: i,j,k -c -c----------------------------------------------------------------------- -c - do concurrent (k=2:npm1, j=2:ntm1, i=2:nrm1) - s(i,j,k)=x(i,j,k) - enddo -c - return - end -c####################################################################### - subroutine delsq (x,y) -c -c----------------------------------------------------------------------- -c -c ****** Set Y = - (dV * del-squared X) at the internal points. -c -c----------------------------------------------------------------------- -c - use number_types - use local_dims_r - use local_dims_tp - use matrix_storage_pot3d_solve -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), dimension(nr,nt,np) :: x - real(r_typ), dimension(2:nrm1,2:ntm1,2:npm1) :: y -c -c----------------------------------------------------------------------- -c - integer :: i,j,k -c -c----------------------------------------------------------------------- -c - do concurrent (k=2:npm1, j=2:ntm1, i=2:nrm1) - y(i,j,k)=a(i,j,k,1)*x(i ,j ,k-1) - & +a(i,j,k,2)*x(i ,j-1,k ) - & +a(i,j,k,3)*x(i-1,j ,k ) - & +a(i,j,k,4)*x(i ,j ,k ) - & +a(i,j,k,5)*x(i+1,j ,k ) - & +a(i,j,k,6)*x(i ,j+1,k ) - & +a(i,j,k,7)*x(i ,j ,k+1) - enddo -c - return - end -c####################################################################### - subroutine set_boundary_points (x,vmask) -c -c----------------------------------------------------------------------- -c -c ****** Set boundary points of X at the physical boundaries. -c -c----------------------------------------------------------------------- -c - use number_types - use global_mesh - use local_dims_r - use local_mesh_r - use local_dims_tp - use local_mesh_tp - use fields - use solve_params - use vars, ONLY : validation_run -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), dimension(nr,nt,np) :: x - real(r_typ) :: vmask,temp_sum0 -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: two=2._r_typ -c -c----------------------------------------------------------------------- -c - integer :: i,j,k -c -c----------------------------------------------------------------------- -c -c ****** Set X at the radial boundaries. 
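-c
-c ****** A one-sided difference at r=R0 imposes d(phi)/dr = Br
-c ****** from BR0, scaled by VMASK (the solver passes VMASK=0 to
-c ****** apply the homogeneous condition). At r=R1, either the
-c ****** validation-run condition from BR1 or the condition
-c ****** selected by the factor PM_R1 is applied.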
-c - if (rb0) then - do concurrent (k=2:npm1, j=2:ntm1) - x( 1,j,k)=x(2,j,k)-vmask*br0(j,k)*dr1 - enddo - end if -c - if (rb1) then - if (validation_run) then - do concurrent (k=2:npm1, j=2:ntm1) - x(nr,j,k)=x(nrm1,j,k)+vmask*br1(j,k)*drn - enddo - else - do concurrent (k=2:npm1, j=2:ntm1) - x(nr,j,k)=pm_r1*x(nrm1,j,k) - enddo - end if - end if -c -c ****** If this processor does not contain any points at the -c ****** pole, return. -c - if (.not.(tb0.or.tb1)) return -c -c ****** Get the m=0 component of X at the poles. -c - if (tb0) then - do concurrent (i=1:nr) - sum0(i)=0 - enddo -!$omp target teams loop - do i=1,nr - temp_sum0=0. -!$omp loop reduction(+:temp_sum0) - do k=2,npm1 - temp_sum0=temp_sum0+x(i,2,k)*dph(k)*pl_i - enddo - sum0(i)=temp_sum0 - enddo - end if -c - if (tb1) then - do concurrent (i=1:nr) - sum1(i)=0 - enddo -!$omp target teams loop - do i=1,nr - temp_sum0=0. -!$omp loop reduction(+:temp_sum0) - do k=2,npm1 - temp_sum0=temp_sum0+x(i,ntm1,k)*dph(k)*pl_i - enddo - sum1(i)=temp_sum0 - enddo - end if -c -c ****** Sum over all processors. -c - call sum_over_phi (nr,sum0,sum1) -c -c ****** Set X to have only an m=0 component at the poles. -c - if (tb0) then - do concurrent (k=2:npm1, i=1:nr) - x(i,1,k)=two*sum0(i)-x(i,2,k) - enddo - end if -c - if (tb1) then - do concurrent (k=2:npm1, i=1:nr) - x(i,nt,k)=two*sum1(i)-x(i,ntm1,k) - enddo - end if -c - return - end subroutine -c####################################################################### - subroutine sum_over_phi (n,a0,a1) -c -c----------------------------------------------------------------------- -c -c ****** Sum the contribution over all processors in the phi -c ****** dimension (only for processors with points on the poles). -c -c ****** The sum is performed for all N points in the vectors -c ****** SUM0(N) and SUM1(N), at the North and South pole, -c ****** respectively. -c -c----------------------------------------------------------------------- -c - use number_types - use local_dims_tp - use mpidefs - use timing -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: n - real(r_typ), dimension(n) :: a0,a1 -c -c----------------------------------------------------------------------- -c -c ****** MPI error return. -c - integer :: ierr -c -c----------------------------------------------------------------------- -c - call timer_on -c - if (tb0) then -!$omp target update from(a0) - call MPI_Allreduce (MPI_IN_PLACE,a0,n,ntype_real, - & MPI_SUM,comm_phi,ierr) -!$omp target update to(a0) - end if -c - if (tb1) then -!$omp target update from(a1) - call MPI_Allreduce (MPI_IN_PLACE,a1,n,ntype_real, - & MPI_SUM,comm_phi,ierr) -!$omp target update to(a1) - end if -c - call timer_off (c_sumphi) -c - return - end -c####################################################################### - subroutine zero_boundary_points (x) -c -c----------------------------------------------------------------------- -c -c ****** Set the boundary points at the physical boundaries -c ****** of X to zero. 
-c
-c-----------------------------------------------------------------------
-c
- use number_types
- use local_dims_r
- use local_dims_tp
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- real(r_typ), dimension(nr,nt,np) :: x
-c
-c-----------------------------------------------------------------------
-c
- if (rb0) x( 1,:,:)=0.
- if (rb1) x(nr,:,:)=0.
- if (tb0) x(:, 1,:)=0.
- if (tb1) x(:,nt,:)=0.
-c
- return
- end
-c#######################################################################
- function cgdot (x,y,N)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Get the dot product of the vectors X and Y.
-c
-c-----------------------------------------------------------------------
-c
- use number_types
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- integer :: N,i
- real(r_typ) :: cgdot
- real(r_typ), dimension(N) :: x,y
-c
-c-----------------------------------------------------------------------
-c
- cgdot=0.
-c
-!$omp target teams loop reduction(+:cgdot)
- do i=1,N
- cgdot=cgdot+x(i)*y(i)
- enddo
-c
-c ****** Sum over all the processors.
-c
- call global_sum (cgdot)
-c
- return
- end function
-c#######################################################################
- subroutine global_sum (x)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Overwrite X by its sum over all processors.
-c
-c-----------------------------------------------------------------------
-c
- use number_types
- use mpidefs
- use timing
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- real(r_typ) :: x
-c
-c-----------------------------------------------------------------------
-c
-c ****** MPI error return.
-c
- integer :: ierr
-c
-c-----------------------------------------------------------------------
-c
- call timer_on
-c
-c ****** Take the sum over all the processors.
-c
- call MPI_Allreduce (MPI_IN_PLACE,x,1,ntype_real,
- & MPI_SUM,comm_all,ierr)
-c
- call timer_off (c_cgdot)
-c
- end subroutine
-c#######################################################################
- subroutine seam (a,n1,n2,n3)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Seam the boundary points of 3D (r,t,p) array A between
-c ****** adjacent processors.
-c
-c ****** This routine assumes that there is a two-point
-c ****** overlap between processors in each dimension.
-c
-c-----------------------------------------------------------------------
-c
-c ****** This version uses non-blocking MPI sends and receives
-c ****** whenever possible in order to overlap communications.
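-c
-c ****** Each dimension is seamed by exchanging the first and
-c ****** last interior planes (indices 2 and n-1) with the two
-c ****** neighboring ranks, which fills the one-point ghost
-c ****** planes (indices 1 and n) on each side.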
-c -c----------------------------------------------------------------------- -c - use number_types - use mpidefs - use timing -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), dimension(n1,n2,n3) :: a -c -c----------------------------------------------------------------------- -c - real(r_typ), dimension(n2,n3) :: sbuf11,rbuf11 - real(r_typ), dimension(n2,n3) :: sbuf12,rbuf12 - real(r_typ), dimension(n1,n3) :: sbuf21,rbuf21 - real(r_typ), dimension(n1,n3) :: sbuf22,rbuf22 - real(r_typ), dimension(n1,n2) :: sbuf31,rbuf31 - real(r_typ), dimension(n1,n2) :: sbuf32,rbuf32 -c -c----------------------------------------------------------------------- -c -c ****** MPI error return. -c - integer :: ierr -c -c ****** MPI tag for MPI_ISEND and MPI_IRECV (not tagged). -c - integer :: tag=0 -c -c----------------------------------------------------------------------- -c - integer :: lbuf,i,j - integer :: n1,n2,n3 - integer :: reqs(4) -c -c----------------------------------------------------------------------- -c - call timer_on -c -c ****** Seam the third (periodic) dimension. -c -!$omp target enter data map(alloc:sbuf31,sbuf32,rbuf31,rbuf32) -c - lbuf=n1*n2 -c - do concurrent (j=1:n2, i=1:n1) - sbuf31(i,j)=a(i,j,n3-1) - sbuf32(i,j)=a(i,j, 2) - enddo -c -!$omp target update from(sbuf31,sbuf32) - call MPI_Isend (sbuf31,lbuf,ntype_real,iproc_pp,tag, - & comm_all,reqs(1),ierr) -c - call MPI_Isend (sbuf32,lbuf,ntype_real,iproc_pm,tag, - & comm_all,reqs(2),ierr) -c - call MPI_Irecv (rbuf31,lbuf,ntype_real,iproc_pm,tag, - & comm_all,reqs(3),ierr) -c - call MPI_Irecv (rbuf32,lbuf,ntype_real,iproc_pp,tag, - & comm_all,reqs(4),ierr) -c - call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) -!$omp target update to(rbuf31,rbuf32) -c - do concurrent (j=1:n2, i=1:n1) - a(i,j, 1)=rbuf31(i,j) - a(i,j,n3)=rbuf32(i,j) - enddo -c -!$omp target exit data map(delete:sbuf31,sbuf32,rbuf31,rbuf32) -c -c ****** Seam the first dimension. -c - if (nproc_r.gt.1) then -c -!$omp target enter data map(alloc:sbuf11,sbuf12,rbuf11,rbuf12) -c - lbuf=n2*n3 -c - do concurrent (j=1:n3, i=1:n2) - sbuf11(i,j)=a(n1-1,i,j) - sbuf12(i,j)=a( 2,i,j) - enddo -c -!$omp target update from(sbuf11,sbuf12) - call MPI_Isend (sbuf11,lbuf,ntype_real,iproc_rp,tag, - & comm_all,reqs(1),ierr) -c - call MPI_Isend (sbuf12,lbuf,ntype_real,iproc_rm,tag, - & comm_all,reqs(2),ierr) -c - call MPI_Irecv (rbuf11,lbuf,ntype_real,iproc_rm,tag, - & comm_all,reqs(3),ierr) -c - call MPI_Irecv (rbuf12,lbuf,ntype_real,iproc_rp,tag, - & comm_all,reqs(4),ierr) -c - call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) -!$omp target update to(rbuf11,rbuf12) -c - if (iproc_rm.ne.MPI_PROC_NULL) then - do concurrent (j=1:n3, i=1:n2) - a( 1,i,j)=rbuf11(i,j) - enddo - end if -c - if (iproc_rp.ne.MPI_PROC_NULL) then - do concurrent (j=1:n3, i=1:n2) - a(n1,i,j)=rbuf12(i,j) - enddo - end if -c -!$omp target exit data map(delete:sbuf11,sbuf12,rbuf11,rbuf12) - end if -c -c ****** Seam the second dimension. 
-c
- if (nproc_t.gt.1) then
-c
-!$omp target enter data map(alloc:sbuf21,sbuf22,rbuf21,rbuf22)
-c
- lbuf=n1*n3
-c
- do concurrent (j=1:n3, i=1:n1)
- sbuf21(i,j)=a(i,n2-1,j)
- sbuf22(i,j)=a(i, 2,j)
- enddo
-c
-!$omp target update from(sbuf21,sbuf22)
- call MPI_Isend (sbuf21,lbuf,ntype_real,iproc_tp,tag,
- & comm_all,reqs(1),ierr)
-c
- call MPI_Isend (sbuf22,lbuf,ntype_real,iproc_tm,tag,
- & comm_all,reqs(2),ierr)
-c
- call MPI_Irecv (rbuf21,lbuf,ntype_real,iproc_tm,tag,
- & comm_all,reqs(3),ierr)
-c
- call MPI_Irecv (rbuf22,lbuf,ntype_real,iproc_tp,tag,
- & comm_all,reqs(4),ierr)
-c
- call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr)
-!$omp target update to(rbuf21,rbuf22)
-c
- if (iproc_tm.ne.MPI_PROC_NULL) then
- do concurrent (j=1:n3, i=1:n1)
- a(i, 1,j)=rbuf21(i,j)
- enddo
- end if
-c
- if (iproc_tp.ne.MPI_PROC_NULL) then
- do concurrent (j=1:n3, i=1:n1)
- a(i,n2,j)=rbuf22(i,j)
- enddo
- end if
-c
-!$omp target exit data map(delete:sbuf21,sbuf22,rbuf21,rbuf22)
- end if
-c
- call timer_off (c_seam)
-c
- end subroutine
-c#######################################################################
- subroutine write_solution
-c
-c-----------------------------------------------------------------------
-c
-c ****** Write the global solution.
-c
-c-----------------------------------------------------------------------
-c
- use number_types
- use global_dims
- use global_mesh
- use fields
- use vars
- use solve_params
- use mpidefs
- use decomposition
- use assemble_array_interface
- use timing
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
-c ****** Declaration for the global arrays.
-c ****** These arrays are only allocated on processor IPROC0.
-c
- real(r_typ), dimension(:,:,:), allocatable :: phi_g
- real(r_typ), dimension(:,:,:), allocatable :: br_g
- real(r_typ), dimension(:,:,:), allocatable :: bt_g
- real(r_typ), dimension(:,:,:), allocatable :: bp_g
-c
-c-----------------------------------------------------------------------
-c
- integer :: ierr
- character(256) :: fname
-c
-c-----------------------------------------------------------------------
-c
- call timer_on
-c
-c ****** Potential.
-c
- if (phifile.ne.'') then
-!$omp target update from(phi)
-c
-c ****** Allocate the global array PHI_G (on processor IPROC0).
-c
- if (iamp0) then
- allocate (phi_g(nr_g,nt_g,np_g))
- else
- allocate (phi_g(1,1,1))
- end if
-c
-c ****** Assemble the global PHI array.
-c
- call assemble_array (map_rh,map_th,map_ph,phi,phi_g)
-c
- fname=phifile
-c
-c ****** Write out the potential to a file.
-c
- if (iamp0) then
- write (*,*)
- write (*,*) '### COMMENT from WRITE_SOLUTION:'
- write (*,*)
- write (*,*) 'Writing the potential to file: ',trim(fname)
- call wrhdf_3d (fname,.true.,nr_g,nt_g,np_g,
- & phi_g,rh_g,th_g,ph_g,hdf32,ierr)
- end if
-c
- deallocate (phi_g)
-c
- end if
-c
-c ****** Br.
-c
- if (brfile.ne.'') then
-!$omp target update from(br)
-c
- fname=brfile
-c
- if (iamp0) then
- allocate (br_g(nrm1_g,nt_g,np_g))
- else
- allocate (br_g(1,1,1))
- end if
-c
-c ****** Assemble the global BR array.
-c
- call assemble_array (map_rm,map_th,map_ph,br,br_g)
-c
- if (iamp0) then
- write (*,*)
- write (*,*) '### COMMENT from WRITE_SOLUTION:'
- write (*,*)
- write (*,*) 'Writing Br to file: ',trim(fname)
- call wrhdf_3d (fname,.true.,nrm1_g,nt_g,np_g,
- & br_g,r_g,th_g,ph_g,hdf32,ierr)
- end if
-c
- deallocate (br_g)
-c
- end if
-c
-c ****** Bt.
-c
- if (btfile.ne.'') then
-!$omp target update from(bt)
-c
- fname=btfile
-c
- if (iamp0) then
- allocate (bt_g(nr_g,ntm1_g,np_g))
- else
- allocate (bt_g(1,1,1))
- end if
-c
-c ****** Assemble the global BT array.
-c
- call assemble_array (map_rh,map_tm,map_ph,bt,bt_g)
-c
- if (iamp0) then
- write (*,*)
- write (*,*) '### COMMENT from WRITE_SOLUTION:'
- write (*,*)
- write (*,*) 'Writing Bt to file: ',trim(fname)
- call wrhdf_3d (fname,.true.,nr_g,ntm1_g,np_g,
- & bt_g,rh_g,t_g,ph_g,hdf32,ierr)
-c
- end if
-c
- deallocate (bt_g)
-c
- end if
-c
-c ****** Bp.
-c
- if (bpfile.ne.'') then
-!$omp target update from(bp)
-c
- fname=bpfile
-c
- if (iamp0) then
- allocate (bp_g(nr_g,nt_g,npm1_g))
- else
- allocate (bp_g(1,1,1))
- end if
-c
-c ****** Assemble the global BP array.
-c
- call assemble_array (map_rh,map_th,map_pm,bp,bp_g)
-c
- if (iamp0) then
- write (*,*)
- write (*,*) '### COMMENT from WRITE_SOLUTION:'
- write (*,*)
- write (*,*) 'Writing Bp to file: ',trim(fname)
- call wrhdf_3d (fname,.true.,nr_g,nt_g,npm1_g,
- & bp_g,rh_g,th_g,p_g,hdf32,ierr)
-c
- end if
-c
- deallocate (bp_g)
-c
- end if
-c
- call timer_off (t_io)
-c
- end subroutine
-c#######################################################################
- subroutine getb
-c
-c-----------------------------------------------------------------------
-c
-c ****** Calculate B from grad-phi.
-c
-c-----------------------------------------------------------------------
-c
- use number_types
- use global_dims
- use global_mesh
- use vars
- use fields
- use local_dims_r
- use local_dims_tp
- use local_mesh_r
- use local_mesh_tp
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- integer :: i,j,k
-c
-c-----------------------------------------------------------------------
-c
-!$omp target enter data map(alloc:br,bt,bp)
-c
-c ****** Get Br.
-c
- do concurrent (k=1:np, j=1:nt, i=1:nrm1)
- br(i,j,k)=(phi(i+1,j,k)-phi(i,j,k))/dr(i)
- enddo
-c
-c ****** Get Bt.
-c
- do concurrent (k=1:np, j=1:ntm1, i=1:nr)
- bt(i,j,k)=(phi(i,j+1,k)-phi(i,j,k))/(rh(i)*dt(j))
- enddo
-c
-c ****** Get Bp.
-c
- do concurrent (k=1:npm1, j=1:nt, i=1:nr)
- bp(i,j,k)=(phi(i,j,k+1)-phi(i,j,k))/(rh(i)*sth(j)*dp(k))
- enddo
-c
- end subroutine
-c#######################################################################
- subroutine magnetic_energy
-c
-c-----------------------------------------------------------------------
-c
-c ****** Calculate magnetic energy from B.
-c
-c-----------------------------------------------------------------------
-c
- use number_types
- use global_dims
- use global_mesh
- use vars
- use fields
- use mpidefs
- use local_dims_r
- use local_dims_tp
- use local_mesh_r
- use local_mesh_tp
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- real(r_typ), parameter :: half=.5_r_typ
-c
-c-----------------------------------------------------------------------
-c
- integer :: i,j,k,ierr
- real(r_typ) :: brav,btav,bpav,dv
- real(r_typ) :: wr,wt,wp
- real(r_typ), dimension(3) :: w
- character(32) :: fmtstr
-c
-c-----------------------------------------------------------------------
-c
- if (hdf32) then
- fmtstr="(A, ES14.8)"
- else
- fmtstr="(A,ES22.16)"
- end if
-c
- wr=0.
- wt=0.
- wp=0.
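-c
-c ****** Integrate each component of B**2/2 over the cell
-c ****** volumes dV = r**2 sin(theta) dr dtheta dphi, using
-c ****** cell-centered averages of the staggered B components.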
-!$omp target teams loop collapse(3) reduction(+:wr,wt,wp)
- do k=2,npm1
- do j=2,ntm1
- do i=2,nrm1
- dv=rh(i)**2*drh(i)*dth(j)*sth(j)*dph(k)
- brav=half*(br(i,j,k)+br(i-1,j,k))
- btav=half*(bt(i,j,k)+bt(i,j-1,k))
- bpav=half*(bp(i,j,k)+bp(i,j,k-1))
- wr=wr+half*brav**2*dv
- wt=wt+half*btav**2*dv
- wp=wp+half*bpav**2*dv
- enddo
- enddo
- enddo
-c
-c ****** Sum up all processors into final values and print.
-c
- w(1)=wr
- w(2)=wt
- w(3)=wp
- call MPI_Allreduce(MPI_IN_PLACE,w,3,ntype_real,
- & MPI_SUM,comm_all,ierr)
-c
- if (iamp0) then
- write (*,*)
- write (*,*) '### COMMENT from MAGNETIC_ENERGY:'
- write (*,*) '### Magnetic energy diagnostic:'
- write (*,*)
- write (*,trim(fmtstr)) 'Energy in Br**2 = ',w(1)
- write (*,trim(fmtstr)) 'Energy in Bt**2 = ',w(2)
- write (*,trim(fmtstr)) 'Energy in Bp**2 = ',w(3)
- write (*,trim(fmtstr)) 'Magnetic energy = ',SUM(w)
- write (9,*)
- write (9,*) '### COMMENT from MAGNETIC_ENERGY:'
- write (9,*) '### Magnetic energy diagnostic:'
- write (9,*)
- write (9,trim(fmtstr)) 'Energy in Br**2 = ',w(1)
- write (9,trim(fmtstr)) 'Energy in Bt**2 = ',w(2)
- write (9,trim(fmtstr)) 'Energy in Bp**2 = ',w(3)
- write (9,trim(fmtstr)) 'Magnetic energy = ',SUM(w)
- end if
-c
- end subroutine
-c#######################################################################
- subroutine assemble_array (map_r,map_t,map_p,a,a_g)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Assemble a global array (into A_G) on processor IPROC0 by
-c ****** fetching the local sections (A) from all the processors.
-c
-c-----------------------------------------------------------------------
-c
- use number_types
- use decomposition
- use mpidefs
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- type(map_struct), dimension(0:nproc-1) :: map_r,map_t,map_p
- real(r_typ), dimension(:,:,:) :: a,a_g
-c
-c-----------------------------------------------------------------------
-c
-c ****** Storage for the buffers.
-c
- integer :: lbuf,lsbuf
- real(r_typ), dimension(:), allocatable :: sbuf
- real(r_typ), dimension(:), allocatable :: rbuf
-c
-c-----------------------------------------------------------------------
-c
- integer :: tag=0
- integer :: irank,l1,l2,l3,i,j,k,ii
- integer :: i0,j0,k0,i1,j1,k1
- integer :: i0g,j0g,k0g
- integer :: ierr
-c
-c-----------------------------------------------------------------------
-c
- l1=map_r(iproc)%n
- l2=map_t(iproc)%n
- l3=map_p(iproc)%n
- lsbuf=l1*l2*l3
-c
- i0=map_r(iproc)%i0
- i1=map_r(iproc)%i1
- j0=map_t(iproc)%i0
- j1=map_t(iproc)%i1
- k0=map_p(iproc)%i0
- k1=map_p(iproc)%i1
-c
-c ****** Extract 1D array of non-overlapping values from local array.
-c
- allocate (sbuf(lsbuf))
-c
- sbuf=reshape(a(i0:i1,j0:j1,k0:k1),(/lsbuf/))
-c
-c ****** If proc0, receive/store local arrays into global array.
-c
- if (iamp0) then
- do irank=0,nproc-1
-c
- l1=map_r(irank)%n
- l2=map_t(irank)%n
- l3=map_p(irank)%n
- lbuf=l1*l2*l3
-c
- i0g=map_r(irank)%offset
- j0g=map_t(irank)%offset
- k0g=map_p(irank)%offset
-c
-c ****** If proc0 is the current rank in loop, simply copy local array.
- if (iproc==irank) then
- do k=1,l3
- do j=1,l2
- do i=1,l1
- ii=l2*l1*(k-1)+l1*(j-1)+i
- a_g(i0g+i-1,j0g+j-1,k0g+k-1)=sbuf(ii)
- enddo
- enddo
- enddo
-c ****** Otherwise receive data:
- else
- allocate (rbuf(lbuf))
- call MPI_Recv (rbuf,lbuf,ntype_real,irank,tag,
- & comm_all,MPI_STATUS_IGNORE,ierr)
- do k=1,l3
- do j=1,l2
- do i=1,l1
- ii=l2*l1*(k-1)+l1*(j-1)+i
- a_g(i0g+i-1,j0g+j-1,k0g+k-1)=rbuf(ii)
- enddo
- enddo
- enddo
- deallocate(rbuf)
- end if
- enddo
- else
-c
-c ****** Send local array to iproc0.
-c
- call MPI_Ssend (sbuf,lsbuf,ntype_real,iproc0,tag,comm_all,ierr)
-c
- end if
- deallocate (sbuf)
-c
- return
- end
-c#######################################################################
- subroutine timer_on
-c
-c-----------------------------------------------------------------------
-c
-c ****** Push an entry onto the timing stack and initialize
-c ****** a timing event.
-c
-c-----------------------------------------------------------------------
-c
-c ****** This routine can be called in a nested way to measure
-c ****** multiple timing events. Calls to TIMER_ON and TIMER_OFF
-c ****** need to be nested like do-loops in FORTRAN.
-c
-c-----------------------------------------------------------------------
-c
- use mpidefs
- use timer
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- if (istack.ge.nstack) then
- write (*,*)
- write (*,*) '### WARNING from TIMER_ON:'
- write (*,*) '### Timing stack depth exceeded.'
- write (*,*) 'This may signal an incorrect nesting of '//
- & 'TIMER_ON/TIMER_OFF calls.'
- write (*,*) 'Timing information will not be valid.'
- return
- else
- istack=istack+1
- end if
-c
- tstart(istack)=MPI_Wtime()
-c
- return
- end
-c#######################################################################
- subroutine timer_off (tused)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Increment the CPU time used since the call to TIMER_ON
-c ****** in variable TUSED, and pop an entry off the timing
-c ****** stack.
-c
-c-----------------------------------------------------------------------
-c
-c ****** This routine can be called in a nested way to measure
-c ****** multiple timing events. Calls to TIMER_ON and TIMER_OFF
-c ****** need to be nested like do-loops in FORTRAN.
-c
-c-----------------------------------------------------------------------
-c
- use number_types
- use mpidefs
- use timer
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- real(r_typ) :: tused
-c
-c-----------------------------------------------------------------------
-c
- if (istack.le.0) then
- write (*,*)
- write (*,*) '### WARNING from TIMER_OFF:'
- write (*,*) '### Timing stack cannot be popped.'
- write (*,*) 'This may signal an incorrect nesting of '//
- & 'TIMER_ON/TIMER_OFF calls.'
- write (*,*) 'Timing information will not be valid.'
- return
- else
- istack=istack-1
- end if
-c
- tused=tused+MPI_Wtime()-tstart(istack+1)
-c
- return
- end
-c#######################################################################
- subroutine write_timing
-c
-c-----------------------------------------------------------------------
-c
-c ****** Write out the timing info.
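-c
-c ****** Per-rank timings are gathered with MPI_Allgather, and
-c ****** the average, minimum, maximum, and standard deviation
-c ****** across ranks are reported in the file 'timing.out'.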
-c -c----------------------------------------------------------------------- -c - use number_types - use mpidefs - use timing -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c -c ****** Timing buffers. -c - integer, parameter :: lbuf=10 - real(r_typ), dimension(lbuf) :: sbuf - real(r_typ), dimension(lbuf,0:nproc-1) :: tbuf -c -c ****** Timing statistics. -c - real(r_typ), dimension(lbuf) :: tmin,tmax,tavg,tsdev -c -c----------------------------------------------------------------------- -c - integer :: ierr,irank - real(r_typ) :: t_tot_avg,c_tot_avg,c_tot -c - character(80) :: tfile='timing.out' -c -c----------------------------------------------------------------------- -c -c ****** Gather the timing information for all processors into TBUF. -c - sbuf(1)=t_solve - sbuf(2)=t_startup - sbuf(3)=t_io - sbuf(4)=t_ax - sbuf(5)=t_pc_load - sbuf(6)=t_pc - sbuf(7)=c_seam - sbuf(8)=c_cgdot - sbuf(9)=c_sumphi - sbuf(10)=t_wall -c - call MPI_Allgather (sbuf,lbuf,ntype_real, - & tbuf,lbuf,ntype_real,comm_all,ierr) -c -c ****** Calculate the timing statistics. -c - tavg=sum(tbuf,dim=2)/nproc - tmin=minval(tbuf,dim=2) - tmax=maxval(tbuf,dim=2) -c - tsdev(:)=0. - do irank=0,nproc-1 - tsdev(:)=tsdev(:)+(tbuf(:,irank)-tavg(:))**2 - enddo - tsdev(:)=sqrt(tsdev(:)/nproc) -c - t_tot_avg=tavg(10) - c_tot_avg=tavg(7)+tavg(8)+tavg(9) -c - if (iamp0) then -c - call ffopen (1,tfile,'rw',ierr) -c - if (ierr.ne.0) then - write (*,*) - write (*,*) '### WARNING from WRITE_TIMING:' - write (*,*) '### Could not create the timing file.' - write (*,*) 'File name: ',trim(tfile) - end if -c - do irank=0,nproc-1 - c_tot=tbuf(7,irank)+tbuf(8,irank)+tbuf(9,irank) - write (1,*) - write (1,100) - write (1,*) - write (1,*) 'Processor id = ',irank - write (1,*) - write (1,200) 'Comm. time in SEAM = ',tbuf(7,irank) - write (1,200) 'Comm. time in CGDOT = ',tbuf(8,irank) - write (1,200) 'Comm. time in SUMPHI = ',tbuf(9,irank) - write (1,*) '------------------------------------' - write (1,200) 'Total comm. time = ',c_tot - write (1,*) - write (1,200) 'Time used in start-up = ',tbuf(2,irank) - write (1,200) 'Time used in i/o = ',tbuf(3,irank) - write (1,200) 'Time used in POTFLD = ',tbuf(1,irank) - write (1,200) 'Time used in AX = ',tbuf(4,irank) - write (1,200) 'Time used in PCLOAD = ',tbuf(5,irank) - write (1,200) 'Time used in PC = ',tbuf(6,irank) - write (1,*) '------------------------------------' - write (1,200) 'Total time used = ',tbuf(10,irank) - 100 format (80('-')) - 200 format (1x,a,f12.6) - enddo - write (1,*) - write (1,100) -c - write (1,*) - write (1,*) 'Average times:' - write (1,*) '-------------' - write (1,*) - write (1,300) 'Avg Min Max S. Dev' - write (1,300) '--- --- --- ------' - write (1,400) 'Avg comm. time in SEAM = ', - & tavg(7),tmin(7),tmax(7),tsdev(7) - write (1,400) 'Avg comm. time in CGDOT = ', - & tavg(8),tmin(8),tmax(8),tsdev(8) - write (1,400) 'Avg comm. 
time in SUMPHI = ',
- & tavg(9),tmin(9),tmax(9),tsdev(9)
- write (1,400) 'Avg time used in start-up = ',
- & tavg(2),tmin(2),tmax(2),tsdev(2)
- write (1,400) 'Avg time used in i/o = ',
- & tavg(3),tmin(3),tmax(3),tsdev(3)
- write (1,400) 'Avg time used in POTFLD = ',
- & tavg(1),tmin(1),tmax(1),tsdev(1)
- write (1,400) 'Avg time used in AX = ',
- & tavg(4),tmin(4),tmax(4),tsdev(4)
- write (1,400) 'Avg time used in PCLOAD = ',
- & tavg(5),tmin(5),tmax(5),tsdev(5)
- write (1,400) 'Avg time used in PC = ',
- & tavg(6),tmin(6),tmax(6),tsdev(6)
- write (1,400) 'Avg total time = ',
- & tavg(10),tmin(10),tmax(10),tsdev(10)
- 300 format (1x,33x,a)
- 400 format (1x,a,4f12.3)
-c
- write (1,*)
- write (1,200) 'Average time used per proc = ',t_tot_avg
- write (1,200) 'Average comm. time per proc = ',c_tot_avg
- write (1,*)
- write (1,100)
- write (1,*)
-c
- close (1)
-c
- write(*,*)
- write(*,200) 'Total time: ',tmax(10)
- write(*,200) 'Max Comm/Imbalance time: ',
- & tmax(7)+tmax(8)+tmax(9)
- write(*,*)
-c
- end if
-c
- end subroutine
-c#######################################################################
- subroutine readbr (fname,br0_g,ierr)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Read in the radial magnetic field at the photosphere
-c ****** and interpolate it into array BR0_G.
-c
-c ****** FNAME is the name of the file to read.
-c
-c-----------------------------------------------------------------------
-c
- use number_types
- use global_dims
- use global_mesh
- use vars
- use rdhdf_2d_interface
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- character(*) :: fname
- real(r_typ), dimension(nt_g,np_g) :: br0_g
- integer :: ierr
-c
-c-----------------------------------------------------------------------
-c
- real(r_typ), parameter :: two=2._r_typ
-c
-c-----------------------------------------------------------------------
-c
-c ****** Br array read in and its scales.
-c
- real(r_typ), dimension(:,:), pointer :: bn
- real(r_typ), dimension(:), pointer :: tn,pn
-c
-c-----------------------------------------------------------------------
-c
- integer :: ntn,npn,j,k
- logical :: scale
- real(r_typ) :: sum0,sum1,area,fluxp,fluxm,da,br00err
-c
-c-----------------------------------------------------------------------
-c
- ierr=0
-c
-c ****** Read in the normal field.
-c
- write (*,*)
- write (*,*) '### COMMENT from READBR:'
- write (*,*) '### Reading Br file: ',trim(fname)
-c
- call rdhdf_2d (fname,scale,ntn,npn,bn,tn,pn,ierr)
-c
- if (ierr.ne.0) then
- write (*,*)
- write (*,*) '### ERROR in READBR:'
- write (*,*) '### The flux file has the wrong format.'
- write (*,*) 'IERR (from RDHDF_2D) = ',ierr
- write (*,*) 'File name: ',trim(fname)
- ierr=1
- return
- end if
-c
-c ****** Check that the array has scales.
-c
- if (.not.scale) then
- write (*,*)
- write (*,*) '### ERROR in READBR:'
- write (*,*) '### The flux file does not have scales.'
- write (*,*) 'File name: ',trim(fname)
- ierr=2
- return
- end if
-c
-c ****** Interpolate the field to the code mesh (into array BR0_G).
-c
- call intrp2d (ntn,npn,tn,pn,bn,
- & nt_g-2,np_g-2,th_g(2:ntm1_g),ph_g(2:npm1_g),
- & br0_g(2:ntm1_g,2:npm1_g),ierr)
-c
- if (ierr.ne.0) then
- write (*,*)
- write (*,*) '### ERROR in READBR:'
- write (*,*) '### The scales in the Br file are invalid.'
- write (*,*) 'File name: ',trim(fname)
- ierr=3
- return
- end if
-c
-c ****** De-allocate the memory for the BN array and its scales.
-c
- deallocate (bn)
- deallocate (tn)
- deallocate (pn)
-c
-c ****** Set Br to be periodic.
-c
- br0_g(:,1)=br0_g(:,npm1_g)
- br0_g(:,np_g)=br0_g(:,2)
-c
-c ****** Set BCs at the poles.
-c
-c ****** Br has only an m=0 component there.
-c
- sum0=sum(br0_g( 2,2:npm1_g)*dph_g(2:npm1_g))*pl_i
- sum1=sum(br0_g(ntm1_g,2:npm1_g)*dph_g(2:npm1_g))*pl_i
-c
- br0_g(1 ,:)=two*sum0-br0_g( 2,:)
- br0_g(nt_g,:)=two*sum1-br0_g(ntm1_g,:)
-c
-c ****** Calculate the total flux.
-c
- area=0.
- fluxp=0.
- fluxm=0.
- do j=2,ntm1_g
- do k=2,npm1_g
- da=sth_g(j)*dth_g(j)*dph_g(k)
- if (br0_g(j,k).gt.0.) then
- fluxp=fluxp+br0_g(j,k)*da
- else
- fluxm=fluxm+br0_g(j,k)*da
- end if
- area=area+da
- enddo
- enddo
-c
- write (*,*)
- write (*,*) '### COMMENT from READBR:'
- write (*,*) '### Computed flux balance:'
- write (*,*)
- write (*,*) 'Positive flux = ',fluxp
- write (*,*) 'Negative flux = ',fluxm
-c
-c ****** Subtract the monopole component so that the total flux
-c ****** is zero (unless flux balancing has been disabled).
-c
- if (.not.((option.eq.'ss'.or.option.eq.'open')
- & .and.do_not_balance_flux)) then
-c
- br00err=(fluxp+fluxm)/area
-c
- do k=1,np_g
- do j=1,nt_g
- br0_g(j,k)=br0_g(j,k)-br00err
- enddo
- enddo
-c
- write (*,*)
- write (*,*) '### COMMENT from READBR:'
- write (*,*) '### Flux balance correction:'
- write (*,*)
- write (*,*) 'BR00 (monopole Br field magnitude) = ',br00err
-c
- end if
-c
- return
- end
-c#######################################################################
- subroutine intrp2d (nxi,nyi,xi,yi,fi,nx,ny,x,y,f,ierr)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Interpolate a 2D field from array FI(NXI,NYI), defined
-c ****** on the mesh XI(NXI) x YI(NYI), into the array F(NX,NY),
-c ****** defined on the mesh X(NX) x Y(NY).
-c
-c ****** Note that if a data point is outside the bounds of
-c ****** the XI x YI mesh, IERR=2 is returned.
-c
-c-----------------------------------------------------------------------
-c
- use number_types
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- integer :: nxi,nyi
- real(r_typ), dimension(nxi) :: xi
- real(r_typ), dimension(nyi) :: yi
- real(r_typ), dimension(nxi,nyi) :: fi
- integer :: nx,ny
- real(r_typ), dimension(nx) :: x
- real(r_typ), dimension(ny) :: y
- real(r_typ), dimension(nx,ny) :: f
- integer :: ierr
-c
-c-----------------------------------------------------------------------
-c
- real(r_typ), parameter :: zero=0._r_typ
-c
-c-----------------------------------------------------------------------
-c
- real(r_typ), external :: flint
-c
-c-----------------------------------------------------------------------
-c
- integer :: i,j,ii,jj
- real(r_typ) :: dum(1)=0.
- real(r_typ) :: dummy,xv,yv,ax,ay
-c
-c-----------------------------------------------------------------------
-c
- ierr=0
-c
-c ****** Check that the scales XI and YI are monotonic.
-c
- dummy=flint(zero,nxi,xi,dum,1,ierr)
- if (ierr.ne.0) go to 900
-c
- dummy=flint(zero,nyi,yi,dum,1,ierr)
- if (ierr.ne.0) go to 900
-c
-c ****** Interpolate the data.
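-c
-c ****** Bilinear interpolation: each target point is a convex
-c ****** combination of the four surrounding nodes, with weights
-c ****** (1-ax)*(1-ay), ax*(1-ay), (1-ax)*ay, and ax*ay.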
-c
- do j=1,ny
- yv=y(j)
- if (yv.lt.yi(1).or.yv.gt.yi(nyi)) then
- go to 910
- end if
- call interp (yi,nyi,yv,jj,ay)
- do i=1,nx
- xv=x(i)
- if (xv.lt.xi(1).or.xv.gt.xi(nxi)) then
- go to 910
- end if
- call interp (xi,nxi,xv,ii,ax)
- f(i,j)= (1.-ax)*((1.-ay)*fi(ii ,jj )+ay*fi(ii ,jj+1))
- & + ax *((1.-ay)*fi(ii+1,jj )+ay*fi(ii+1,jj+1))
- enddo
- enddo
-c
- return
-c
-c ****** Error exit: invalid scales.
-c
- 900 continue
-c
- write (*,*)
- write (*,*) '### ERROR in INTRP2D:'
- write (*,*) '### Scales are not monotonically increasing.'
- ierr=1
-c
- return
-c
-c ****** Error exit: data outside range of scales.
-c
- 910 continue
-c
- write (*,*)
- write (*,*) '### ERROR in INTRP2D:'
- write (*,*) '### An interpolation was attempted outside the'//
- & ' range of the defined scales.'
- ierr=2
-c
- return
- end
-c#######################################################################
- function flint (x,n,xn,fn,icheck,ierr)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Interpolate a function linearly.
-c
-c-----------------------------------------------------------------------
-c
-c ****** The function is defined at N nodes, with values given by
-c ****** FN(N) at positions XN(N). The function value returned is
-c ****** the linear interpolant at X.
-c
-c ****** Note that if X.lt.XN(1), the function value returned
-c ****** is FN(1), and if X.gt.XN(N), the function value returned
-c ****** is FN(N).
-c
-c ****** Call once with ICHECK.ne.0 to check that the values
-c ****** in XN(N) are monotonically increasing. In this mode
-c ****** the array XN(N) is checked, and X and FN(N) are not
-c ****** accessed. If the check is passed, IERR=0 is returned.
-c ****** Otherwise, IERR=1 is returned.
-c
-c-----------------------------------------------------------------------
-c
- use number_types
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- real(r_typ) :: flint
- real(r_typ) :: x
- integer :: n
- real(r_typ), dimension(n) :: xn,fn
- integer :: icheck
- integer :: ierr
-c
-c-----------------------------------------------------------------------
-c
- integer :: i,j
- real(r_typ) :: x1,x2,alpha
-c
-c-----------------------------------------------------------------------
-c
- ierr=0
- flint=0.
-c
-c ****** If ICHECK.ne.0, check the abscissa table.
-c
- if (icheck.ne.0) then
- if (n.le.0) then
- write (*,*)
- write (*,*) '### ERROR in FLINT:'
- write (*,*) '### Bad dimension of abscissa table.'
- write (*,*) 'N = ',n
- ierr=1
- return
- end if
- do 100 i=1,n-1
- if (xn(i+1).le.xn(i)) then
- write (*,*)
- write (*,*) '### ERROR in FLINT:'
- write (*,*) '### Bad data in abscissa table.'
- write (*,*) 'N = ',n
- write (*,*) 'XN = '
- write (*,*) (xn(j),j=1,n)
- ierr=1
- return
- end if
- 100 continue
- return
- end if
-c
-c ****** Get the interpolated value.
-c
- if (x.le.xn(1)) then
- flint=fn(1)
- else if (x.gt.xn(n)) then
- flint=fn(n)
- else
- do 200 i=1,n-1
- if (x.ge.xn(i).and.x.lt.xn(i+1)) go to 300
- 200 continue
- 300 continue
- x1=xn(i)
- x2=xn(i+1)
- alpha=(x-x1)/(x2-x1)
- flint=fn(i)*(1.-alpha)+fn(i+1)*alpha
- end if
-c
- return
- end
-c#######################################################################
- subroutine interp (x,n,xv,i,alpha)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Get interpolation factor ALPHA and table index I.
-c -c ****** This routine does not do the actual interpolation. Use the -c ****** returned values of I and ALPHA to get the interpolated value. -c -c----------------------------------------------------------------------- -c - use number_types -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: n - real(r_typ), dimension(n) :: x - real(r_typ) :: xv - integer :: i - real(r_typ) :: alpha -c -c----------------------------------------------------------------------- -c - do 100 i=1,n-1 - if (xv.ge.x(i).and.xv.le.x(i+1)) then - alpha=(xv-x(i))/(x(i+1)-x(i)) - go to 200 - end if - 100 continue -c -c ****** Value not found --- signal error and stop. -c - write (*,*) - write (*,*) '### ERROR in INTERP:' - write (*,*) '### Value not found in table.' - write (*,*) 'Value requested = ',xv - write (*,*) 'Min table value = ',x(1) - write (*,*) 'Max table value = ',x(n) - call endrun (.true.) -c - 200 continue -c - return - end -c####################################################################### -c -c ****** Revision history: -c -c ### Version 1.00, 03/02/2006, file pot3d.f, modified by ZM: -c -c - Cleaned up the previous version of POT3D. -c -c ### Version 1.01, 03/06/2006, file pot3d.f, modified by ZM: -c -c - Added the ability to do a "source-surface plus -c current-sheet" solution. Select this by setting -c OPTION='ss+cs'. -c -c ### Version 1.02, 06/18/2007, file pot3d.f, modified by ZM: -c -c - Fixed a bug that caused the code to hang when an error -c was encountered (when running a parallel job). -c -c ### Version 1.03, 03/17/2009, file pot3d.f, modified by ZM: -c -c - Added the ability to write the boundary flux before the -c sign flip for current-sheet solutions (i.e., OPTION='open'). -c Set the variable BR_PHOTO_ORIGINAL_FILE to the desired -c file name to request this. -c -c ### Version 1.50, 01/25/2016, file pot3d.f, modified by RC: -c -c - Added new (much faster) BILU0 preconditioner to CG solver. -c To activate, set ifprec=2 in pot3d.dat file. -c - Modified CG solve to use 1D arrays -c for SAXPY and DOT operations. -c -c ### Version 2.00, 06/06/2017, file pot3d.f, modified by RC: -c -c - Added OpenACC support to the code. -c - OpenACC support is currently ONLY on 'standard' -c pot3d runs (not inner-outer-iteratative mode) -c and is only efficient on GPUs when using ifprec=1. -c - OpenACC adds support for running the code on -c Nvidia GPUs/Intel KNL/x86-multicore/OpenPower. -c - To use OpenACC, simply compile the code with a compiler -c that supports OpenACC with the correct flags activated. -c - Multi-gpu support is included by using the new -c ngpus_per_node input parameter. This should be set -c to the number of GPUs available per node. -c The number of MPI ranks per node should match the -c number of gpus per node. This can be launched with -c "mpiexec -np -ntasks-per-node ". -c - The GPU features of the code are fully portable, i.e. -c the code can be compiled/used as before on CPUs with no -c changes in compilation or run-time. -c - Modified some routines to be "nicer" for OpenACC -c and optimized some MPI communications. -c - Added wall-clock timer and corrected placement of -c MPI_Finalize(). The wall clock timer now reflects the -c true runtime. -c -c ### Version 2.01, 10/02/2017, file pot3d.f, modified by RC: -c -c - Optimized OpenACC. -c - Renamed cgsolv() to cgsolve(). 
-c -c ### Version 2.10, 01/15/2018, file pot3d.f, modified by ZM+RC: -c -c - Added the ability to skip the balancing of the flux -c when doing a PFSS or OPEN field. To invoke this, set -c DO_NOT_BALNCE_FLUX=.true.. -c - Changed some pointers to allocatables for better -c vectorization. -c -c ### Version 2.11, 03/19/2018, file pot3d.f, modified by RC: -c -c - Added 'fmt' input parameter to set output file type. -c Set fmt='h5' to write out HDF5 (default is 'hdf'). -c -c ### Version 2.12, 10/08/2018, file pot3d.f, modified by RC: -c -c - COMPATIBILITY CHANGE! Renamed gpus_per_node to gpn. -c gpn is default 0 which will set gpn to the number of -c MPI ranks in the local node. -c Setting gpn manually is not recommended and only used for -c oversubscribing the GPUs. -c - Added MPI shared communicator for automatically setting gpn. -c This requires an MPI-3 capable MPI library. -c - Changed layout of matrix coefficient arrays to be more -c vector-friendly instead of cache-friendly. -c -c ### Version 2.13, 11/19/2018, file pot3d.f, modified by RC: -c -c - Small modifications to polar boundary condition calculations. -c - Updated array ops and ACC clauses to be F2003 optimized. -c -c ### Version 2.20, 01/09/2019, file pot3d.f, modified by RC: -c -c - Added double precision output option. -c Set hdf32=.false. to activate 64-bit output. -c - Updated magnetic field computation. B is now computed -c in parallel. 3D output fields now gathered to rank 0 -c using Sends and Receives instead of Gatherv in order -c to allow very large resolutions. -c - Added automatic topology. Now, nprocs is optional. -c One can specify one or more topology dimensions and -c use the flag value "-1" to indicate dimensions to auto-set. -c It is recommended to simply delete nprocs from input files. -c - Added output file flushing so CG iterations can be monitored. -c - Added new MPI rank diagnostics including -c estimated load imbalance. -c - Processor topology and magnetic energy output now written to -c pot3d.out as well as the terminal. -c -c ### Version 2.21, 01/11/2019, file pot3d.f, modified by RC: -c -c - Small updates to magnetic_energy routine. -c -c ### Version 2.22, 11/27/2019, file pot3d.f, modified by RC: -c -c - Optimized some OpenACC directives. Expanded some -c array-syntax lines to full loops. -c -c ### Version 2.23, 08/11/2020, file pot3d.f, modified by RC: -c -c - Small bug fix for default output file names and -c format option fmt. -c -c ### Version 3.0.0, 02/10/2021, file pot3d.f, modified by RC: -c -c - Changed version number scheme to semantic versioning. -c -c ### Version 3.1.0, 08/11/2020, file pot3d.f, modified by RC: -c -c - Removed ss+cs inner-outer iterative mode. -c It had not been used in many years and never worked -c quite right. Algorithm is saved in previous releases -c on the github and svn. -c - Added a validation run mode. To activate, set: -c VALIDATION_RUN=.TRUE. -c This will set up a tilted dipole exact solution, write it -c out, and then solve for it. It will ignore input br files. -c To change the angle of the dipole, set DIPOLE_ANGLE -c (default is pi/4). -c - Added NCGFLUSH parameter to flush output file during solver -c iterations if NCGHIST>0 every NCGFLUSH iteration (default 25). -c This is useful to track progress of the solver. -c -c ### Version 3.1.1, 11/02/2021, modified by RC: -c -c - Fixed default output name mechanics. The default names -c were ignoring "fmt", making "fmt" almost useless. -c -c ### Version 3.2.0, 12/21/2021, modified by RC: -c -c - GPN is now depricated. 
Now, # of GPUs per node is assumed -c to be equal to # of MPI ranks per node. -c This removes the ability to oversubscribe GPUs, but this -c was not being used ever, so no great loss. -c This change was done to allow setting the gpu device -c number before any allocations so that it works correctly -c when using NVIDIA unified managed memory. -c -c ### Version 3.3.0, 02/14/2022, modified by RC: -c -c - Added new timers for AX, PC, PCLOAD, and modified some timers -c to be more useful. -c - Updated ILU0 PC2 preconditioner to use single precision. -c This speeds up the solve both on CPUs and GPUs. -c -c ### Version 4.0.0, 03/02/2022, modified by RC/MS: -c -c - Added cuSparse library calls for ifprec=2 on NVIDIA GPUs. -c - Added #ifdef -c preprocessor directives which will either compile a cusparse -c GPU version of the code or a CPU compatible version -c without cusparse calls and no c code references -c -c ### Version 4.0.1, 03/31/2022, modified by RC: -c -c - Added 'target' attribute to arrays used with C_LOC. -c This fixes a compiler error with GFORTRAN when -c using cuSparse. -c -c ### Version 4.1.0, 10/24/2022, modified by RC: -c -c - Removed NCGFLUSH input parameter. The output is -c flushed at every write now. -c - Changed behavior of solver residual reporting -c to also output to terminal. -c - Removed flush_output_file() routine in favor of Fortran 2003 -c intrinsic FLUSH. -c -c ### Version 4.2.0, 08/21/2024, modified by MS/RC: -c -c - Replaced OpenACC with OpenMP Target directives. -c There remains a single OpenACC directive for device selection -c in order to work with nvfortran. -c -c ### Version 4.2.1, 09/19/2024, modified by RC: -c -c - Changed array reduction loops to use -c OpenMP reductions instead of atomics. -c - Updated OpenMP target directives to use the -c more streamlined "loop" syntax. -c -c ### Version 4.2.2, 12/05/2024, modified by RC: -c -c - Fixed bug with OpenMP target loops. -c Was using 'target loop' instead of 'target teams loop' -c nvfortran seemed to allow this, but ifx does not. -c -c####################################################################### diff --git a/src/no_gpu_mpi/pot3d.F90 b/src/no_gpu_mpi/pot3d.F90 new file mode 100644 index 0000000..c895c1e --- /dev/null +++ b/src/no_gpu_mpi/pot3d.F90 @@ -0,0 +1,7217 @@ +!####################################################################### +! _____ ____ _______ ____ _____ +! | __ \ / __ \__ __|___ \| __ \ +! | |__) | | | | | | __) | | | | +! | ___/| | | | | | |__ <| | | | +! | | | |__| | | | ___) | |__| | +! |_| \____/ |_| |____/|_____/ +! +! ****** POT3D: 3D potential magnetic field outside a sphere. +! +! ****** This program can find the classical potential field, the +! ****** fully open field, the source-surface field, and the +! ****** source-surface plus current-sheet field. +! +! Authors: Zoran Mikic +! Ronald M. Caplan +! Jon A. Linker +! Roberto Lionello +! Miko Stulajter +! +! Predictive Science Inc. +! www.predsci.com +! San Diego, California, USA 92121 +! +!####################################################################### +! Copyright 2021 Predictive Science Inc. +! +! Licensed under the Apache License, Version 2.0 (the "License"); +! you may not use this file except in compliance with the License. +! You may obtain a copy of the License at +! +! http://www.apache.org/licenses/LICENSE-2.0 +! +! Unless required by applicable law or agreed to in writing, software +! distributed under the License is distributed on an "AS IS" BASIS, +! 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +! implied. +! See the License for the specific language governing permissions and +! limitations under the License. +!####################################################################### +! +!####################################################################### +! +module ident +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! ****** Code name. +!----------------------------------------------------------------------- +! + character(*), parameter :: idcode='POT3D' + character(*), parameter :: vers ='4.3.1_nogpumpi' + character(*), parameter :: update='12/05/2024' +! +end module +!####################################################################### +module number_types +! +!----------------------------------------------------------------------- +! ****** Basic number types. +! ****** This module is used to set the default precision for REALs. +!----------------------------------------------------------------------- +! + use iso_fortran_env +! +!----------------------------------------------------------------------- +! + implicit none +! + integer, parameter :: KIND_REAL_4=REAL32 + integer, parameter :: KIND_REAL_8=REAL64 + integer, parameter :: KIND_REAL_16=max(REAL128,REAL64) +! + integer, parameter :: r_typ=KIND_REAL_8 +! +end module +!####################################################################### +module number_types_pc +! +!----------------------------------------------------------------------- +! + use number_types + use iso_fortran_env +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer, parameter :: r_typ_pc=REAL32 +! +end module +!####################################################################### +module constants +! +!----------------------------------------------------------------------- +! + use number_types +! +!----------------------------------------------------------------------- +! + implicit none +! + real(r_typ), parameter :: pi=3.1415926535897932_r_typ +! +end module +!####################################################################### +module global_dims +! +!----------------------------------------------------------------------- +! ****** Global number of mesh points. +!----------------------------------------------------------------------- +! + implicit none +! +! ****** Global mesh size. +! + integer :: nr_g,nrm1_g + integer :: nt_g,ntm1_g + integer :: np_g,npm1_g +! +! ****** Flag to indicate an axisymmetric run. +! + logical :: axisymmetric=.false. +! +end module +!####################################################################### +module local_dims +! +!----------------------------------------------------------------------- +! ****** Local number of mesh points. +!----------------------------------------------------------------------- +! + implicit none +! +! ****** Flags to indicate whether this processor has points +! ****** on the physical boundaries. +! + logical :: rb0,rb1,tb0,tb1 +! +! ****** Local mesh size. +! + integer :: nr,nrm1 + integer :: nt,ntm1 + integer :: np,npm1 +! +! ****** Dimensions of arrays on the "main" mesh. +! + integer :: nrm + integer :: ntm + integer :: npm +! +! ****** Indices of start and end points in the global mesh +! ****** belonging to this processor. +! 
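The i0_g/i1_g (and j/k) indices declared just below record where this rank's local block sits in the global mesh. As a hypothetical illustration (this helper does not exist in POT3D), the implied local-to-global mapping is a simple offset:

      integer function local_to_global (i,i0_g)
!
! ****** Local point I on this rank corresponds to global point
! ****** I0_G+I-1; interior ranks share their first and last points
! ****** with their neighbors (the two-point overlap used in the
! ****** decompose_mesh_* routines later in this file).
!
      implicit none
      integer, intent(in) :: i,i0_g
      local_to_global=i0_g+i-1
      end function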
+ integer :: i0_g,i1_g + integer :: j0_g,j1_g + integer :: k0_g,k1_g +! +end module +!####################################################################### +module global_mesh +! +!----------------------------------------------------------------------- +! ****** Global mesh. +!----------------------------------------------------------------------- +! + use number_types + use constants +! + implicit none +! + real(r_typ), dimension(:), allocatable :: r_g,rh_g,dr_g,drh_g + real(r_typ), dimension(:), allocatable :: t_g,th_g,dt_g,dth_g + real(r_typ), dimension(:), allocatable :: p_g,ph_g,dp_g,dph_g +! + real(r_typ), dimension(:), allocatable :: st_g,ct_g,sth_g,cth_g + real(r_typ), dimension(:), allocatable :: sp_g,cp_g,sph_g,cph_g +! +! ****** Physical mesh size. +! + real(r_typ), parameter, private :: one=1._r_typ + real(r_typ), parameter, private :: two=2._r_typ +! + real(r_typ) :: r0=1._r_typ + real(r_typ) :: r1=30._r_typ + real(r_typ), parameter :: t0=0. + real(r_typ), parameter :: t1=pi + real(r_typ), parameter :: p0=0. + real(r_typ), parameter :: p1=two*pi +! + real(r_typ), parameter :: pl=p1-p0 + real(r_typ), parameter :: pl_i=one/pl +! +end module +!####################################################################### +module local_mesh +! +!----------------------------------------------------------------------- +! ****** Local mesh. +!----------------------------------------------------------------------- +! + use number_types +! + implicit none +! + real(r_typ), dimension(:), allocatable :: r,r2,rh,dr,drh + real(r_typ) :: dr1,drn +! + real(r_typ), dimension(:), allocatable :: t,th,dt,dth + real(r_typ), dimension(:), allocatable :: p,ph,dp,dph +! + real(r_typ), dimension(:), allocatable :: st,ct,sth,cth + real(r_typ), dimension(:), allocatable :: sp,cp,sph,cph +! +! ****** Inverse quantities (for efficiency). +! + real(r_typ), dimension(:), allocatable :: r_i,rh_i + real(r_typ), dimension(:), allocatable :: dr_i,drh_i + real(r_typ), dimension(:), allocatable :: dt_i,dth_i + real(r_typ), dimension(:), allocatable :: st_i,sth_i + real(r_typ), dimension(:), allocatable :: dp_i,dph_i +! +end module +!####################################################################### +module mpidefs +! +!----------------------------------------------------------------------- +! ****** MPI variables, processor topology, and processor information. +!----------------------------------------------------------------------- +! + use mpi +! + implicit none +! +! ****** Total number of processors. +! + integer :: nproc +! +! ****** Total number of processors per node. +! + integer :: nprocsh +! +! ****** Processor rank of this process in communicator +! ****** MPI_COMM_WORLD. +! + integer :: iprocw +! +! ****** Processor rank of this process in communicator +! ****** comm_shared. +! + integer :: iprocsh +! +! ****** Flag to designate that this is the processor with +! ****** rank 0 in communicator MPI_COMM_WORLD. +! + logical :: iamp0 +! +! ****** Communicator over all processors in the Cartesian topology. +! + integer :: comm_all +! +! ****** Processor rank of this process in communicator +! ****** COMM_ALL. +! + integer :: iproc +! +! ****** Processor rank in communicator COMM_ALL for the +! ****** processor that has rank 0 in MPI_COMM_WORLD. +! + integer :: iproc0 +! +! ****** Communicators over all processors in the phi dimension. +! + integer :: comm_phi +! +! ****** Communicator over all shared processors on the node. +! + integer :: comm_shared +! +! 
****** Communicators over all processors in the r dimension. +! + integer :: comm_r +! +! ****** Processor coordinate indices of this process +! ****** in the Cartesian topology. +! + integer :: iproc_r,iproc_t,iproc_p +! +! ****** Processor coordinate indices of the neighboring +! ****** processors in the Cartesian topology. +! + integer :: iproc_rm,iproc_rp + integer :: iproc_tm,iproc_tp + integer :: iproc_pm,iproc_pp +! +! ****** Number of processors along r, theta, and phi. +! + integer :: nproc_r,nproc_t,nproc_p +! +! ****** Number type for REALs to be used in MPI calls. +! + integer :: ntype_real +! +end module +!####################################################################### +module decomposition_params +! +!----------------------------------------------------------------------- +! ****** Input parameters that define the domain decomposition +! ****** among processors. +!----------------------------------------------------------------------- +! + implicit none +! +! ****** Number of processors per dimension. +! + integer, dimension(3) :: nprocs=(/-1,-1,-1/) +! +end module +!####################################################################### +module decomposition +! +!----------------------------------------------------------------------- +! ****** Information defining the domain decomposition. +!----------------------------------------------------------------------- +! + implicit none +! +! ****** Define the structure type for mapping local arrays +! ****** into global arrays. +! + type :: map_struct + integer :: n + integer :: i0 + integer :: i1 + integer :: offset + end type +! +! ****** Mapping structures for the different mesh types. +! + type(map_struct), dimension(:), pointer :: map_rh + type(map_struct), dimension(:), pointer :: map_rm + type(map_struct), dimension(:), pointer :: map_th + type(map_struct), dimension(:), pointer :: map_tm + type(map_struct), dimension(:), pointer :: map_ph + type(map_struct), dimension(:), pointer :: map_pm +! +end module +!####################################################################### +module meshdef +! +!----------------------------------------------------------------------- +! ****** Variables that define the mesh-point distributions. +!----------------------------------------------------------------------- +! + use number_types +! + implicit none +! + integer, parameter :: nmseg=30 +! + real(r_typ), dimension(nmseg) :: drratio=0. + real(r_typ), dimension(nmseg) :: dtratio=0. + real(r_typ), dimension(nmseg) :: dpratio=0. + real(r_typ), dimension(nmseg) :: rfrac=0. + real(r_typ), dimension(nmseg) :: tfrac=0. + real(r_typ), dimension(nmseg) :: pfrac=0. +! + integer :: nfrmesh=0 + integer :: nftmesh=0 + integer :: nfpmesh=0 +! + real(r_typ) :: phishift=0. +! +end module +!####################################################################### +module fields +! +!----------------------------------------------------------------------- +! ****** Local field arrays. +!----------------------------------------------------------------------- +! + use number_types +! + implicit none +! +! ****** Potential solution array and cg temp array. +! + real(r_typ), dimension(:,:,:), allocatable :: phi + real(r_typ), dimension(:,:,:), allocatable :: x_ax +! +! ****** Boundary radial magnetic field arrays. +! + real(r_typ), dimension(:,:), allocatable :: br0 + real(r_typ), dimension(:,:), allocatable :: br1 +! +! ****** Arrays used in polar boundary conditions. +! + real(r_typ), dimension(:), allocatable :: sum0,sum1 +! +! 
****** Arrays used for final magnetic field.
+!
+ real(r_typ), dimension(:,:,:), allocatable :: br,bt,bp
+!
+end module
+!#######################################################################
+module cgcom
+!
+ use number_types
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+! ****** Number of equations to solve in the CG solver.
+!-----------------------------------------------------------------------
+!
+ integer :: ncgeq
+!
+!-----------------------------------------------------------------------
+! ****** CG field solver parameters.
+!-----------------------------------------------------------------------
+!
+ integer :: ifprec=1
+ integer :: ncgmax=1000000
+ integer :: ncghist=100
+ real(r_typ) :: epscg=1.e-9
+!
+!-----------------------------------------------------------------------
+! ****** CG field solver variables.
+!-----------------------------------------------------------------------
+!
+ integer :: ncg
+ real(r_typ) :: epsn
+!
+! ****** Seam buffers.
+!
+ real(r_typ), allocatable, dimension(:,:) :: sbuf_rt1,sbuf_rt2
+ real(r_typ), allocatable, dimension(:,:) :: sbuf_tp1,sbuf_tp2
+ real(r_typ), allocatable, dimension(:,:) :: sbuf_rp1,sbuf_rp2
+!
+ real(r_typ), allocatable, dimension(:,:) :: rbuf_rt1,rbuf_rt2
+ real(r_typ), allocatable, dimension(:,:) :: rbuf_tp1,rbuf_tp2
+ real(r_typ), allocatable, dimension(:,:) :: rbuf_rp1,rbuf_rp2
+!
+end module
+!#######################################################################
+module vars
+!
+!-----------------------------------------------------------------------
+! ****** Miscellaneous input variables.
+!-----------------------------------------------------------------------
+!
+ use number_types
+!
+ implicit none
+!
+ character(256) :: outfile='pot3d.out'
+ character(256) :: phifile='default'
+ character(256) :: br0file='default'
+ character(256) :: brfile='default'
+ character(256) :: btfile='default'
+ character(256) :: bpfile='default'
+ character(256) :: br_photo_file='default'
+ character(256) :: br_photo_original_file='default'
+!
+! ****** Type of field solution.
+! ****** Select between 'potential', 'open', and 'ss' (source surface).
+!
+ character(16) :: option='potential'
+!
+! ****** Interval at which to dump diagnostics during the
+! ****** iteration for the source-surface plus current-sheet
+! ****** solution.
+!
+ integer :: ndump=0
+!
+! ****** Flag to skip the balancing of the flux (for PFSS and
+! ****** OPEN field options only).
+!
+ logical :: do_not_balance_flux=.false.
+!
+! ****** Set format for output files.
+!
+ character(3) :: fmt='h5'
+!
+ logical :: hdf32=.true.
+!
+! ****** Validation run (tilted dipole).
+!
+ logical :: validation_run=.false.
+!
+ real(r_typ) :: dipole_angle=0.7853981633974483_r_typ
+!
+end module
+!#######################################################################
+module solve_params
+!
+!-----------------------------------------------------------------------
+! ****** Parameters used in the solver.
+!-----------------------------------------------------------------------
+!
+ use number_types
+!
+ implicit none
+!
+! ****** Boundary condition switch at r=R1.
+!
+ real(r_typ) :: pm_r1
+!
+end module
+!#######################################################################
+module timer
+!
+!-----------------------------------------------------------------------
+! ****** Timer stack.
+!-----------------------------------------------------------------------
+!
+ use number_types
+!
+ implicit none
+!
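The seam send/receive buffers in the cgcom module above are the point of this commit: making them global module state lets them be allocated once and reused by every CG iteration, instead of being allocated and freed on the device at each exchange. A sketch of that pattern follows; it is not the patch's actual seam_setup (which lies outside this hunk), and the buffer extents are assumptions for illustration.

      subroutine seam_setup_sketch
      use cgcom
      use local_dims
      implicit none
!
! ****** Allocate the seam exchange buffers once, up front, so the
! ****** solver loop never (de)allocates them. Extents are guesses.
!
      if (.not.allocated(sbuf_tp1)) then
        allocate (sbuf_rt1(nr,nt),sbuf_rt2(nr,nt))
        allocate (rbuf_rt1(nr,nt),rbuf_rt2(nr,nt))
        allocate (sbuf_tp1(nt,np),sbuf_tp2(nt,np))
        allocate (rbuf_tp1(nt,np),rbuf_tp2(nt,np))
        allocate (sbuf_rp1(nr,np),sbuf_rp2(nr,np))
        allocate (rbuf_rp1(nr,np),rbuf_rp2(nr,np))
      end if
      end subroutine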
+ integer, parameter :: nstack=10 + integer :: istack=0 + real(r_typ), dimension(nstack) :: tstart=0. +! +end module +!####################################################################### +module timing +! +!----------------------------------------------------------------------- +! ****** Timing variables. +!----------------------------------------------------------------------- +! + use number_types +! + implicit none +! + real(r_typ) :: t_startup=0. + real(r_typ) :: t_solve=0. + real(r_typ) :: t_pc_load=0. + real(r_typ) :: t_pc=0. + real(r_typ) :: t_ax=0. + real(r_typ) :: t_io=0. + real(r_typ) :: c_seam=0. + real(r_typ) :: c_cgdot=0. + real(r_typ) :: c_sumphi=0. + real(r_typ) :: t_wall=0. +! +end module +!####################################################################### +module debug +! +!----------------------------------------------------------------------- +! ****** Debugging level. +!----------------------------------------------------------------------- +! + implicit none +! + integer :: idebug=0 +! +end module +!####################################################################### +module assemble_array_interface + interface + subroutine assemble_array (map_r,map_t,map_p,a,a_g) + use number_types + use decomposition + use mpidefs + implicit none + type(map_struct), dimension(0:nproc-1) :: map_r,map_t,map_p + real(r_typ), dimension(:,:,:) :: a,a_g + end subroutine + end interface +end module +!####################################################################### +module cusparse_interface +! + use, intrinsic :: iso_c_binding +! +#ifdef CUSPARSE + interface + subroutine load_lusol_cusparse(CSR_A,CSR_AI,CSR_AJ,N,M) & + BIND(C, name="load_lusol_cusparse") + use, intrinsic :: iso_c_binding + implicit none + integer(C_INT), value :: N,M + type(C_PTR), value :: CSR_A,CSR_AI,CSR_AJ + end subroutine load_lusol_cusparse +! + subroutine lusol_cusparse(x) & + BIND(C, name="lusol_cusparse") + use, intrinsic :: iso_c_binding + implicit none + type(C_PTR), value :: x + end subroutine lusol_cusparse +! + subroutine unload_lusol_cusparse() & + BIND(C, name="unload_lusol_cusparse") + end subroutine unload_lusol_cusparse + end interface +#endif +! + integer(c_int) :: cN,cM +! +end module +!####################################################################### +module matrix_storage_pot3d_solve +! +!----------------------------------------------------------------------- +! ****** Storage for the matrix/preconditioners of the solve. +!----------------------------------------------------------------------- +! + use number_types + use number_types_pc +! + implicit none +! + real(r_typ), dimension(:,:,:,:), allocatable :: a + real(r_typ_pc), dimension(:), allocatable :: a_i +! + integer, dimension(7) :: a_offsets + + integer :: N,M + real(r_typ_pc), dimension(:), allocatable, target :: a_csr + real(r_typ_pc), dimension(:), allocatable :: lu_csr + real(r_typ_pc), dimension(:), allocatable :: a_csr_d + integer, dimension(:), allocatable :: lu_csr_ja + integer, dimension(:), allocatable, target :: a_csr_ia + integer, dimension(:), allocatable, target :: a_csr_ja + integer, dimension(:), allocatable :: a_N1 + integer, dimension(:), allocatable :: a_N2 + integer, dimension(:), allocatable :: a_csr_dptr +! +end module +!####################################################################### +program POT3D +! +!----------------------------------------------------------------------- +! + use ident + use mpidefs + use vars + use solve_params + use timing +! 
+!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: ierr +! +!----------------------------------------------------------------------- +! +! ****** Initialize MPI. +! + call init_mpi +! +! ****** Start the wall-clock timer. +! + call timer_on +! +! ****** Write the code name and version. +! + if (iamp0) then + write (*,*) + write (*,*) idcode,' Version ',vers,', updated on ',update + end if +! + call timer_on +! +! ****** Read the input file. +! + call read_input_file +! +! ****** Create the output file. +! + if (iamp0) then + call ffopen (9,outfile,'rw',ierr) + if (ierr.ne.0) then + write (*,*) + write (*,*) '### ERROR in POT3D:' + write (*,*) '### Could not create the output file.' + write (*,*) 'File name: ',trim(outfile) + end if + end if + call check_error_on_p0 (ierr) +! +! ****** Check the input parameters. +! + call check_input +! +! ****** Check the processor topology. +! + call check_proc_topology +! +! ****** Decompose the domain. +! + call decompose_domain +! +! ****** Allocate global arrays. +! + call allocate_global_arrays +! +! ****** Set the global meshes. +! + call set_global_mesh +! +! ****** Decompose the mesh. +! + call decompose_mesh_r + call decompose_mesh_tp +! +! ****** Allocate local arrays. +! + call allocate_local_arrays_r + call allocate_local_arrays_tp +! +! ****** Set the local meshes. +! + call set_local_mesh_r + call set_local_mesh_tp +! +! ****** Print decomposition diagnostics. +! + call decomp_diags +! +! ****** Set up seam for solver. +! + call seam_setup +! +! ****** Initialize the flux and if validating, write analytic solution. +! + if (validation_run) then + if (iamp0) then + write (*,*) + write (*,*) '### COMMENT from POT3D:' + write (*,*) '### Validation run requested.' + write (*,*) '### Ignoring br input file,' + write (*,*) '### setting HDF32=.FALSE.,' + write (*,*) '### and overriding output filenames.' + write (9,*) + write (9,*) '### COMMENT from POT3D:' + write (9,*) '### Validation run requested.' + write (9,*) '### Ignoring br input file' + write (9,*) '### setting HDF32=.FALSE.,' + write (9,*) '### and overriding output filenames.' + end if + hdf32=.false. + brfile='br_solved.'//trim(fmt) + btfile='bt_solved.'//trim(fmt) + bpfile='bp_solved.'//trim(fmt) + phifile='phi_solved.'//trim(fmt) + call set_validation_flux + call write_validation_solution + else + call set_flux + end if +! + call timer_off (t_startup) +! +! ****** Find the solution. +! + if (iamp0) then + write (*,*) + write (*,*) '### COMMENT from POT3D:' + write (*,*) '### Starting PCG solve.' + call FLUSH(OUTPUT_UNIT) + write (9,*) + write (9,*) '### COMMENT from POT3D:' + write (9,*) '### Starting PCG solve.' + call FLUSH(9) + end if +! + call timer_on +! + call potfld +! + call timer_off (t_solve) +! +! ****** Compute B. +! + call getb +! +! ****** Write solution to file. +! + call write_solution +! +! ****** Magnetic energy diagnostics. +! + call magnetic_energy +! + call MPI_Barrier(MPI_COMM_WORLD,ierr) + call timer_off (t_wall) +! + call write_timing +! + call endrun (.false.) +! +end program pot3d +!####################################################################### +subroutine read_input_file +! +!----------------------------------------------------------------------- +! +! ****** Read the input file. +! +!----------------------------------------------------------------------- +! 
+ use global_dims
+ use global_mesh
+ use mpidefs
+ use meshdef
+ use cgcom
+ use debug
+ use vars
+ use decomposition_params
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+! ****** Values for the global mesh size.
+! ****** Since these names conflict with those in LOCAL_DIMS*, it is
+! ****** important not to use these modules here.
+!
+ integer :: nr=0
+ integer :: nt=0
+ integer :: np=0
+!
+!-----------------------------------------------------------------------
+!
+ namelist /topology/ &
+ nprocs, &! MPI topology triplet. Default -1,-1,-1
+ ! automatically sets "best" topology.
+ nr, &! Grid resolution in the `r` direction.
+ nt, &! Grid resolution in the `theta` direction.
+ np ! Grid resolution in the `phi` direction.
+!
+!-----------------------------------------------------------------------
+!
+ namelist /inputvars/ &
+ r0, &! Lower radial boundary.
+ r1, &! Upper radial boundary.
+ drratio, &! Ratio of grid spacing at the end
+ ! of each segment to that at the
+ ! beginning for the radial grid
+ ! [ length(drratio) = length(rfrac)-1 ].
+ dtratio, &! Ratio of grid spacing (theta)
+ dpratio, &! Ratio of grid spacing (phi)
+ rfrac, &! Normalized positions of grid segment
+ ! boundaries (frac of domain size)
+ ! for radial grid.
+ tfrac, &! Normalized positions of grid (theta)
+ pfrac, &! Normalized positions of grid (phi)
+ nfrmesh, &! Number of times to filter/smooth
+ ! the radial grid spacing.
+ nftmesh, &! Number of times to filter (theta)
+ nfpmesh, &! Number of times to filter (phi)
+ phishift, &! Apply an optional phi shift
+ ! (radians) to the input Br at r0.
+ ifprec, &! Preconditioner method:
+ ! 1: Diagonal (use for GPU runs)
+ ! 2: ILU0 (use for CPU runs or
+ ! GPU runs when built with cusparse)
+ ncgmax, &! Maximum allowed solver iterations.
+ ncghist, &! Iteration information.
+ ! 0: Only write # total iterations.
+ ! >0: Write every NCGHIST iteration.
+ epscg, &! Solver convergence tolerance,
+ ! |residual|/|right-hand-side|.
+ idebug, &! Output debugging info during run.
+ br0file, &! Filename of input 2D (tp) Br
+ phifile, &! Filename to write 3D PHI potential.
+ brfile, &! Filename to write 3D Br field.
+ btfile, &! Filename to write 3D Bt field.
+ bpfile, &! Filename to write 3D Bp field.
+ br_photo_file, &! Filename to write 2D Br@r=r0
+ ! (after interp/flux-balance).
+ br_photo_original_file, &! Filename to write 2D Br@r=r0
+ ! (option='open' only, writes Br
+ ! before sign change).
+ option, &! 'ss' PFSS
+ ! 'potential' PF with closed-wall
+ ! (requires flux balance).
+ ! 'open' Open field. Used for
+ ! current sheet and fully
+ ! open field runs.
+ ! B field will be unsigned.
+ do_not_balance_flux, &! Do not balance flux of input Br@r0.
+ hdf32, &! Output precision:
+ ! .true. Single (32-bit) output.
+ ! .false. Double (64-bit) output.
+ validation_run, &! Set this to run a validation test
+ ! with an analytic tilted dipole
+ ! solution. Overrides other inputs.
+ dipole_angle ! Tilt angle for validation run dipole.
+!
+!-----------------------------------------------------------------------
+!
+ integer :: ierr
+ character(80) :: infile='pot3d.dat'
+!
+!-----------------------------------------------------------------------
+!
+! ****** Read the input file.
+!
+ call ffopen (8,infile,'r',ierr)
+!
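For orientation, here is a minimal pot3d.dat exercising the two namelists declared above. All values are illustrative rather than recommendations, and the grid sizes and input file name are made up:

      &topology
        nprocs = -1,-1,-1
        nr = 133
        nt = 181
        np = 361
      /
      &inputvars
        r0 = 1.0
        r1 = 2.5
        option = 'ss'
        ifprec = 2
        ncghist = 0
        epscg = 1.e-9
        br0file = 'br_input_map.h5'
      /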
+ if (ierr.ne.0) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in READ_INPUT_FILE:' + write (*,*) '### Could not open the input file.' + write (*,*) 'File name: ',trim(infile) + end if + call endrun (.true.) + end if +! + read (8,topology) +! + read (8,inputvars) +! + close (8) +! + if (trim(fmt).ne.'h5') then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in READ_INPUT_FILE:' + write (*,*) '### "fmt" must be "h5".' + write (*,*) 'fmt: ',trim(fmt) + end if + call endrun (.true.) + end if +! +! ****** Check if output names were overwritten. +! ****** If not, set default names with format fmt. +! + if (trim(phifile).eq.'default') then + phifile='phi.'//trim(fmt) + end if + if (trim(br0file).eq.'default') then + br0file='br0.'//trim(fmt) + end if + if (trim(brfile).eq.'default') then + brfile='br.'//trim(fmt) + end if + if (trim(btfile).eq.'default') then + btfile='bt.'//trim(fmt) + end if + if (trim(bpfile).eq.'default') then + bpfile='bp.'//trim(fmt) + end if + if (trim(br_photo_file).eq.'default') then + br_photo_file='br_photo.'//trim(fmt) + end if + if (trim(br_photo_original_file).eq.'default') then + br_photo_original_file='br_photo_original.'//trim(fmt) + end if +! + nr_g=nr + nt_g=nt + np_g=np +! +! ****** Check if the specified mesh dimensions are valid. +! + call check_mesh_dimensions (nr_g,nt_g,np_g) +! + nrm1_g=nr_g-1 + ntm1_g=nt_g-1 + npm1_g=np_g-1 +! +end subroutine +!####################################################################### +subroutine check_error_on_p0 (ierr0) +! +!----------------------------------------------------------------------- +! +! ****** Check if the error flag IERR0 on processor 0 in +! ****** MPI_COMM_WORLD (i.e., processor IPROC0 in COMM_ALL) +! ****** indicates that the code should exit. +! +! ****** If IERR0 is non-zero, all the processors are directed +! ****** to call ENDRUN to terminate the code. +! +!----------------------------------------------------------------------- +! + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: ierr0 +! +!----------------------------------------------------------------------- +! +! ****** MPI error return. +! + integer :: ierr +! +!----------------------------------------------------------------------- +! +! ****** Broadcast IERR0 to all processors. +! + call MPI_Bcast (ierr0,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr) +! +! ****** Call ENDRUN if IERR0 is non-zero. +! + if (ierr0.ne.0) then + call endrun (.true.) + end if +! +end subroutine +!####################################################################### +subroutine endrun (ifstop) +! +!----------------------------------------------------------------------- +! +! ****** End the run and exit the code. +! +!----------------------------------------------------------------------- +! + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + logical :: ifstop +! +!----------------------------------------------------------------------- +! +! ****** MPI error return. +! + integer :: ierr +! +!----------------------------------------------------------------------- +! +! ****** Close the output file. +! + if (iamp0) then + close (9) + end if +! +! ****** Exit MPI gracefully. +! + call MPI_Finalize (ierr) +! +! ****** Call the STOP statement if requested. +! 
+ if (ifstop) then
+ stop
+ end if
+!
+end subroutine
+!#######################################################################
+subroutine init_mpi
+!
+!-----------------------------------------------------------------------
+!
+! ****** Initialize MPI.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use mpidefs
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+! ****** MPI error return.
+!
+ integer :: ierr,tcheck
+!
+!-----------------------------------------------------------------------
+!
+! ****** Real number to determine the KIND of REALs.
+!
+ real(r_typ) :: def_real
+!
+!-----------------------------------------------------------------------
+!
+ call MPI_Init_thread (MPI_THREAD_FUNNELED,tcheck,ierr)
+!
+! ****** Get the total number of processors.
+!
+ call MPI_Comm_size (MPI_COMM_WORLD,nproc,ierr)
+!
+! ****** Get the index (rank) of the local processor in
+! ****** communicator MPI_COMM_WORLD in variable IPROCW.
+!
+ call MPI_Comm_rank (MPI_COMM_WORLD,iprocw,ierr)
+!
+! ****** Create a shared communicator for all ranks in the node.
+!
+ call MPI_Comm_split_type (MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,0, &
+ MPI_INFO_NULL,comm_shared,ierr)
+!
+! ****** Get the total number of processors in the node.
+!
+ call MPI_Comm_size (comm_shared,nprocsh,ierr)
+!
+! ****** Get the index (rank) of the local processor in the local node.
+!
+ call MPI_Comm_rank (comm_shared,iprocsh,ierr)
+!
+! ****** Set the flag to designate whether this processor
+! ****** has rank 0 in communicator MPI_COMM_WORLD.
+!
+ if (iprocw.eq.0) then
+ iamp0=.true.
+ else
+ iamp0=.false.
+ end if
+!
+! ****** Set the number type for communicating REAL
+! ****** numbers in MPI calls.
+!
+ if (kind(def_real).eq.KIND_REAL_4) then
+ ntype_real=MPI_REAL4
+ else if (kind(def_real).eq.KIND_REAL_8) then
+ ntype_real=MPI_REAL8
+ else
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### ERROR in INIT_MPI:'
+ write (*,*) '### Unrecognized default REAL number KIND:'
+ write (*,*) 'KIND(default_real) = ',kind(def_real)
+ write (*,*) 'This is a fatal error.'
+ end if
+ call endrun (.true.)
+ end if
+!
+! ****** Set the GPU device number based on local rank.
+! ****** NOTE! This assumes that #GPUs per node = #MPI ranks per node.
+!
+!$ call omp_set_default_device (iprocsh)
+!$acc set device_num(iprocsh)
+!
+end subroutine
+!#######################################################################
+subroutine check_input
+!
+!-----------------------------------------------------------------------
+!
+! ****** Check the validity of the input parameters.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use vars
+ use solve_params
+ use mpidefs
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ), parameter :: one=1._r_typ
+!
+!-----------------------------------------------------------------------
+!
+! ****** Check that OPTION is valid and set the boundary condition
+! ****** switches accordingly.
+!
+ if (option.eq.'potential') then
+!
+! ****** For a potential field, set d(phi)/dr to zero at r=R1
+! ****** (i.e., the field is tangential to the boundary).
+!
+ pm_r1=one
+!
+ else if (option.eq.'open') then
+!
+! ****** For an open field, set phi to zero at r=R1
+!
****** (i.e., the field is radial there). +! + pm_r1=-one +! + else if (option.eq.'ss') then +! +! ****** For a source surface field, set phi to zero at r=R1 +! ****** (i.e., the field is radial there). +! + pm_r1=-one +! + else + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in CHECK_INPUT:' + write (*,*) '### Invalid OPTION:' + write (*,*) + write (*,*) 'OPTION = ',trim(option) + write (*,*) + write (*,*) 'The options allowed are:' + write (*,*) '''potential''' + write (*,*) '''open''' + write (*,*) '''ss''' + end if + call endrun (.true.) + end if +! + if (iamp0) then + write (*,*) + write (*,*) '### COMMENT from CHECK_INPUT:' + write (*,*) '### Field solve type:' + write (*,*) + write (*,*) 'OPTION = ',option + write (9,*) + write (9,*) '### COMMENT from CHECK_INPUT:' + write (9,*) '### Field solve type:' + write (9,*) + write (9,*) 'OPTION = ',option + end if +! +end subroutine +!####################################################################### +subroutine set_proc_topology +! +!----------------------------------------------------------------------- +! +! ****** Set the optimal values of the MPI rank topology +! ****** in dimensions not set by user. +! +!----------------------------------------------------------------------- +! + use mpidefs + use decomposition_params + use number_types + use global_dims +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: one=1.0_r_typ + real(r_typ), parameter :: zero=0.0_r_typ + real(r_typ), parameter :: bigval=HUGE(1.0_r_typ) +! +!----------------------------------------------------------------------- +! + integer, dimension(:), allocatable :: factors + integer, dimension(:,:), allocatable :: rank_factors + real(r_typ), dimension(:,:), allocatable :: nperrank + real(r_typ), dimension(:), allocatable :: penalty +! + integer :: i,j,k,num_fac,num_rank_fac,best_idx + integer :: fr=1,ft=1,fp=1 + real(r_typ) :: a12,a13,a23 +! +!----------------------------------------------------------------------- +! +! ****** Extract nproc values. A value of -1 indicates the dimension +! ****** should be autoset. +! + nproc_r=nprocs(1) + nproc_t=nprocs(2) + nproc_p=nprocs(3) +! +! ****** If no dimensions are to be autoset, return. +! + if(nproc_r.ne.-1.and.nproc_t.ne.-1.and.nproc_p.ne.-1) return +! +! ****** Get all factors of nproc and store them in factors array. +! + i=1 + num_fac=0 + do while(i.le.nproc) + if (MOD(nproc,i).eq.0) then + num_fac=num_fac+1 + endif + i=i+1 + enddo + allocate (factors(num_fac)) + i=1 + num_fac=0 + do while(i.le.nproc) + if (MOD(nproc,i).eq.0) then + num_fac=num_fac+1 + factors(num_fac)=i + endif + i=i+1 + enddo +! +! ****** Set penalty function parameters and any fixed dimensions +! ****** based on which dimensions are to be autoset. +! + a12=one + a13=one + a23=one +! + if (nproc_r.ne.-1) then + fr=nproc_r + a12=zero + a13=zero + end if + if (nproc_t.ne.-1) then + ft=nproc_t + a12=zero + a23=zero + end if + if (nproc_p.ne.-1) then + fp=nproc_p + a13=zero + a23=zero + end if +! +! ****** Loop over all combinations of factors and save those that +! ****** yield the correct number of MPI ranks into rank_factors array. +! 
+ num_rank_fac=0 + do k=1,num_fac + do j=1,num_fac + do i=1,num_fac + if(nproc_r.eq.-1) fr=factors(i) + if(nproc_t.eq.-1) ft=factors(j) + if(nproc_p.eq.-1) fp=factors(k) + if (fr*ft*fp.eq.nproc) then + num_rank_fac=num_rank_fac+1 + end if + enddo + enddo + enddo +! + if (num_rank_fac.eq.0) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in SET_PROC_TOPOLOGY:' + write (*,*) '### Processor topology specification error.' + write (*,*) 'No valid topologies found for selected options.' + write (*,*) 'Number of MPI ranks = ',nproc + write (*,*) 'NPROC_R = ',nproc_r + write (*,*) 'NPROC_T = ',nproc_t + write (*,*) 'NPROC_P = ',nproc_p + end if + call endrun (.true.) + end if +! + allocate(rank_factors(num_rank_fac,3)) + allocate(nperrank(num_rank_fac,3)) + allocate(penalty(num_rank_fac)) +! + rank_factors(:,:)=-1 + penalty(:)=bigval +! + num_rank_fac=0 + do k=1,num_fac + do j=1,num_fac + do i=1,num_fac + if(nproc_r.eq.-1) fr=factors(i) + if(nproc_t.eq.-1) ft=factors(j) + if(nproc_p.eq.-1) fp=factors(k) + if (fr*ft*fp.eq.nproc) then + num_rank_fac=num_rank_fac+1 + rank_factors(num_rank_fac,1)=fr + rank_factors(num_rank_fac,2)=ft + rank_factors(num_rank_fac,3)=fp + end if + enddo + enddo + enddo +! +! ****** Get number of grid points per rank for each dimension. +! + nperrank(:,1)=real(nr_g)/rank_factors(:,1) + nperrank(:,2)=real(nt_g)/rank_factors(:,2) + nperrank(:,3)=real(np_g)/rank_factors(:,3) +! +! ****** Compute penalty function. +! + penalty(:)=a12*(nperrank(:,1)-nperrank(:,2))**2 & + +a23*(nperrank(:,2)-nperrank(:,3))**2 & + +a13*(nperrank(:,3)-nperrank(:,1))**2 +! +! ****** Eliminate any choices that yield less than a minimum number +! ****** of grid points per rank. +! + do i=1,num_rank_fac + if (nperrank(i,1).lt.4) penalty(i)=bigval + if (nperrank(i,2).lt.4) penalty(i)=bigval + if (nperrank(i,3).lt.3) penalty(i)=bigval + enddo +! +! ****** Find optimal topology. +! + best_idx=MINLOC(penalty,1) +! + if (penalty(best_idx).eq.bigval) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in SET_PROC_TOPOLOGY:' + write (*,*) '### Processor topology specification error.' + write (*,*) 'No valid topologies found for selected options' + write (*,*) 'with selected grid. ' + write (*,*) 'It is likely you are using too many MPI ranks.' + write (*,*) 'Number of MPI ranks = ',nproc + write (*,*) 'NPROC_R = ',nproc_r + write (*,*) 'NPROC_T = ',nproc_t + write (*,*) 'NPROC_P = ',nproc_p + write (*,*) 'NR = ',nr_g + write (*,*) 'NT = ',nt_g + write (*,*) 'NP = ',np_g + end if + call endrun (.true.) + end if +! +! ****** Set optimal topology. +! + nprocs(1)=rank_factors(best_idx,1) + nprocs(2)=rank_factors(best_idx,2) + nprocs(3)=rank_factors(best_idx,3) +! + deallocate(factors) + deallocate(rank_factors) + deallocate(nperrank) + deallocate(penalty) +! +end subroutine +!####################################################################### +subroutine check_proc_topology +! +!----------------------------------------------------------------------- +! +! ****** Check the validity of the requested processor topology. +! +!----------------------------------------------------------------------- +! + use mpidefs + use decomposition_params +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: i,nreq +! +!----------------------------------------------------------------------- +! +! ****** Check the processor topology. +! 
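To see how the penalty above selects a topology, consider a hypothetical 128^3 grid on 8 ranks (with a12, a23, and a13 all equal to one): the balanced candidate wins because the squared differences of its per-rank extents vanish. A small standalone sketch, not part of the patch, before the validation loop that follows:

      program penalty_example
      implicit none
      real :: n1,n2,n3,penalty
!
! ****** Candidate (2,2,2): per-rank extents (64,64,64), penalty 0.
!
      n1=128./2.; n2=128./2.; n3=128./2.
      penalty=(n1-n2)**2+(n2-n3)**2+(n3-n1)**2
      write (*,*) '(2,2,2) penalty = ',penalty
!
! ****** Candidate (8,1,1): extents (16,128,128),
! ****** penalty 2*112**2 = 25088.
!
      n1=128./8.; n2=128./1.; n3=128./1.
      penalty=(n1-n2)**2+(n2-n3)**2+(n3-n1)**2
      write (*,*) '(8,1,1) penalty = ',penalty
      end program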
+ do i=1,3
+ if (nprocs(i).lt.1.and.nprocs(i).ne.-1) then
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### ERROR in CHECK_PROC_TOPOLOGY:'
+ write (*,*) '### Processor topology specification error.'
+ write (*,*) 'Invalid number of processors requested.'
+ write (*,*) 'Dimension = ',i
+ write (*,*) 'Number of processors requested = ', &
+ nprocs(i)
+ end if
+ call endrun (.true.)
+ end if
+ enddo
+!
+! ****** Set the optimal values of the topology if requested.
+!
+ call set_proc_topology
+!
+! ****** Check that the number of processors available
+! ****** matches the number requested.
+!
+ nreq=nprocs(1)*nprocs(2)*nprocs(3)
+!
+ if (nreq.ne.nproc) then
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### ERROR in CHECK_PROC_TOPOLOGY:'
+ write (*,*) '### Processor topology specification error.'
+ write (*,*) 'The number of processors requested does not'// &
+ ' equal the number available.'
+ write (*,*) 'Number of processors requested = ',nreq
+ write (*,*) 'Number of processors available = ',nproc
+ end if
+ call endrun (.true.)
+ end if
+!
+end subroutine
+!#######################################################################
+subroutine decompose_domain
+!
+!-----------------------------------------------------------------------
+!
+! ****** Decompose the domain into a Cartesian MPI topology.
+!
+!-----------------------------------------------------------------------
+!
+ use mpidefs
+ use decomposition_params
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ integer :: ierr
+!
+!-----------------------------------------------------------------------
+!
+ integer, parameter :: ndim=3
+ integer, dimension(ndim) :: coords
+ logical, dimension(ndim) :: periodic
+ logical :: reorder
+ logical, dimension(ndim) :: keep_dim
+!
+!-----------------------------------------------------------------------
+!
+! ****** Create a communicator over all processors, COMM_ALL,
+! ****** that has a Cartesian topology.
+!
+! ****** Specify the periodicity of the coordinate system.
+!
+ periodic(1)=.false.
+ periodic(2)=.false.
+ periodic(3)=.true.
+!
+! ****** Allow re-ordering in the Cartesian topology.
+!
+ reorder=.true.
+!
+ call MPI_Cart_create (MPI_COMM_WORLD,ndim,nprocs, &
+ periodic,reorder,comm_all,ierr)
+!
+! ****** Get the index (rank) of the local processor in
+! ****** communicator COMM_ALL in variable IPROC.
+!
+! ****** IMPORTANT NOTE:
+! ****** If re-ordering was allowed in the Cartesian topology
+! ****** creation (above), then the rank of the local processor
+! ****** in communicator COMM_ALL may be different from its rank
+! ****** in communicator MPI_COMM_WORLD.
+!
+ call MPI_Comm_rank (comm_all,iproc,ierr)
+!
+! ****** Set the processor rank IPROC0 in communicator COMM_ALL
+! ****** for the processor that has rank 0 in MPI_COMM_WORLD.
+! ****** This value is broadcast to all the processors.
+!
+ if (iamp0) then
+ iproc0=iproc
+ end if
+ call MPI_Bcast (iproc0,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)
+!
+! ****** Get the coordinate indices of this processor in the
+! ****** Cartesian MPI topology.
+!
+ call MPI_Cart_coords (comm_all,iproc,ndim,coords,ierr)
+!
+ iproc_r=coords(1)
+ iproc_t=coords(2)
+ iproc_p=coords(3)
+!
+ nproc_r=nprocs(1)
+ nproc_t=nprocs(2)
+ nproc_p=nprocs(3)
+!
+! ****** Get the rank of the neighboring processors in the
+! ****** Cartesian MPI topology.
+!
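One property worth noting before the MPI_Cart_shift calls that follow: since r and theta were declared non-periodic above, a shift off a physical boundary returns MPI_PROC_NULL, and MPI defines point-to-point communication with MPI_PROC_NULL as a no-op, so seam exchanges need no boundary special-casing. A hedged illustration (this helper is not in POT3D):

      subroutine neighbor_example (comm)
      use mpi
      implicit none
      integer, intent(in) :: comm
      integer :: rm,rp,ierr
!
! ****** Shift along r (dimension 0). On the first/last r rank,
! ****** RM/RP comes back as MPI_PROC_NULL; sends/receives addressed
! ****** to it complete immediately with no effect.
!
      call MPI_Cart_shift (comm,0,1,rm,rp,ierr)
      if (rm.eq.MPI_PROC_NULL) write (*,*) 'on the r=R0 boundary'
      end subroutine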
+ call MPI_Cart_shift (comm_all,0,1,iproc_rm,iproc_rp,ierr)
+ call MPI_Cart_shift (comm_all,1,1,iproc_tm,iproc_tp,ierr)
+ call MPI_Cart_shift (comm_all,2,1,iproc_pm,iproc_pp,ierr)
+!
+! ****** Create communicators for operations involving all
+! ****** processors in the phi dimension. These communicators
+! ****** are stored in COMM_PHI (and generally represent different
+! ****** communicators on different processors).
+!
+ keep_dim(1)=.false.
+ keep_dim(2)=.false.
+ keep_dim(3)=.true.
+!
+ call MPI_Cart_sub (comm_all,keep_dim,comm_phi,ierr)
+!
+! ****** Create communicators for operations involving
+! ****** all processors in the r dimension.
+! ****** These communicators are stored in COMM_R
+! ****** (and generally represent different communicators on
+! ****** different processors).
+!
+ keep_dim(1)=.true.
+ keep_dim(2)=.false.
+ keep_dim(3)=.false.
+!
+ call MPI_Cart_sub (comm_all,keep_dim,comm_r,ierr)
+!
+end subroutine
+!#######################################################################
+subroutine decompose_mesh_r
+!
+!-----------------------------------------------------------------------
+!
+! ****** Decompose the r mesh between processors.
+!
+!-----------------------------------------------------------------------
+!
+ use global_dims
+ use local_dims
+ use decomposition
+ use solve_params
+ use mpidefs
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ integer :: ierr,i,npts
+ integer :: i0_h,i1_h,i0_m,i1_m
+ integer, dimension(nproc_r) :: mp_r
+!
+!-----------------------------------------------------------------------
+!
+! ****** Decompose the r dimension.
+!
+ npts=nr_g
+!
+ call decompose_dimension (npts,nproc_r,mp_r,ierr)
+ if (ierr.ne.0) then
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### ERROR in DECOMPOSE_MESH_R:'
+ write (*,*) '### Anomaly in decomposing the mesh'// &
+ ' between processors.'
+ write (*,*) '### Could not decompose the r mesh.'
+ write (*,*) 'Number of mesh points in r = ',npts
+ write (*,*) 'Number of processors along r = ',nproc_r
+ end if
+ call endrun (.true.)
+ end if
+!
+! ****** Check that the resulting mesh topology is valid.
+!
+ call check_mesh_topology (nproc_r,mp_r,1,'r')
+!
+! ****** Compute the mapping between the processor decomposition
+! ****** and the global mesh.
+!
+! ****** Note that there is a two-point overlap in the mesh
+! ****** between adjacent processors in r.
+!
+ i0_g=1
+ do i=1,iproc_r
+ i0_g=i0_g+mp_r(i)
+ enddo
+ nr=mp_r(iproc_r+1)+2
+ i1_g=i0_g+nr-1
+!
+ nrm1=nr-1
+!
+! ****** Set the flags to indicate whether this processor has
+! ****** points on the physical boundaries.
+!
+ if (iproc_r.eq.0) then
+ rb0=.true.
+ else
+ rb0=.false.
+ end if
+!
+ if (iproc_r.eq.nproc_r-1) then
+ rb1=.true.
+ else
+ rb1=.false.
+ end if
+!
+! ****** Set the dimensions of arrays for the "main" meshes
+! ****** (i.e., the "m" mesh) for which normal derivatives are
+! ****** needed (e.g., v). These vary on different processors,
+! ****** depending on whether they are left-boundary, internal, or
+! ****** right-boundary processors.
+!
+ if (rb1) then
+ nrm=nrm1
+ else
+ nrm=nr
+ end if
+!
+! ****** Store the mapping structure (for this processor).
+!
+ allocate (map_rh(0:nproc-1))
+ allocate (map_rm(0:nproc-1))
+!
+ if (rb0) then
+ i0_h=1
+ else
+ i0_h=2
+ end if
+ if (rb1) then
+ i1_h=nr
+ else
+ i1_h=nrm1
+ end if
+!
+ if (rb0) then
+ i0_m=1
+ else
+ i0_m=2
+ end if
+ i1_m=nrm1
+!
+ map_rh(iproc)%i0=i0_h + map_rh(iproc)%i1=i1_h +! + map_rm(iproc)%i0=i0_m + map_rm(iproc)%i1=i1_m +! + map_rh(iproc)%offset=i0_g+map_rh(iproc)%i0-1 + map_rh(iproc)%n=map_rh(iproc)%i1-map_rh(iproc)%i0+1 +! + map_rm(iproc)%offset=i0_g+map_rm(iproc)%i0-1 + map_rm(iproc)%n=map_rm(iproc)%i1-map_rm(iproc)%i0+1 +! +! ****** Gather the mapping information by communicating among +! ****** all processors. +! + call gather_mapping_info (map_rh) + call gather_mapping_info (map_rm) +! +end subroutine +!####################################################################### +subroutine decompose_mesh_tp +! +!----------------------------------------------------------------------- +! +! ****** Decompose the theta and phi mesh between processors. +! +!----------------------------------------------------------------------- +! + use global_dims + use local_dims + use decomposition + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: ierr,j,k + integer :: j0_h,j1_h,j0_m,j1_m + integer :: k0_h,k1_h,k0_m,k1_m + integer, dimension(nproc_t) :: mp_t + integer, dimension(nproc_p) :: mp_p +! +!----------------------------------------------------------------------- +! +! ****** Decompose the t dimension. +! + call decompose_dimension (nt_g,nproc_t,mp_t,ierr) + if (ierr.ne.0) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in DECOMPOSE_MESH_TP:' + write (*,*) '### Anomaly in decomposing the mesh'// & + ' between processors.' + write (*,*) '### Could not decompose the theta mesh.' + write (*,*) 'Number of mesh points in theta = ',nt_g + write (*,*) 'Number of processors along theta = ',nproc_t + end if + call endrun (.true.) + end if +! +! ****** Decompose the p dimension. +! + call decompose_dimension (np_g,nproc_p,mp_p,ierr) + if (ierr.ne.0) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in DECOMPOSE_MESH_TP:' + write (*,*) '### Anomaly in decomposing the mesh'// & + ' between processors.' + write (*,*) '### Could not decompose the phi mesh.' + write (*,*) 'Number of mesh points in phi = ',np_g + write (*,*) 'Number of processors along phi = ',nproc_p + end if + call endrun (.true.) + end if +! +! ****** Check that the resulting mesh topology is valid. +! + call check_mesh_topology (nproc_t,mp_t,1,'theta') + call check_mesh_topology (nproc_p,mp_p,1,'phi') +! +! ****** Set the flag for an axisymmetric run (requested by +! ****** setting NP_G = 3). +! + if (np_g.eq.3) then + axisymmetric=.true. + else + axisymmetric=.false. + end if +! +! ****** Compute the mapping between the processor decomposition +! ****** and the global mesh. +! +! ****** Note that there is a two-point overlap in the mesh +! ****** between adjacent processors in theta and phi. +! + j0_g=1 + do j=1,iproc_t + j0_g=j0_g+mp_t(j) + enddo + nt=mp_t(iproc_t+1)+2 + j1_g=j0_g+nt-1 +! + k0_g=1 + do k=1,iproc_p + k0_g=k0_g+mp_p(k) + enddo + np=mp_p(iproc_p+1)+2 + k1_g=k0_g+np-1 +! + ntm1=nt-1 + npm1=np-1 +! +! ****** Set the flags to indicate whether this processor has +! ****** points on the physical boundaries. +! + if (iproc_t.eq.0) then + tb0=.true. + else + tb0=.false. + end if +! + if (iproc_t.eq.nproc_t-1) then + tb1=.true. + else + tb1=.false. + end if +! +! ****** Set the dimensions of arrays for the "main" meshes +! ****** (i.e., the "m" mesh) for which normal derivatives are +! ****** needed (e.g., v). These vary on different processors, +! 
 ****** depending on whether they are left-boundary, internal, or
+! ****** right-boundary processors.
+!
+ if (tb1) then
+ ntm=ntm1
+ else
+ ntm=nt
+ end if
+!
+! ****** Since the phi dimension is periodic, all processors
+! ****** have the same mesh limits.
+!
+ npm=np
+!
+! ****** Store the mapping structure (for this processor).
+!
+ allocate (map_th(0:nproc-1))
+ allocate (map_tm(0:nproc-1))
+ allocate (map_ph(0:nproc-1))
+ allocate (map_pm(0:nproc-1))
+!
+ if (tb0) then
+ j0_h=1
+ else
+ j0_h=2
+ end if
+ if (tb1) then
+ j1_h=nt
+ else
+ j1_h=ntm1
+ end if
+!
+ if (tb0) then
+ j0_m=1
+ else
+ j0_m=2
+ end if
+ j1_m=ntm1
+!
+ if (iproc_p.eq.0) then
+ k0_m=1
+ else
+ k0_m=2
+ end if
+ k1_m=npm1
+!
+ if (iproc_p.eq.0) then
+ k0_h=1
+ else
+ k0_h=2
+ end if
+ if (iproc_p.eq.nproc_p-1) then
+ k1_h=np
+ else
+ k1_h=npm1
+ end if
+!
+ map_th(iproc)%i0=j0_h
+ map_th(iproc)%i1=j1_h
+!
+ map_tm(iproc)%i0=j0_m
+ map_tm(iproc)%i1=j1_m
+!
+ map_ph(iproc)%i0=k0_h
+ map_ph(iproc)%i1=k1_h
+!
+ map_pm(iproc)%i0=k0_m
+ map_pm(iproc)%i1=k1_m
+!
+ map_th(iproc)%offset=j0_g+map_th(iproc)%i0-1
+ map_th(iproc)%n=map_th(iproc)%i1-map_th(iproc)%i0+1
+!
+ map_tm(iproc)%offset=j0_g+map_tm(iproc)%i0-1
+ map_tm(iproc)%n=map_tm(iproc)%i1-map_tm(iproc)%i0+1
+!
+ map_ph(iproc)%offset=k0_g+map_ph(iproc)%i0-1
+ map_ph(iproc)%n=map_ph(iproc)%i1-map_ph(iproc)%i0+1
+!
+ map_pm(iproc)%offset=k0_g+map_pm(iproc)%i0-1
+ map_pm(iproc)%n=map_pm(iproc)%i1-map_pm(iproc)%i0+1
+!
+! ****** Gather the mapping information by communicating among
+! ****** all processors.
+!
+ call gather_mapping_info (map_th)
+ call gather_mapping_info (map_tm)
+ call gather_mapping_info (map_ph)
+ call gather_mapping_info (map_pm)
+!
+end subroutine
+!#######################################################################
+subroutine check_mesh_dimensions (nr_g,nt_g,np_g)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Check that the requested (global) mesh dimensions are
+! ****** valid.
+!
+!-----------------------------------------------------------------------
+!
+ use mpidefs
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ integer :: nr_g,nt_g,np_g
+!
+!-----------------------------------------------------------------------
+!
+ if (nr_g.lt.4) then
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### ERROR in CHECK_MESH_DIMENSIONS:'
+ write (*,*) '### Invalid number of r mesh points'// &
+ ' requested.'
+ write (*,*) '### The minimum number of mesh points is 4.'
+ write (*,*)
+ write (*,*) 'Number of mesh points requested = ',nr_g
+ end if
+ call endrun (.true.)
+ end if
+!
+ if (nt_g.lt.4) then
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### ERROR in CHECK_MESH_DIMENSIONS:'
+ write (*,*) '### Invalid number of theta mesh points'// &
+ ' requested.'
+ write (*,*) '### The minimum number of mesh points is 4.'
+ write (*,*)
+ write (*,*) 'Number of mesh points requested = ',nt_g
+ end if
+ call endrun (.true.)
+ end if
+!
+ if (np_g.lt.3) then
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### ERROR in CHECK_MESH_DIMENSIONS:'
+ write (*,*) '### Invalid number of phi mesh points'// &
+ ' requested.'
+ write (*,*) '### The minimum number of mesh points is 3.'
+ write (*,*)
+ write (*,*) 'Number of mesh points requested = ',np_g
+ end if
+ call endrun (.true.)
+ end if
+!
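+! ****** Note that these lower limits reflect the two-point overlap
+! ****** used in the decomposition: NR_G,NT_G >= 4 leaves at least
+! ****** two interior points, while NP_G=3 is allowed as the special
+! ****** axisymmetric case (see DECOMPOSE_MESH_TP).
+!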
+end subroutine +!####################################################################### +subroutine decompose_dimension (nx,np,mp,ierr) +! +!----------------------------------------------------------------------- +! +! ****** Decompose the mesh points NX along NP processors. +! +! ****** The decomposed mesh points are returned in array MP. +! +!----------------------------------------------------------------------- +! +! ****** This routine attempts to assign the mesh points as equally +! ****** as possible between the processors. +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: nx + integer :: np + integer, dimension(np) :: mp + integer :: ierr +! +!----------------------------------------------------------------------- +! + integer :: nxm2,mpav,nrem +! +!----------------------------------------------------------------------- +! + ierr=0 +! + nxm2=nx-2 +! + if (nxm2.le.0) then + ierr=1 + return + end if +! + if (np.le.0) then + ierr=2 + return + end if +! + mpav=nxm2/np +! + mp(:)=mpav +! + nrem=nxm2-mpav*np +! + mp(1:nrem)=mp(1:nrem)+1 +! +end subroutine +!####################################################################### +subroutine check_mesh_topology (np,mp,min_pts,coord) +! +!----------------------------------------------------------------------- +! +! ****** Check the validity of the requested mesh topology. +! +!----------------------------------------------------------------------- +! + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: np + integer, dimension(np) :: mp + integer :: min_pts + character(*) :: coord +! +!----------------------------------------------------------------------- +! + integer :: i +! +!----------------------------------------------------------------------- +! +! ****** Check that the number of mesh points on each processor +! ****** is valid. +! + do i=1,np + if (mp(i).lt.min_pts) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in CHECK_MESH_TOPOLOGY:' + write (*,*) '### Mesh topology specification error.' + write (*,*) 'Invalid number of ',coord, & + ' mesh points requested.' + write (*,*) 'Processor index = ',i + write (*,*) 'Number of mesh points requested = ',mp(i) + write (*,*) 'Minimum number of mesh points allowed = ', & + min_pts + end if + call endrun (.true.) + end if + enddo +! +end subroutine +!####################################################################### +subroutine gather_mapping_info (map) +! +!----------------------------------------------------------------------- +! +! ****** Gather a mapping information array by communicating +! ****** among all processors. +! +!----------------------------------------------------------------------- +! + use mpidefs + use decomposition +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + type(map_struct), dimension(0:nproc-1) :: map +! +!----------------------------------------------------------------------- +! +! ****** Buffers for packing the data. +! + integer, parameter :: lbuf=4 + integer, dimension(lbuf) :: sbuf + integer, dimension(lbuf,0:nproc-1) :: rbuf +! +!----------------------------------------------------------------------- +! 
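+! ****** Each rank packs LBUF=4 integers (N, I0, I1, OFFSET) into
+! ****** SBUF; MPI_Allgather then fills column IRANK of RBUF with
+! ****** the values packed by rank IRANK, in rank order.
+!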
+ integer :: ierr,irank +! +!----------------------------------------------------------------------- +! +! ****** Put the local section of the mapping information +! ****** array into a buffer. +! + sbuf(1)=map(iproc)%n + sbuf(2)=map(iproc)%i0 + sbuf(3)=map(iproc)%i1 + sbuf(4)=map(iproc)%offset +! +! ****** Communicate among all processors. After this call, all +! ****** processors have the complete mapping information. +! + call MPI_Allgather (sbuf,lbuf,MPI_INTEGER, & + rbuf,lbuf,MPI_INTEGER,comm_all,ierr) +! +! ****** Extract the mapping information from the buffer. +! + do irank=0,nproc-1 + map(irank)%n =rbuf(1,irank) + map(irank)%i0 =rbuf(2,irank) + map(irank)%i1 =rbuf(3,irank) + map(irank)%offset=rbuf(4,irank) + enddo +! +end subroutine +!####################################################################### +subroutine decomp_diags +! +!----------------------------------------------------------------------- +! +! ****** Print diagnostics about the mesh decomposition. +! +!----------------------------------------------------------------------- +! + use global_dims + use global_mesh + use local_dims + use local_mesh + use mpidefs + use solve_params + use debug + use decomposition +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: ierr + integer :: irank + real(r_typ) :: n_per_grid_min,n_per_grid_max +! +!----------------------------------------------------------------------- +! + if (iamp0) then +! + n_per_grid_min=floor(real(nr_g)/nproc_r) & + *floor(real(nt_g)/nproc_t) & + *floor(real(np_g)/nproc_p) +! + n_per_grid_max=ceiling(real(nr_g)/nproc_r) & + *ceiling(real(nt_g)/nproc_t) & + *ceiling(real(np_g)/nproc_p) +! + write (*,*) + write (*,*) 'Total number of MPI ranks = ',nproc + write (*,*) + write (*,*) 'Number of MPI ranks in r = ',nproc_r + write (*,*) 'Number of MPI ranks in t = ',nproc_t + write (*,*) 'Number of MPI ranks in p = ',nproc_p + write (*,*) + write (*,*) 'Global mesh dimension in r = ',nr_g + write (*,*) 'Global mesh dimension in t = ',nt_g + write (*,*) 'Global mesh dimension in p = ',np_g + write (*,*) + write (*,'(A,F6.1)') ' Average # of r mesh pts per rank = ', & + real(nr_g)/nproc_r + write (*,'(A,F6.1)') ' Average # of t mesh pts per rank = ', & + real(nt_g)/nproc_t + write (*,'(A,F6.1)') ' Average # of p mesh pts per rank = ', & + real(np_g)/nproc_p + write (*,*) + write (*,'(A,F6.2,A)') ' Estimated load imbalance = ', & + 100.0*(1.0-real(n_per_grid_min)/real(n_per_grid_max)),' %' +! + write (9,*) + write (9,*) 'Total number of MPI ranks = ',nproc + write (9,*) + write (9,*) 'Number of MPI ranks in r = ',nproc_r + write (9,*) 'Number of MPI ranks in t = ',nproc_t + write (9,*) 'Number of MPI ranks in p = ',nproc_p + write (9,*) + write (9,*) 'Global mesh dimension in r = ',nr_g + write (9,*) 'Global mesh dimension in t = ',nt_g + write (9,*) 'Global mesh dimension in p = ',np_g + write (9,*) + write (9,'(A,F6.1)') ' Average # of r mesh pts per rank = ', & + real(nr_g)/nproc_r + write (9,'(A,F6.1)') ' Average # of t mesh pts per rank = ', & + real(nt_g)/nproc_t + write (9,'(A,F6.1)') ' Average # of p mesh pts per rank = ', & + real(np_g)/nproc_p + write (9,*) + write (9,'(A,F6.2,A)') ' Estimated load imbalance = ', & + 100.0*(1.0-real(n_per_grid_min)/real(n_per_grid_max)),' %' +! + end if +! + if (idebug.le.1) return +! 
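+! ****** For example, a 100x100x100 mesh on a 3x3x3 rank grid gives
+! ****** n_per_grid_min=33**3=35937 and n_per_grid_max=34**3=39304,
+! ****** i.e., an estimated load imbalance of about 8.6%.
+!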
+ do irank=0,nproc-1 + call MPI_Barrier (comm_all,ierr) + if (irank.eq.iproc) then + write (*,*) + write (*,*) + write (*,*) 'Rank id = ',iproc + write (*,*) 'nr = ',nr + write (*,*) 'nt = ',nt + write (*,*) 'np = ',np + write (*,*) 'i0_g = ',i0_g + write (*,*) 'i1_g = ',i1_g + write (*,*) 'j0_g = ',j0_g + write (*,*) 'j1_g = ',j1_g + write (*,*) 'k0_g = ',k0_g + write (*,*) 'k1_g = ',k1_g + write (*,*) 'Rank index in r = ',iproc_r + write (*,*) 'Rank index in t = ',iproc_t + write (*,*) 'Rank index in p = ',iproc_p + write (*,*) 'Rank to left in r = ',iproc_rm + write (*,*) 'Rank to right in r = ',iproc_rp + write (*,*) 'Rank to left in t = ',iproc_tm + write (*,*) 'Rank to right in t = ',iproc_tp + write (*,*) 'Rank to left in p = ',iproc_pm + write (*,*) 'Rank to right in p = ',iproc_pp + write (*,*) + write (*,*) 'Rank in MPI_COMM_WORLD = ',iprocw + write (*,*) 'Rank in COMM_ALL = ',iproc + if (idebug.gt.2) then + write (*,*) + write (*,*) 'r mesh:' + write (*,*) r + write (*,*) + write (*,*) 'theta mesh:' + write (*,*) t + write (*,*) + write (*,*) 'phi mesh:' + write (*,*) p + end if + end if + enddo +! +end subroutine +!####################################################################### +subroutine allocate_global_arrays +! +!----------------------------------------------------------------------- +! +! ****** Allocate global arrays. +! +!----------------------------------------------------------------------- +! + use global_dims + use global_mesh +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! +! ****** Allocate global mesh arrays. +! + allocate (r_g (nrm1_g)) + allocate (dr_g(nrm1_g)) +! + allocate (rh_g (nr_g)) + allocate (drh_g(nr_g)) +! + allocate (t_g (ntm1_g)) + allocate (dt_g(ntm1_g)) +! + allocate (th_g (nt_g)) + allocate (dth_g(nt_g)) +! + allocate (p_g (np_g)) + allocate (dp_g (np_g)) + allocate (ph_g (np_g)) + allocate (dph_g(np_g)) +! + allocate (st_g(ntm1_g)) + allocate (ct_g(ntm1_g)) +! + allocate (sth_g(nt_g)) + allocate (cth_g(nt_g)) +! + allocate (sp_g (np_g)) + allocate (cp_g (np_g)) + allocate (sph_g(np_g)) + allocate (cph_g(np_g)) +! +end subroutine +!####################################################################### +subroutine allocate_local_arrays_r +! +!----------------------------------------------------------------------- +! +! ****** Allocate local arrays for the r dimension and 3D arrays. +! +!----------------------------------------------------------------------- +! + use local_dims + use local_mesh + use fields +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + allocate (r (nrm)) + allocate (r2 (nrm)) + allocate (dr(nrm)) +! + allocate (rh (nr)) + allocate (drh(nr)) +! +! ****** Allocate inverse quantities. +! + allocate (r_i (nrm)) + allocate (dr_i(nrm)) +! + allocate (rh_i (nr)) + allocate (drh_i(nr)) +! +! ****** Allocate the potential array and cg scratch array. +! + allocate (phi(nr,nt,np)) + allocate (x_ax(nr,nt,np)) + phi(:,:,:)=0. + x_ax(:,:,:)=0. +!$omp target enter data map(to:phi,x_ax) +! +! ****** Allocate polar boundary arrays. +! + allocate (sum0(nr)) + allocate (sum1(nr)) + sum0(:)=0. + sum1(:)=0. +!$omp target enter data map(to:sum0,sum1) +! +! ****** Allocate the local magnetic field arrays. +! 
+ allocate (br(nrm,nt,np)) + allocate (bt(nr,ntm,np)) + allocate (bp(nr,nt,npm)) + br(:,:,:)=0. + bt(:,:,:)=0. + bp(:,:,:)=0. +! +end subroutine +!####################################################################### +subroutine allocate_local_arrays_tp +! +!----------------------------------------------------------------------- +! +! ****** Allocate local arrays for the theta and phi dimensions. +! +!----------------------------------------------------------------------- +! + use local_dims + use local_mesh + use fields + use vars + use solve_params +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + allocate (t (ntm)) + allocate (dt(ntm)) +! + allocate (th (nt)) + allocate (dth(nt)) +! + allocate (p (np)) + allocate (dp(np)) +! + allocate (ph (np)) + allocate (dph(np)) +! + allocate (st(ntm)) + allocate (ct(ntm)) +! + allocate (sth(nt)) + allocate (cth(nt)) +! + allocate (sp (np)) + allocate (cp (np)) + allocate (sph(np)) + allocate (cph(np)) +! +! ****** Allocate inverse quantities. +! + allocate (dt_i(ntm)) + allocate (st_i(ntm)) +! + allocate (dth_i(nt)) + allocate (sth_i(nt)) +! + allocate (dp_i (np)) + allocate (dph_i(np)) +! +! ****** Allocate the boundary radial magnetic field array(s). +! + allocate (br0(nt,np)) + br0(:,:)=0. +!$omp target enter data map(to:br0) +! + if (validation_run) then + allocate (br1(nt,np)) + br1(:,:)=0. +!$omp target enter data map(to:br1) + end if +! +end subroutine +!####################################################################### +subroutine set_global_mesh +! +!----------------------------------------------------------------------- +! +! ****** Define the global mesh arrays. +! +!----------------------------------------------------------------------- +! + use number_types + use global_dims + use global_mesh + use meshdef + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: zero=0._r_typ + real(r_typ), parameter :: one=1._r_typ + real(r_typ), parameter :: half=.5_r_typ +! +!----------------------------------------------------------------------- +! + integer :: i,j,k +! +!----------------------------------------------------------------------- +! +! ****** Define the radial mesh. +! + call genmesh (9,'r',nrm1_g,r0,r1,nmseg,rfrac,drratio,nfrmesh, & + .false.,zero,r_g) +! + do i=2,nrm1_g + rh_g(i)=.5*(r_g(i)+r_g(i-1)) + drh_g(i)=r_g(i)-r_g(i-1) + enddo + rh_g(1)=rh_g(2)-drh_g(2) + rh_g(nr_g)=rh_g(nrm1_g)+drh_g(nrm1_g) + drh_g(1)=drh_g(2) + drh_g(nr_g)=drh_g(nrm1_g) +! + do i=1,nrm1_g + dr_g(i)=rh_g(i+1)-rh_g(i) + enddo +! +! ****** Define the theta mesh. +! + call genmesh (9,'t',ntm1_g,t0,t1,nmseg,tfrac,dtratio, & + nftmesh,.false.,zero,t_g) +! + do j=2,ntm1_g + th_g(j)=.5*(t_g(j)+t_g(j-1)) + dth_g(j)=t_g(j)-t_g(j-1) + enddo + th_g(1)=th_g(2)-dth_g(2) + th_g(nt_g)=th_g(ntm1_g)+dth_g(ntm1_g) + dth_g(1)=dth_g(2) + dth_g(nt_g)=dth_g(ntm1_g) +! + do j=1,ntm1_g + dt_g(j)=th_g(j+1)-th_g(j) + enddo +! +! ****** Define the periodic phi mesh. +! + call genmesh (9,'p',npm1_g,p0,p1,nmseg,pfrac,dpratio, & + nfpmesh,.true.,phishift,p_g) + p_g(np_g)=p_g(2)+pl +! + do k=2,np_g + ph_g(k)=half*(p_g(k)+p_g(k-1)) + dph_g(k)=p_g(k)-p_g(k-1) + enddo + ph_g(1)=ph_g(npm1_g)-pl + dph_g(1)=dph_g(npm1_g) +! + do k=1,npm1_g + dp_g(k)=ph_g(k+1)-ph_g(k) + enddo + dp_g(np_g)=dp_g(2) +! +! 
****** Enforce exact periodicity to protect symmetry properties +! ****** from round-off errors (especially for axisymmetric cases). +! + dph_g(np_g)=dph_g(2) + dp_g(1)=dp_g(npm1_g) +! +! ****** Define global auxiliary mesh-related arrays. +! + st_g(:)=sin(t_g(:)) + ct_g(:)=cos(t_g(:)) + sth_g(:)=sin(th_g(:)) + cth_g(:)=cos(th_g(:)) +! + sp_g(:)=sin(p_g(:)) + cp_g(:)=cos(p_g(:)) + sph_g(:)=sin(ph_g(:)) + cph_g(:)=cos(ph_g(:)) +! +! ****** For an axisymmetric case, set the exact values of +! ****** sin(phi) and cos(phi) to preserve symmetry properties +! ****** in the presence of round-off errors. +! + if (axisymmetric) then + sp_g(2)=0. + cp_g(2)=one + sph_g(2)=0. + cph_g(2)=-one + end if +! +! ****** Enforce exact periodicity to protect symmetry properties +! ****** from round-off errors (especially for axisymmetric cases). +! + sph_g(1)=sph_g(npm1_g) + sph_g(np_g)=sph_g(2) + cph_g(1)=cph_g(npm1_g) + cph_g(np_g)=cph_g(2) + sp_g(1)=sp_g(npm1_g) + sp_g(np_g)=sp_g(2) + cp_g(1)=cp_g(npm1_g) + cp_g(np_g)=cp_g(2) +! +end subroutine +!####################################################################### +subroutine set_local_mesh_r +! +!----------------------------------------------------------------------- +! +! ****** Define the local r mesh arrays. +! +!----------------------------------------------------------------------- +! + use number_types + use global_dims + use global_mesh + use local_dims + use local_mesh +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: one=1._r_typ +! +!----------------------------------------------------------------------- +! + integer :: i +! +!----------------------------------------------------------------------- +! +! ****** Define the local meshes. +! + do i=1,nrm + r(i)=r_g(i0_g+i-1) + dr(i)=dr_g(i0_g+i-1) + enddo +! + dr1=dr(1) + drn=dr(nrm) +! + do i=1,nr + rh(i)=rh_g(i0_g+i-1) + drh(i)=drh_g(i0_g+i-1) + enddo +! +! ****** Define local auxiliary mesh-related arrays. +! + r2(:)=r(:)**2 + r_i(:)=one/r(:) + dr_i(:)=one/dr(:) + rh_i(:)=one/rh(:) + drh_i(:)=one/drh(:) +! +!$omp target enter data map(to:r,r2,r_i,dr,dr_i,dr1,drn,rh,rh_i,drh,drh_i) +end subroutine +!####################################################################### +subroutine set_local_mesh_tp +! +!----------------------------------------------------------------------- +! +! ****** Define the local theta and phi mesh arrays. +! +!----------------------------------------------------------------------- +! + use number_types + use global_dims + use global_mesh + use local_dims + use local_mesh +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: one=1._r_typ +! +!----------------------------------------------------------------------- +! + integer :: j,k,j0,j1 +! +!----------------------------------------------------------------------- +! +! ****** Define the local meshes. +! + do j=1,ntm + t(j)=t_g(j0_g+j-1) + dt(j)=dt_g(j0_g+j-1) + enddo +! + do j=1,nt + th(j)=th_g(j0_g+j-1) + dth(j)=dth_g(j0_g+j-1) + enddo +! + do k=1,npm + p(k)=p_g(k0_g+k-1) + dp(k)=dp_g(k0_g+k-1) + enddo +! + do k=1,np + ph(k)=ph_g(k0_g+k-1) + dph(k)=dph_g(k0_g+k-1) + enddo +! +! ****** Define local auxiliary mesh-related arrays. +! + do j=1,ntm + st(j)=st_g(j0_g+j-1) + ct(j)=ct_g(j0_g+j-1) + enddo +! 
+ do j=1,nt + sth(j)=sth_g(j0_g+j-1) + cth(j)=cth_g(j0_g+j-1) + enddo +! + do k=1,npm + sp(k)=sp_g(k0_g+k-1) + cp(k)=cp_g(k0_g+k-1) + enddo +! + do k=1,np + sph(k)=sph_g(k0_g+k-1) + cph(k)=cph_g(k0_g+k-1) + enddo +! + dt_i(:)=one/dt(:) + dth_i(:)=one/dth(:) + sth_i(:)=one/sth(:) + dp_i(:)=one/dp(:) + dph_i(:)=one/dph(:) +! +! ****** Prevent division by zero at the poles for sin(theta). +! + if (tb0) then + j0=2 + else + j0=1 + end if + if (tb1) then + j1=ntm1-1 + else + j1=ntm1 + end if +! + st_i(:)=0. + do j=j0,j1 + st_i(j)=one/st(j) + enddo +! +!$omp target enter data map(to:t,th,dt,dth,p,ph,dp,dph,st,ct,sth,cth, & +!$omp sp,cp,sph,cph,dt_i,dth_i,st_i,sth_i,dp_i,dph_i) +end subroutine +!####################################################################### +subroutine genmesh (io,label,nc,c0,c1,nseg,frac,dratio,nfilt,periodic, & + c_shift,c) +! +!----------------------------------------------------------------------- +! +! ****** Generate a one-dimensional mesh. +! +!----------------------------------------------------------------------- +! +! ****** Input arguments: +! +! IO : [integer] +! Fortran file unit number to which to write +! mesh diagnostics. Set IO=0 if diagnostics +! are not of interest. It is assumed that +! unit IO has been connected to a file prior +! to calling this routine. +! +! LABEL : [character(*)] +! Name for the mesh coordinate (example: 'x'). +! +! NC : [integer] +! Number of mesh points to load. +! +! C0 : [real] +! The starting location for the coordinate. +! +! C1 : [real] +! The ending location for the coordinate. +! It is required that C1.gt.C0. +! +! NSEG : [integer] +! Maximum number of mesh segments. +! The mesh spacing in each segment varies +! exponentially with a uniform amplification +! factor. The actual number of mesh segments +! used is NSEG or less. It is obtained from the +! information in array FRAC. +! +! FRAC : [real array, dimension NSEG] +! The normalized positions of the mesh segment +! boundaries (as a fraction of the size of the +! domain). For a non-periodic mesh, the first +! value of FRAC specified must equal 0. and the +! last value must equal 1. For a periodic mesh, +! FRAC must not contain both 0. and 1., since +! these represent the same point. +! +! DRATIO : [real array, dimension NSEG] +! The ratio of the mesh spacing at the end of a +! segment to that at the beginning. +! +! NFILT : [integer] +! The number of times to filter the mesh-point +! distribution array. Set NFILT=0 if filtering +! is not desired. Filtering can reduce +! discontinuities in the derivative of the mesh +! spacing. +! +! PERIODIC: [logical] +! A flag to indicate whether the mesh to be +! generated represents a periodic coordinate. +! If the coordinate is specified as periodic, +! the range [C0,C1] should be the whole periodic +! interval; the first mesh point is set at C0 +! and the last mesh point, C(NC), is set at C1. +! +! C_SHIFT : [real] +! Amount by which to shift the periodic coordinate. +! C_SHIFT is only used when PERIODIC=.true., +! and is ignored otherwise. A positive C_SHIFT +! moves the mesh points to the right. +! +! ****** Output arguments: +! +! C : [real array, dimension NC] +! The locations of the mesh points. +! +!----------------------------------------------------------------------- +! +! ****** The arrays DRATIO and FRAC define the mesh as follows. +! +! ****** For example, suppose that a (non-periodic) mesh with three +! ****** segments is desired. Suppose the domain size is c=[0:2]. +! 
****** In the first segment (with c between 0 and .5) the mesh +! ****** spacing is decreasing with c, such that DC at c=.5 is half +! ****** DC at c=0. From c=.5 to c=1, the mesh is uniform. From c=1 +! ****** to c=2, the mesh spacing is increasing with c such that DC at +! ****** c=2 is 10 times DC at c=1. This mesh would be specified by: +! ****** +! ****** FRAC=0.,.25,.5,1. +! ****** DRATIO=.5,1.,10. +! ****** +! ****** The variable C_SHIFT can be used to shift the mesh point +! ****** distribution for a periodic coordinate. For example, +! ****** suppose C represents mesh points in the interval [0,2*pi]. +! ****** C_SHIFT=.5*pi would move the distribution of mesh points +! ****** so that the original mesh point with C(1)=0. would be +! ****** close to .5*pi in the new mesh. +! +!----------------------------------------------------------------------- +! + use number_types + use mpidefs + use debug +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer, intent(in) :: io + character(*), intent(in) :: label + integer, intent(in) :: nc + real(r_typ), intent(in) :: c0,c1 + integer, intent(in) :: nseg + real(r_typ), dimension(nseg), intent(in) :: frac,dratio + integer, intent(in) :: nfilt + logical, intent(in) :: periodic + real(r_typ), intent(in) :: c_shift + real(r_typ), dimension(nc), intent(out) :: c +! +!----------------------------------------------------------------------- +! +! ****** Storage for the coordinate transformation. +! + integer :: ns + real(r_typ), dimension(:), allocatable :: xi,cs,a,r,f +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: zero=0._r_typ + real(r_typ), parameter :: one=1._r_typ + real(r_typ), parameter :: half=.5_r_typ + real(r_typ), parameter :: eps=1.e-5_r_typ +! +!----------------------------------------------------------------------- +! + integer :: i,j,nf,nr,ll,j0 + real(r_typ) :: alpha,dr,fac,d,dxi,xiv,cshft,xi_shift + real(r_typ), dimension(:), allocatable :: dc,rdc +! +!----------------------------------------------------------------------- +! +! ****** Check that the number of mesh points is valid. +! + if (nc.lt.2) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in GENMESH:' + write (*,*) '### Invalid number of mesh points specified.' + write (*,*) '### There must be at least two mesh points.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'Number of mesh points specified = ',nc + end if + call endrun (.true.) + end if +! +! ****** Check that a positive mesh interval has been specified. +! + if (c0.ge.c1) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in GENMESH:' + write (*,*) '### Invalid mesh interval specified.' + write (*,*) '### C1 must be greater than C0.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'C0 = ',c0 + write (*,*) 'C1 = ',c1 + end if + call endrun (.true.) + end if +! +! ****** Find the number of values of FRAC specified. +! + do nf=1,nseg-1 + if (frac(nf+1).eq.zero) exit + enddo +! +! ****** When no values have been specified (NF=1, the default), +! ****** a uniform mesh is produced. +! + if (nf.eq.1.and.frac(1).eq.zero) then + ns=1 + allocate (cs(ns+1)) + allocate (r(ns)) + cs(1)=c0 + cs(2)=c1 + r(1)=one + go to 100 + end if +! +! ****** Check that the specified values of FRAC are monotonically +! ****** increasing. +! 
+ do i=2,nf + if (frac(i).lt.frac(i-1)) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in GENMESH:' + write (*,*) '### Invalid mesh specification.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'The values in FRAC must increase'// & + ' monotonically.' + write (*,*) 'FRAC = ',frac(1:nf) + end if + call endrun (.true.) + end if + enddo +! +! ****** Check the specified values of FRAC. +! + if (periodic) then +! +! ****** A periodic mesh requires the specified values of FRAC +! ****** to be in the range 0. to 1. +! + if (frac(1).lt.zero.or.frac(nf).gt.one) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in GENMESH:' + write (*,*) '### Invalid mesh specification.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'For a periodic coordinate, the values in'// & + ' FRAC must be between 0. and 1.' + write (*,*) 'FRAC = ',frac(1:nf) + end if + call endrun (.true.) + end if +! +! ****** A periodic mesh cannot contain both 0. and 1. in FRAC, +! ****** since these represent the same point. +! + if (frac(1).eq.zero.and.frac(nf).eq.one) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in GENMESH:' + write (*,*) '### Invalid mesh specification.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'For a periodic coordinate, FRAC must not'// & + ' contain both 0. and 1.' + write (*,*) 'FRAC = ',frac(1:nf) + end if + call endrun (.true.) + end if +! + else +! +! ****** A non-periodic mesh requires the first specified value +! ****** of FRAC to be 0., and the last value to equal 1. +! + if (frac(1).ne.zero) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in GENMESH:' + write (*,*) '### Invalid mesh specification.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'For a non-periodic coordinate, the first'// & + ' value of FRAC must equal 0.' + write (*,*) 'FRAC = ',frac(1:nf) + end if + call endrun (.true.) + end if +! + if (frac(nf).ne.one) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in GENMESH:' + write (*,*) '### Invalid mesh specification.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'For a non-periodic coordinate, the last'// & + ' value of FRAC must equal 1.' + write (*,*) 'FRAC = ',frac(1:nf) + end if + call endrun (.true.) + end if +! + end if +! +! ****** Check that the required values of DRATIO have been set, +! ****** and are positive. +! + if (periodic) then + nr=nf + else + nr=nf-1 + end if +! + do i=1,nr + if (dratio(i).le.zero) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in GENMESH:' + write (*,*) '### Invalid mesh specification.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'A required value in DRATIO has not been'// & + ' set or is not positive.' + write (*,*) 'DRATIO = ',dratio(1:nr) + end if + call endrun (.true.) + end if + enddo +! +! ****** Check that an inherently discontinuous mesh has not been +! ****** specified inadvertently. +! + if (periodic.and.nr.eq.1.and.dratio(1).ne.one) then + if (iamp0) then + write (*,*) + write (*,*) '### WARNING from GENMESH:' + write (*,*) '### Discontinuous mesh specification.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'An inherently discontinuous mesh has been'// & + ' specified.' + write (*,*) 'FRAC = ',frac(1:nf) + write (*,*) 'DRATIO = ',dratio(1:nr) + end if + end if +! +! ****** Set the number of segments. +! + ns=nf-1 +! +! ****** For a periodic coordinate, add points at XI=0. and XI=1. +! ****** if they are not already present. +! 
+ if (periodic) then + if (frac(1).ne.zero) ns=ns+1 + if (frac(nf).ne.one) ns=ns+1 + end if +! + allocate (cs(ns+1)) + allocate (r(ns)) +! +! ****** Set up the coordinate limits of the segments. +! + if (periodic) then + if (frac(1).ne.zero) then + cs(1)=c0 + cs(2:nf+1)=c0+(c1-c0)*frac(1:nf) + if (frac(nf).ne.one) then + alpha=(one-frac(nf))/(frac(1)+one-frac(nf)) + r(1)=dratio(nr)/(one+alpha*(dratio(nr)-one)) + r(2:nr+1)=dratio(1:nr) + cs(ns+1)=c1 + r(ns)=one+alpha*(dratio(nr)-one) + else + r(1)=dratio(nr) + r(2:nr)=dratio(1:nr-1) + end if + else + cs(1:nf)=c0+(c1-c0)*frac(1:nf) + r(1:nr)=dratio(1:nr) + cs(ns+1)=c1 + end if + else + cs(1:nf)=c0+(c1-c0)*frac(1:nf) + r(1:nr)=dratio(1:nr) + end if +! + 100 continue +! + allocate (xi(ns+1)) + allocate (a(ns)) + allocate (f(ns)) +! +! ****** Compute the XI values at the segment limits. +! + do i=1,ns + dr=r(i)-one + if (abs(dr).lt.eps) then + f(i)=(cs(i+1)-cs(i))*(one+half*dr) + else + f(i)=(cs(i+1)-cs(i))*log(r(i))/dr + end if + enddo +! + fac=zero + do i=ns,1,-1 + fac=fac/r(i)+f(i) + enddo +! + d=f(1)/fac + xi(1)=zero + do i=2,ns + xi(i)=xi(i-1)+d + if (i.lt.ns) d=d*f(i)/(f(i-1)*r(i-1)) + enddo + xi(ns+1)=one +! +! ****** Set the amplification factor for each segment. +! + do i=1,ns + a(i)=log(r(i))/(xi(i+1)-xi(i)) + enddo +! +! ****** For a periodic coordinate, find the XI shift corresponding +! ****** to a shift of C_SHIFT in the coordinate. +! ****** Note that a positive value of C_SHIFT moves the mesh +! ****** points to the right. +! + if (periodic) then + cshft=-c_shift + call map_c_to_xi (periodic,ns,xi,cs,a,r,cshft,xi_shift) + else + xi_shift=0. + end if +! +! ****** Compute the location of the mesh points in array C +! ****** by mapping from the XI values. +! + dxi=one/(nc-one) +! + c(1)=c0 + do j=2,nc-1 + xiv=(j-1)*dxi + call map_xi_to_c (periodic,ns,xi,cs,a,r,cshft,xi_shift,xiv,c(j)) + enddo + c(nc)=c1 +! +! ****** Filter the mesh if requested. +! + if (nfilt.gt.0) then + do i=1,nfilt + if (periodic) then + call filter_coord_periodic (c1-c0,nc,c) + else + call filter_coord (nc,c) + end if + enddo + end if +! +! ****** Write out the mesh information. +! + if (io.gt.0.and.iamp0) then +! + write (io,*) + write (io,*) '### COMMENT from GENMESH:' + write (io,*) '### Mesh information for coordinate ',label,':' +! + if (idebug.gt.0) then + write (io,*) + write (io,*) 'Flag to indicate a periodic mesh: ',periodic + write (io,*) 'Number of mesh points = ',nc + write (io,*) 'Lower mesh limit = ',c0 + write (io,*) 'Upper mesh limit = ',c1 + write (io,*) 'Number of times to filter the mesh = ',nfilt + if (periodic) then + write (io,*) 'Amount to shift the mesh = ',c_shift + end if + end if +! + write (io,*) + write (io,*) 'Number of mesh segments = ',ns +! + ll=len_trim(label) +! + write (io,900) 'Segment xi0 xi1'// & + repeat (' ',16-ll)//label//'0'// & + repeat (' ',16-ll)//label//'1'// & + ' ratio' + do i=1,ns + write (io,910) i,xi(i),xi(i+1),cs(i),cs(i+1),r(i) + enddo +! + allocate (dc(nc)) + allocate (rdc(nc)) +! + dc=c-cshift(c,-1) + if (periodic) dc(1)=dc(nc) + rdc=dc/cshift(dc,-1) + if (periodic) rdc(1)=rdc(nc) +! + write (io,*) + write (io,920) 'Mesh-point locations:' + write (io,920) ' i'// & + repeat (' ',18-ll)//label// & + repeat (' ',17-ll)//'d'//label// & + ' ratio' +! + if (periodic) then + j0=1 + else + j0=3 + write (io,930) 1,c(1) + write (io,930) 2,c(2),dc(2) + end if + do j=j0,nc + write (io,930) j,c(j),dc(j),rdc(j) + enddo +! + deallocate (dc) + deallocate (rdc) +! + end if +! 
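+! ****** Within each segment the spacing varies exponentially in xi,
+! ****** so the mapping used by MAP_XI_TO_C below is
+! ******
+! ****** c(xi) = cs(i) + (cs(i+1)-cs(i))
+! ****** *(exp(a(i)*(xi-xi(i)))-1)/(r(i)-1),
+! ******
+! ****** where r(i)=exp(a(i)*(xi(i+1)-xi(i))) is the ratio of the
+! ****** spacing at the end of segment i to that at its beginning.
+!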
+ 900 format (/,tr1,a) + 910 format (tr1,i4,2x,2f10.6,4f17.8) + 920 format (tr1,a) + 930 format (tr1,i6,3f18.8) +! + deallocate (cs) + deallocate (r) + deallocate (xi) + deallocate (a) + deallocate (f) +! +end subroutine +!####################################################################### +subroutine map_xi_to_c (periodic,ns,xi,cs,a,r,cshft,xi_shift,xiv,cv) +! +!----------------------------------------------------------------------- +! +! ****** Get the mesh coordinate value CV for the specified +! ****** xi value XIV. +! +! ****** Set PERIODIC=.true. for a periodic coordinate. +! ****** NS is the number of segments in the mesh definition. +! ****** The arrays XI, CS, A, and R define the mesh mapping. +! +! ****** CSHFT represents the amount to shift a periodic coordinate. +! ****** XI_SHIFT represents the corresponding amount to shift xi. +! +! ****** This is a utility routine for GENMESH. +! +!----------------------------------------------------------------------- +! + use number_types + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + logical, intent(in) :: periodic + integer, intent(in) :: ns + real(r_typ), dimension(ns+1), intent(in) :: xi,cs + real(r_typ), dimension(ns), intent(in) :: a,r + real(r_typ), intent(in) :: cshft,xi_shift + real(r_typ), intent(in) :: xiv + real(r_typ), intent(out) :: cv +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: zero=0._r_typ + real(r_typ), parameter :: one=1._r_typ + real(r_typ), parameter :: half=.5_r_typ + real(r_typ), parameter :: eps=1.e-5_r_typ +! +!----------------------------------------------------------------------- +! + integer :: i + real(r_typ) :: xiv_p,d,d1,da,da1,fac +! +!----------------------------------------------------------------------- +! + real(r_typ), external :: fold +! +!----------------------------------------------------------------------- +! +! ****** Find the index of the segment to which XIV belongs. +! + if (periodic) then +! +! ****** Shift XIV by XI_SHIFT. +! + xiv_p=xiv+xi_shift +! +! ****** Fold XIV_P into the main interval. +! + xiv_p=fold(zero,one,xiv_p) +! + else +! + xiv_p=xiv +! + end if +! + do i=1,ns + if (xiv_p.ge.xi(i).and.xiv_p.le.xi(i+1)) exit + enddo +! + if (i.gt.ns) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in MAP_XI_TO_C:' + write (*,*) '### Error in finding the XI segment.' + write (*,*) '### Could not find XIV in the XI table.' + write (*,*) '[Utility routine for GENMESH.]' + write (*,*) '[This is an internal error.]' + write (*,*) 'XI = ',xi + write (*,*) 'XIV = ',xiv + write (*,*) 'XIV_P = ',xiv_p + end if + call endrun (.true.) + end if +! + d =xiv_p -xi(i) + d1=xi(i+1)-xi(i) +! + da =a(i)*d + da1=a(i)*d1 +! +! ****** Interpolate the mapping function at XIV_P. +! + if (abs(da1).lt.eps) then + fac=(d*(one+half*da))/(d1*(one+half*da1)) + else + fac=(exp(da)-one)/(r(i)-one) + end if +! + cv=cs(i)+(cs(i+1)-cs(i))*fac +! + if (periodic) then +! +! ****** Shift CV by the amount CSHFT. +! + cv=cv-cshft +! +! ****** Fold CV into the main interval. +! + cv=fold(cs(1),cs(ns+1),cv) +! + end if +! +end subroutine +!####################################################################### +subroutine map_c_to_xi (periodic,ns,xi,cs,a,r,cv,xiv) +! +!----------------------------------------------------------------------- +! +! ****** Get the xi value XIV for the specified coordinate value CV. 
+! +! ****** Set PERIODIC=.true. for a periodic coordinate. +! ****** NS is the number of segments in the mesh definition. +! ****** The arrays XI, CS, A, and R define the mesh mapping. +! +! ****** This is a utility routine for GENMESH. +! +!----------------------------------------------------------------------- +! + use number_types + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + logical, intent(in) :: periodic + integer, intent(in) :: ns + real(r_typ), dimension(ns+1), intent(in) :: xi,cs + real(r_typ), dimension(ns), intent(in) :: a,r + real(r_typ), intent(in) :: cv + real(r_typ), intent(out) :: xiv +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: one=1._r_typ + real(r_typ), parameter :: eps=1.e-5_r_typ +! +!----------------------------------------------------------------------- +! + integer :: i + real(r_typ) :: cv_p,d,da,fac +! +!----------------------------------------------------------------------- +! + real(r_typ), external :: fold +! +!----------------------------------------------------------------------- +! +! ****** Find the index of the segment to which CV belongs. +! + if (periodic) then +! +! ****** Fold CV_P into the main interval. +! + cv_p=fold(cs(1),cs(ns+1),cv) +! + else +! + cv_p=cv +! + end if +! + do i=1,ns + if (cv_p.ge.cs(i).and.cv_p.le.cs(i+1)) exit + enddo +! + if (i.gt.ns) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in MAP_C_TO_XI:' + write (*,*) '### Error in finding the CS segment.' + write (*,*) '### Could not find CV in the CS table.' + write (*,*) '[Utility routine for GENMESH.]' + write (*,*) '[This is an internal error.]' + write (*,*) 'CS = ',cs + write (*,*) 'CV = ',cv + write (*,*) 'CV_P = ',cv_p + end if + call endrun (.true.) + end if +! + d=(cv_p-cs(i))/(cs(i+1)-cs(i)) + da=(r(i)-one)*d +! +! ****** Interpolate the mapping function at XIV_P. +! + if (abs(da).lt.eps) then + fac=d*(xi(i+1)-xi(i)) + else + fac=log(da+one)/a(i) + end if +! + xiv=xi(i)+fac +! +end subroutine +!####################################################################### +subroutine filter_coord (n,f) +! +!----------------------------------------------------------------------- +! +! ****** Apply a "(1,2,1)/4" low-pass digital filter to a +! ****** 1D coordinate. +! +! ****** The end-points F(1) and F(N) are not changed. +! +!----------------------------------------------------------------------- +! + use number_types +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: n + real(r_typ), dimension(n) :: f +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: two=2._r_typ + real(r_typ), parameter :: quarter=.25_r_typ +! +!----------------------------------------------------------------------- +! + real(r_typ), dimension(n) :: ff +! +!----------------------------------------------------------------------- +! + integer :: i +! +!----------------------------------------------------------------------- +! +! ****** Make a copy of the function. +! + ff=f +! +! ****** Apply the filter. +! + do i=2,n-1 + f(i)=quarter*(ff(i-1)+two*ff(i)+ff(i+1)) + enddo +! 
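+! ****** For example, f=(0,1,4,9,16) filters to f=(0,1.5,4.5,9.5,16):
+! ****** the end points are unchanged, and each interior point is
+! ****** pulled toward the average of its neighbors.
+!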
+end subroutine +!####################################################################### +subroutine filter_coord_periodic (xl,n,f) +! +!----------------------------------------------------------------------- +! +! ****** Apply a "(1,2,1)/4" low-pass digital filter to a +! ****** periodic 1D coordinate. +! +!----------------------------------------------------------------------- +! +! ****** XL is the periodic interval for the coordinate. +! +! ****** The filtered coordinate is translated so that F(1) +! ****** is preserved. +! +!----------------------------------------------------------------------- +! + use number_types +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ) :: xl + integer :: n + real(r_typ), dimension(n) :: f +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: two=2._r_typ + real(r_typ), parameter :: quarter=.25_r_typ +! +!----------------------------------------------------------------------- +! + real(r_typ), dimension(0:n+1) :: ff +! +!----------------------------------------------------------------------- +! + integer :: i + real(r_typ) :: f1old,f1new +! +!----------------------------------------------------------------------- +! +! ****** Save the value of F(1). +! + f1old=f(1) +! +! ****** Make a periodic copy of the function. +! + ff(1:n)=f(:) +! + ff(0)=f(n-1)-xl + ff(n+1)=f(2)+xl +! +! ****** Apply the filter. +! + do i=1,n + f(i)=quarter*(ff(i-1)+two*ff(i)+ff(i+1)) + enddo +! +! ****** Translate F so that F(1) is preserved. +! + f1new=f(1) + do i=1,n + f(i)=f(i)-f1new+f1old + enddo +! +end subroutine +!####################################################################### +function fold (x0,x1,x) +! +!----------------------------------------------------------------------- +! +! ****** "Fold" X into the periodic interval [X0,X1]. +! +! ****** On return, X is such that X0.le.X.lt.X1. +! +!----------------------------------------------------------------------- +! +! ****** It is assumed that X0 does not equal X1, as is physically +! ****** necessary. If X0 and X1 are equal, the routine just +! ****** returns with FOLD=X. +! +!----------------------------------------------------------------------- +! + use number_types +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ) :: fold + real(r_typ) :: x0,x1,x +! +!----------------------------------------------------------------------- +! + real(r_typ) :: xl +! +!----------------------------------------------------------------------- +! + fold=x +! + if (x0.eq.x1) return +! + xl=x1-x0 +! + fold=mod(x-x0,xl)+x0 +! + if (fold.lt.x0) fold=fold+xl + if (fold.ge.x1) fold=fold-xl +! + return +end function +!####################################################################### +subroutine set_flux +! +!----------------------------------------------------------------------- +! +! ****** Set the radial magnetic field at the photosphere. +! +!----------------------------------------------------------------------- +! + use number_types + use global_dims + use global_mesh + use local_dims + use local_mesh + use fields + use vars + use mpidefs + use timing +! +!----------------------------------------------------------------------- +! + implicit none +! 
+!----------------------------------------------------------------------- +! +! ****** Global flux array. +! + real(r_typ), dimension(:,:), allocatable :: br0_g +! +!----------------------------------------------------------------------- +! + integer :: j,k,ierr +! +!----------------------------------------------------------------------- +! + call timer_on +! + allocate (br0_g(nt_g,np_g)) +! +! ****** Define the global flux array. +! +! ****** Read the flux from file BR0FILE (only on processor IPROC0). +! + if (iamp0) then + call readbr (br0file,br0_g,ierr) + end if + call check_error_on_p0 (ierr) +! +! ****** Broadcast BR0_G to all the processors. +! + call MPI_Bcast (br0_g,nt_g*np_g,ntype_real,0,comm_all,ierr) +! +! ****** For a fully open field, reverse negative Br +! ****** (i.e., use the monopole trick). +! + if (option.eq.'open') then +! +! ****** Write the boundary flux (before the sign flip) to a file +! ****** if requested. +! + if (iamp0) then + if (br_photo_original_file.ne.'') then + write (*,*) + write (*,*) '### COMMENT from SET_FLUX:' + write (*,*) + write (*,*) 'Writing BR0 (before sign flip) to file: ', & + trim(br_photo_original_file) + write (9,*) + write (9,*) '### COMMENT from SET_FLUX:' + write (9,*) + write (9,*) 'Writing BR0 (before sign flip) to file: ', & + trim(br_photo_original_file) + call wrhdf_2d (br_photo_original_file,.true.,nt_g,np_g, & + br0_g,th_g,ph_g,hdf32,ierr) + end if + end if +! +! ****** Reverse Br. +! + br0_g(:,:)=abs(br0_g(:,:)) +! + end if +! +! ****** Write the boundary flux to a file if requested. +! + if (iamp0) then + if (br_photo_file.ne.' ') then + write (*,*) + write (*,*) '### COMMENT from SET_FLUX:' + write (*,*) + write (*,*) 'Writing BR0 to file: ',trim(br_photo_file) + write (9,*) + write (9,*) '### COMMENT from SET_FLUX:' + write (9,*) + write (9,*) 'Writing BR0 to file: ',trim(br_photo_file) + call wrhdf_2d (br_photo_file,.true.,nt_g,np_g, & + br0_g,th_g,ph_g,hdf32,ierr) + end if + end if +! + do j=1,nt + do k=1,np + br0(j,k)=br0_g(j0_g+j-1,k0_g+k-1) + enddo + enddo +!$omp target update to(br0) +! + deallocate(br0_g) +! + call timer_off (t_io) +! +end subroutine +!####################################################################### +subroutine potfld +! +!----------------------------------------------------------------------- +! +! ****** Find the potential field solution. +! +!----------------------------------------------------------------------- +! + use number_types + use local_dims + use local_mesh + use fields + use cgcom + use solve_params + use mpidefs + use debug + use timing + use matrix_storage_pot3d_solve +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: one=1._r_typ +! +!----------------------------------------------------------------------- +! + integer :: ierr,nrm2,ntm2,npm2,i +! + real(r_typ), dimension(:), allocatable :: rhs_cg,x_cg +! +!----------------------------------------------------------------------- +! +! ****** Load matrix and preconditioner. +! + nrm2=nrm1-1 + ntm2=ntm1-1 + npm2=npm1-1 +! + a_offsets(1)=-nrm2*ntm2 + a_offsets(2)=-nrm2 + a_offsets(3)=-1 + a_offsets(4)= 0 + a_offsets(5)= 1 + a_offsets(6)= nrm2 + a_offsets(7)= nrm2*ntm2 +! +! ****** Allocate cg 1D vectors. +! + N=nrm2*ntm2*npm2 +! +! ****** Prepare the guess, and rhs for the solve. +! + allocate(rhs_cg(N)) + allocate(x_cg(N)) + rhs_cg(:)=0. + x_cg(:)=0. 
+!$omp target enter data map(to:rhs_cg,x_cg)
+!
+ call getM (N,a_offsets,M)
+ call alloc_pot3d_matrix_coefs
+ call load_matrix_pot3d_solve
+!$omp target enter data map(to:a)
+ call load_preconditioner_pot3d_solve
+!$omp target enter data map(to:a_i)
+!
+! ****** Use a trick to accumulate the contribution of the
+! ****** boundary conditions (i.e., the inhomogeneous part):
+! ****** apply DELSQ to a field that is zero in the interior and
+! ****** carries only the boundary values, then move the result
+! ****** to the right-hand side.
+!
+ call set_boundary_points (x_ax,one)
+ call seam_gen (x_ax,nr,nt,np)
+ call delsq (x_ax,rhs_cg)
+!
+! ****** The original RHS is zero, so just use the negative of the
+! ****** boundary-trick contributions:
+!
+ do concurrent (i=1:N)
+ rhs_cg(i)=-rhs_cg(i)
+ enddo
+!
+! ****** Solve for the potential.
+!
+ if (idebug.gt.0.and.iamp0) then
+ write (*,*)
+ write (*,*) '### COMMENT from POTFLD:'
+ write (*,*) '### Doing a solution:'
+ end if
+!
+ call solve (x_cg,rhs_cg,N,ierr)
+!
+ if (ierr.ne.0) then
+ call endrun (.true.)
+ end if
+!
+ call unpack_scalar (phi,x_cg)
+!
+ call set_boundary_points (phi,one)
+ call seam_gen (phi,nr,nt,np)
+!
+!$omp target exit data map(delete:rhs_cg,x_cg,a,a_i)
+ call dealloc_pot3d_matrix_coefs
+ deallocate(rhs_cg)
+ deallocate(x_cg)
+!
+end subroutine
+!#######################################################################
+subroutine set_validation_flux
+!
+!-----------------------------------------------------------------------
+!
+! ****** Set the radial magnetic field at the photosphere.
+! ****** This uses the tilted dipole analytic solution for validation.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use global_dims
+ use global_mesh
+ use local_dims
+ use local_mesh
+ use fields
+ use vars
+ use mpidefs
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ), parameter :: two=2.0_r_typ
+!
+!-----------------------------------------------------------------------
+!
+! ****** Global Br boundary arrays.
+!
+ real(r_typ), dimension(:,:), allocatable :: br0_g
+ real(r_typ), dimension(:,:), allocatable :: br1_g
+!
+!-----------------------------------------------------------------------
+!
+ integer :: j,k
+ real(r_typ) :: sum00,sum11
+!
+!-----------------------------------------------------------------------
+!
+! ****** Define the global boundary condition arrays.
+!
+ allocate (br0_g(nt_g,np_g))
+ allocate (br1_g(nt_g,np_g))
+!
+! ****** Set the tilted dipole analytic Br.
+!
+ do j=1,nt_g
+ do k=1,np_g
+ br0_g(j,k)=(two/r0**3)*(cth_g(j)*cos(dipole_angle) + &
+ sth_g(j)*cph_g(k)*sin(dipole_angle))
+ br1_g(j,k)=(two/r1**3)*(cth_g(j)*cos(dipole_angle) + &
+ sth_g(j)*cph_g(k)*sin(dipole_angle))
+ enddo
+ enddo
+!
+! ****** Set Br to be periodic (should not be necessary?).
+!
+ br0_g(:,1)=br0_g(:,npm1_g)
+ br0_g(:,np_g)=br0_g(:,2)
+ br1_g(:,1)=br1_g(:,npm1_g)
+ br1_g(:,np_g)=br1_g(:,2)
+!
+! ****** Set BCs at the poles.
+!
+ sum00=sum(br0_g( 2,2:npm1_g)*dph_g(2:npm1_g))*pl_i
+ sum11=sum(br0_g(ntm1_g,2:npm1_g)*dph_g(2:npm1_g))*pl_i
+!
+ br0_g(1 ,:)=two*sum00-br0_g( 2,:)
+ br0_g(nt_g,:)=two*sum11-br0_g(ntm1_g,:)
+!
+ sum00=sum(br1_g( 2,2:npm1_g)*dph_g(2:npm1_g))*pl_i
+ sum11=sum(br1_g(ntm1_g,2:npm1_g)*dph_g(2:npm1_g))*pl_i
+!
+ br1_g(1 ,:)=two*sum00-br1_g( 2,:)
+ br1_g(nt_g,:)=two*sum11-br1_g(ntm1_g,:)
+!
+ do j=1,nt
+ do k=1,np
+ br0(j,k)=br0_g(j0_g+j-1,k0_g+k-1)
+ br1(j,k)=br1_g(j0_g+j-1,k0_g+k-1)
+ enddo
+ enddo
+!$omp target update to(br0,br1)
+!
+ deallocate(br0_g)
+ deallocate(br1_g)
+!
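+! ****** (The polar conditions above set each ghost row to
+! ****** 2*<Br>_phi - Br(first interior row), so the ghost and first
+! ****** interior rows average to the phi-mean of the interior row.)
+!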
+end subroutine +!####################################################################### +subroutine write_validation_solution +! +!----------------------------------------------------------------------- +! +! ****** Write out the tilted dipole analytic solution for validation. +! ****** This writes out the phi, B, and B computed from phi. +! +!----------------------------------------------------------------------- +! + use number_types + use global_dims + use global_mesh + use local_dims + use local_mesh + use fields + use vars + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: one=1.0_r_typ + real(r_typ), parameter :: two=2.0_r_typ +! +!----------------------------------------------------------------------- +! + integer :: i,j,k + character(256) :: fnamephi,fnamebr,fnamebt,fnamebp +! +!----------------------------------------------------------------------- +! +! ****** Set phi. +! + do concurrent (k=1:np, j=1:nt, i=1:nr) + phi(i,j,k)=(-one/rh(i)**2)*(cth(j)*cos(dipole_angle)+ & + sth(j)*cph(k)*sin(dipole_angle)) + enddo +! + call set_boundary_points (phi,one) + call seam_gen (phi,nr,nt,np) +! +!$omp target enter data map(alloc:br,bt,bp) +! +! ****** Set Br. +! + do concurrent (k=1:np, j=1:nt, i=1:nrm1) + br(i,j,k)=(two/r(i)**3)*(cth(j)*cos(dipole_angle) + & + sth(j)*cph(k)*sin(dipole_angle)) + enddo +! +! ****** Set Bt. +! + do concurrent (k=1:np, j=1:ntm1, i=1:nr) + bt(i,j,k)=(one/rh(i)**3)*(st(j)*cos(dipole_angle) - & + ct(j)*cph(k)*sin(dipole_angle)) + enddo +! +! ****** Set Bp. +! + do concurrent (k=1:npm1, j=1:nt, i=1:nr) + bp(i,j,k)=(one/rh(i)**3)*(sph(k)*sin(dipole_angle)) + enddo +! +! ****** Due to half mesh trig issues on periodic domain, we need +! ****** to seam these even though seaming is not needed in getb(). +! + call seam_gen (br,nrm,nt,np) + call seam_gen (bt,nr,ntm,np) + call seam_gen (bp,nr,nt,npm) +! +! ****** Backup fnames and set new ones so we can reuse write routine. +! + fnamephi=phifile + phifile='phi_exact.'//trim(fmt) + fnamebr=brfile + brfile='br_exact.'//trim(fmt) + fnamebt=btfile + btfile='bt_exact.'//trim(fmt) + fnamebp=bpfile + bpfile='bp_exact.'//trim(fmt) +! + call write_solution +! +! ****** Restore original output names. +! + phifile=fnamephi + brfile=fnamebr + btfile=fnamebt + bpfile=fnamebp +! +! ****** Reset phi. +! + phi(:,:,:)=0. +!$omp target update to(phi) +!$omp target exit data map(delete:br,bt,bp) +! +end subroutine +!####################################################################### +subroutine solve (x,rhs,N,ierr) +! +!----------------------------------------------------------------------- +! +! ****** Solve the implicit equations iteratively. +! +!----------------------------------------------------------------------- +! +! ****** Return IERR=0 if the iteration converges; otherwise, +! ****** IERR is set to a nonzero value. +! +! ****** X is the initial guess at the solution. +! ****** RHS is the right-hand side. +! +!----------------------------------------------------------------------- +! + use number_types + use cgcom + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: N + real(r_typ), dimension(N) :: x,rhs + integer :: ierr +! +!----------------------------------------------------------------------- +! +! 
****** Solve the equations using the CG method. +! + call cgsolve (x,rhs,N,ierr) +! +! ****** Check for convergence. +! + if (ierr.ne.0) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in SOLVE:' + write (*,*) '### The field solver did not converge.' + write (*,*) 'IERR = ',ierr + write (*,100) ncg,epsn + 100 format (1x,'N = ',i6,' EPSN = ',1pe13.6) + end if + else + if (iamp0) then + write (*,*) + write (*,*) '### COMMENT from SOLVE:' + write (*,*) '### The field solver converged.' + write (*,*) 'Number of iterations = ',ncg + write (9,*) + write (9,*) '### COMMENT from SOLVE:' + write (9,*) '### The field solver converged.' + write (9,*) 'Number of iterations = ',ncg + end if + end if +! +end subroutine +!####################################################################### +subroutine cgsolve (x,r,N,ierr) +! +!----------------------------------------------------------------------- +! +! ****** Solve the linear system: +! +! A * x = b +! +! ****** using the classical Conjugate Gradient method for symmetric +! ****** and positive-definite matrices. +! +!----------------------------------------------------------------------- +! +! ****** On input, X(N) contains a guess at the solution, and +! ****** R(N) contains the right-hand side, b. +! +! ****** On exit, X contains an estimate to the solution, and +! ****** R contains the residual (b-Ax). +! +! ****** IERR=0 indicates that the solution converged to the +! ****** requested accuracy. Other values indicate that the +! ****** iteration did not converge for the given maximum number +! ****** of iterations. +! +!----------------------------------------------------------------------- +! + use number_types + use cgcom +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: N + real(r_typ), dimension(N) :: x,r + integer :: ierr,i +! +!----------------------------------------------------------------------- +! +! ****** Scratch space for the CG iteration vectors. +! + real(r_typ), dimension(N), target :: p,ap +! +!----------------------------------------------------------------------- +! + real(r_typ), external :: cgdot +! +!----------------------------------------------------------------------- +! + real(r_typ) :: bdotb,rdotr,pdotap,alphai,rdotr_old,betai +! +!----------------------------------------------------------------------- +! + ncg=0 +!$omp target enter data map(alloc:p,ap) +! +! ****** Get the norm of the RHS. +! + do concurrent (i=1:N) + p(i)=r(i) + enddo +! + call prec_inv (p) + bdotb=cgdot(r,p,N) +! +! ****** If the RHS is zero, return with a zero solution. +! + if (bdotb.eq.0.) then + do concurrent (i=1:N) + x(i)=0. + enddo + epsn=0. + ierr=0 + return + end if +! +!----------------------------------------------------------------------- +! ****** Initialization. +!----------------------------------------------------------------------- +! + call ax (x,ap,N) +! + do concurrent (i=1:N) + r(i)=r(i)-ap(i) + p(i)=r(i) + enddo +! +! ****** Find the initial error norm. +! + call prec_inv (p) + rdotr=cgdot(r,p,N) +! + call ernorm (bdotb,rdotr,ierr) + if (ierr.ge.0) return +! +!----------------------------------------------------------------------- +! ****** Main iteration loop. +!----------------------------------------------------------------------- +! + do + ncg=ncg+1 +! + call ax (p,ap,N) +! + pdotap=cgdot(p,ap,N) + alphai=rdotr/pdotap +! 
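+! ****** Update the iterate and residual: x <- x + alpha*p and
+! ****** r <- r - alpha*A*p; AP is then reused to hold the
+! ****** preconditioned residual z = M_inv*r for the next beta.
+!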
+ do concurrent (i=1:N) + x(i)=x(i)+alphai*p(i) + r(i)=r(i)-alphai*ap(i) + ap(i)=r(i) + enddo +! + call prec_inv (ap) + rdotr_old=rdotr + rdotr=cgdot(r,ap,N) +! +! ****** Check for convergence. +! + call ernorm (bdotb,rdotr,ierr) + if (ierr.ge.0) exit +! + betai=rdotr/rdotr_old +! + do concurrent (i=1:N) + p(i)=betai*p(i)+ap(i) + enddo +! + enddo +! +!$omp target exit data map(delete:p,ap) +end subroutine +!####################################################################### +subroutine ernorm (bdotb,rdotr,ierr) +! +!----------------------------------------------------------------------- +! +! ****** This subroutine checks if the iterative solver has +! ****** converged or if the maximum allowed number of iterations, +! ****** NCGMAX, has been exceeded. +! +!----------------------------------------------------------------------- +! +! ****** Convergence is deemed to have occurred when: +! ****** +! ****** ||R||/||B|| .lt. EPSCG +! ****** +! ****** where ||R|| is the norm of the (preconditioned) +! ****** residual, ||B|| is the norm of the (preconditioned) +! ****** RHS, and EPSCG is the specified convergence criterion. +! +! ****** Set IERR=0 if the error is below the error criterion +! ****** (i.e., the solution has converged). +! ****** Set IERR=-1 if the error does not yet meet the error +! ****** criterion and the number of iterations is less than NCGMAX. +! ****** Set IERR=1 if the maximum number of iterations has +! ****** been exceeded without convergence. +! +!----------------------------------------------------------------------- +! +! ****** On input, BDOTB has the dot product of the RHS vector +! ****** with itself, weighted by the preconditioning matrix. +! ****** Similarly, RDOTR has the dot product of the residual vector +! ****** with itself, weighted by the preconditioning matrix. +! ****** This is used to normalize the error estimate. +! +!----------------------------------------------------------------------- +! + use number_types + use cgcom + use mpidefs + use vars +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ) :: bdotb,rdotr + integer :: ierr +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: one=1._r_typ +! +!----------------------------------------------------------------------- +! + real(r_typ) :: epssq +! +!----------------------------------------------------------------------- +! + ierr=-1 +! + epssq=rdotr/bdotb + epsn=sign(one,epssq)*sqrt(abs(epssq)) +! + if (ncghist.gt.0) then +! + if (mod(ncg,ncghist).eq.0) then + if (iamp0) then + if (ncg.eq.0) then + write (9,*) + write (9,*) '### Convergence information:' + write (*,*) + end if + write (*,100) ncg,epsn + FLUSH(OUTPUT_UNIT) + write (9,100) ncg,epsn + FLUSH(9) + 100 format (1x,'Iteration: ',i8,' Residual: ',1pe23.15) + end if + end if + end if +! +! ****** Check for convergence. +! + if (epsn.lt.epscg) then + if (ncghist.gt.0) then + if (iamp0) then + write (*,100) ncg,epsn + FLUSH(OUTPUT_UNIT) + write (9,*) + write (9,*) '### The CG solver has converged.' + write (9,100) ncg,epsn + FLUSH(9) + end if + end if + ierr=0 + else if (ncg.ge.ncgmax) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in ERNORM:' + write (*,*) '### Exceeded maximum number of iterations.' 
+ write (*,*) 'NCGMAX = ',ncgmax + write (*,*) 'EPSN = ',epsn + FLUSH(OUTPUT_UNIT) + write (9,*) + write (9,*) '### ERROR in ERNORM:' + write (9,*) '### Exceeded maximum number of iterations.' + write (9,*) 'NCGMAX = ',ncgmax + write (9,*) 'EPSN = ',epsn + FLUSH(9) + end if + ierr=1 + end if +! +end subroutine +!####################################################################### +subroutine alloc_pot3d_matrix_coefs +! +!----------------------------------------------------------------------- +! +! ****** Allocate the arrays in which the matrix coefficients +! ****** for the pot3d solve are stored. +! +!----------------------------------------------------------------------- +! + use matrix_storage_pot3d_solve + use cgcom + use local_dims +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + allocate (a(2:nrm1,2:ntm1,2:npm1,7)) + a=0. + allocate (a_i(N)) + a_i=0. +! + if (ifprec.eq.2) then + allocate (a_csr(M)) + allocate (lu_csr(M)) + allocate (lu_csr_ja(M)) + allocate (a_csr_ja(M)) + allocate (a_csr_ia(1+N)) + allocate (a_N1(N)) + allocate (a_N2(N)) + allocate (a_csr_d(N)) + allocate (a_csr_dptr(N)) + endif +! +end subroutine +!####################################################################### +subroutine dealloc_pot3d_matrix_coefs +! +!----------------------------------------------------------------------- +! +! ****** Deallocate the arrays in which the matrix coefficients +! ****** for the pot3d solve are stored. +! +!----------------------------------------------------------------------- +! + use matrix_storage_pot3d_solve + use cgcom + use cusparse_interface +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + deallocate (a) + deallocate (a_i) +!$omp target exit data map(delete:a,a_i) +! + if (ifprec.eq.2) then +#ifdef CUSPARSE + call unload_lusol_cusparse +!$omp target exit data map(delete:a_csr,a_csr_ja,a_csr_ia) +#endif + deallocate (a_csr) + deallocate (lu_csr) + deallocate (lu_csr_ja) + deallocate (a_csr_ia) + deallocate (a_csr_ja) + deallocate (a_csr_d) + deallocate (a_N1) + deallocate (a_N2) + deallocate (a_csr_dptr) + endif +! +end subroutine +!####################################################################### +subroutine load_matrix_pot3d_solve +! +!----------------------------------------------------------------------- +! +! ****** Load the matrix coefficients for the pot3d solve. +! +!----------------------------------------------------------------------- +! + use number_types + use matrix_storage_pot3d_solve + use local_dims + use local_mesh +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: i,j,k +! +!----------------------------------------------------------------------- +! +! ****** Set matrix coefs +! + do k=2,npm1 + do j=2,ntm1 + do i=2,nrm1 +! a*ps(i,j,k-1): + a(i,j,k,1)=-drh(i)*dth(j)*sth_i(j)*dp_i(k-1) +! a*ps(i,j-1,k): + a(i,j,k,2)=-drh(i)*dph(k)*st(j-1)*dt_i(j-1) +! a*ps(i-1,j,k): + a(i,j,k,3)=-sth(j)*dth(j)*dph(k)*r2(i-1)*dr_i(i-1) +! a*ps(i+1,j,k): + a(i,j,k,5)=-sth(j)*dth(j)*dph(k)*r2(i )*dr_i(i ) +! a*ps(i,j+1,k): + a(i,j,k,6)=-drh(i)*dph(k)*st(j )*dt_i(j ) +! a*ps(i,j,k+1): + a(i,j,k,7)=-drh(i)*dth(j)*sth_i(j)*dp_i(k ) +! +! 
a*ps(i,j,k): + a(i,j,k,4)=-(a(i,j,k,1)+a(i,j,k,2)+a(i,j,k,3)+ & + a(i,j,k,5)+a(i,j,k,6)+a(i,j,k,7)) + enddo + enddo + enddo +! +end subroutine +!####################################################################### +subroutine load_preconditioner_pot3d_solve +! +!----------------------------------------------------------------------- +! +! ****** Load the preconditioner for the pot3d solve. +! +!----------------------------------------------------------------------- +! + use number_types + use matrix_storage_pot3d_solve + use cgcom + use local_dims + use, intrinsic :: iso_c_binding + use cusparse_interface + use timing +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: one=1._r_typ +! +!----------------------------------------------------------------------- +! + integer :: i,j,k,icode,ii +! +!----------------------------------------------------------------------- +! + call timer_on +! + if (ifprec.eq.0) return +! + if (ifprec.eq.1) then +! +! ****** Diagonal scaling: +! + ii=0 + do k=2,npm1 + do j=2,ntm1 + do i=2,nrm1 + ii=ii+1 + a_i(ii)=real(one/a(i,j,k,4),r_typ_pc) + enddo + enddo + enddo +! + elseif (ifprec.eq.2) then +! +! ****** Convert A matrix into CSR format: +! + call diacsr (N,M,a,a_offsets,a_csr,a_csr_ja,a_csr_ia,a_csr_dptr) +#ifdef CUSPARSE + cN=N + cM=M +!$omp target enter data map(to:a_csr,a_csr_ja,a_csr_ia) +!$omp target data use_device_ptr(a_csr,a_csr_ja,a_csr_ia) + call load_lusol_cusparse (C_LOC(a_csr(1)), & + C_LOC(a_csr_ia(1)), & + C_LOC(a_csr_ja(1)),cN,cM) +!$omp end target data +#else +! +! ****** Overwrite CSR A with preconditioner L and U matrices: +! +! ****** Incomplete LU (ILU) +! + icode=0 + call ilu0 (N,M,a_csr,a_csr_ja,a_csr_ia,a_csr_dptr,icode) +! + if (icode.ne.0) then + print*, '### ERROR IN ILU FORMATION' + endif +! +! ****** Convert LU stored in A to LU matrix in optimized layout. +! + call lu2luopt (N,M,lu_csr,a_csr,a_csr_ia,a_csr_ja,lu_csr_ja, & + a_csr_dptr,a_N1,a_N2) +! +! ****** Store inverse of diagonal of LU matrix. +! + do i=1,N + a_csr_d(i)=one/a_csr(a_csr_dptr(i)) + enddo +! +#endif + endif +! + call timer_off (t_pc_load) +! +end subroutine +!####################################################################### +subroutine ilu0 (N,M,A,JA,IA,A_da,icode) +! +!----------------------------------------------------------- +! +! Set-up routine for ILU(0) preconditioner. This routine +! computes the L and U factors of the ILU(0) factorization +! of a general sparse matrix A stored in CSR format with +! 1-based indexing. Since +! L is unit triangular, the L and U factors can be stored +! as a single matrix which occupies the same storage as A. +! New ja and ia arrays are not needed for the LU matrix +! since the pattern of the LU matrix is identical with +! that of A. +! +! Original Author: Yousef Saad +! Iterative Methods for Sparse Linear Systems 2nd Ed. pg. 309 +! Modified by R.M. Caplan +! +!----------------------------------------------------------- +! INPUT: +! N : Dimension of matrix +! A, JA, IA : Sparse matrix in CSR sparse storage format +! A_da : Pointers to the diagonal elements in the CSR +! data structure luval +! +! OUTPUT: +! A : L/U matrices stored together. On return A, +! JA, and IA are the combined CSR data structure for +! the L and U factors. +! icode : Integer indicating error code on return: +! (0): Normal return. +! (k): Encountered a zero pivot at step k. 
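+!
+! In outline, the update performed below (an IKJ-variant
+! Gaussian elimination restricted to the sparsity pattern
+! of A) is:
+!   do i=2,N
+!     for each k < i in row i:   A(i,k) = A(i,k)/A(k,k)
+!       for each j > k in row k: A(i,j) = A(i,j) - A(i,k)*A(k,j)
+!   enddo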
+!------------------------------------------------------------
+!
+ use number_types
+ use number_types_pc
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ integer :: JA(M),IA(N+1),A_da(N),iw(N)
+ integer :: icode,N,M
+ real(r_typ_pc) :: A(M)
+!
+!-----------------------------------------------------------------------
+!
+ integer :: i,ik,kj,k,ij,IA_i,IA_ip1m1
+ real(r_typ_pc) :: Aik
+!
+!-----------------------------------------------------------------------
+!
+ icode=0
+! Initialize scratch index array:
+ iw(:)=0
+!
+ do i=2,N
+! Store index of (i,j) in A in scratch array of iw(j=1:N)
+! This allows lookup given a column index (j) in row (k)
+! to see if the column is in row (i).
+ IA_i =IA(i)
+ IA_ip1m1=IA(i+1)-1
+!
+ do ij=IA_i,IA_ip1m1
+ iw(JA(ij))=ij
+ enddo
+!
+! Loop from first element in row i to 1 less than diagonal elem:
+ do ik=IA_i,A_da(i)-1 !IA(i+1) !ik is index of (i,k) in A[]
+ k =JA(ik) !Actual column index in matrix (k)
+ Aik =A(ik)/A(A_da(k)) !Save Aik for next loop as an optim.
+ A(ik)=Aik
+!
+! Loop from 1 more than diag elem to last elem in row k:
+ do kj=A_da(k)+1,IA(k+1)-1 !kj is index of (k,j) in A[]
+! Get ij location from scratch array (if 0, no ij present)
+ ij=iw(JA(kj))
+ if (ij .ne. 0) then
+ A(ij)=A(ij)-Aik*A(kj)
+ endif
+ enddo
+ enddo
+!
+ if (A(ik).eq.0) then
+ icode=i
+ exit
+ endif
+!
+! Reset scratch index array:
+ do ij=IA_i,IA_ip1m1
+ iw(JA(ij))=0
+ enddo
+ enddo
+!
+end subroutine
+!#######################################################################
+subroutine lu2luopt (N,M,LU,A,IA,JA,LUJA,A_da,N1,N2)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Re-order elements of LU matrix in CSR format into custom,
+! ****** optimized format for use with lusol().
+! ****** (Eventually, this could be merged with the ilu0 and/or diacsr)
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use number_types_pc
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ integer :: N,M
+ integer :: JA(M),LUJA(M),IA(N+1),A_da(N)
+ integer :: N1(N),N2(N)
+ real(r_typ_pc) :: A(M),LU(M)
+!
+!-----------------------------------------------------------------------
+!
+ integer :: i,k,ii
+!
+!-----------------------------------------------------------------------
+!
+ ii=0
+!
+ do i=1,N
+ do k=IA(i),A_da(i)-1
+ ii=ii+1
+ LU(ii)=A(k)
+ LUJA(ii)=JA(k)
+ enddo
+!
+! Store k1 and k2 ranges for lusolve:
+!
+ N1(i)=A_da(i)-1-IA(i)
+ N2(i)=IA(i+1)-2-A_da(i)
+ enddo
+!
+ do i=N,1,-1
+ do k=A_da(i)+1,IA(i+1)-1
+ ii=ii+1
+ LU(ii)=A(k)
+ LUJA(ii)=JA(k)
+ enddo
+ enddo
+!
+end subroutine
+!#######################################################################
+subroutine diacsr (N,M,Adia,ioff,Acsr,JA,IA,Adptr)
+!
+!-----------------------------------------------------------------------
+!
+! *** DIACSR converts a solver matrix in a MAS-style
+!     diagonal format to standard compressed sparse row (CSR)
+!     including periodic coefficients when nproc_p=1.
+!
+! Author of original diacsr: Youcef Saad
+! Modifications for MAS: RM Caplan
+!
+! Input:
+!      N: Size of the matrix (NxN)
+!      M: Number of non-zero entries in matrix
+!         (computed with getM())
+!      Adia(N,IDIAG): The matrix in modified "DIA" format
+!      ioff(IDIAG): Offsets of the diagonals in A.
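+!                   The seven offsets follow the stencil
+!                   ordering used in load_matrix_pot3d_solve():
+!                   (k-1), (j-1), (i-1), diagonal, (i+1),
+!                   (j+1), (k+1).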
+!
+! Output:
+!      Acsr(M), JA(M), IA(N+1): The matrix A in CSR.
+!      Adptr(N): Pointers to diag elements in A,
+!                [e.g. A(i,i) == A(Adptr(i))]
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use number_types_pc
+ use local_dims
+ use mpidefs
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ integer, parameter :: IDIAG=7
+!
+!-----------------------------------------------------------------------
+!
+ real (r_typ_pc) :: Acsr(M)
+ real (r_typ) :: Adia(N,IDIAG)
+ integer :: N,M
+ integer :: Adptr(N)
+ integer :: IA(N+1)
+ integer :: JA(M)
+ integer :: ioff(IDIAG)
+!
+!-----------------------------------------------------------------------
+!
+ integer :: i,j,jj,mi,mj,mk,ko,x
+ integer :: ioffok(IDIAG)
+!
+!-----------------------------------------------------------------------
+!
+ x=0
+!
+ IA(1)=1
+ ko=1
+ i=0
+!
+ do mk=2,npm1
+ do mj=2,ntm1
+ do mi=2,nrm1
+! ********* Set index of value and column indices array:
+ i=i+1
+!
+! ********* Do not add coefs that multiply boundaries:
+! For each boundary, there is a sub-set of coefs in the
+! matrix row that should not be added.
+! This makes "local" matrices have no bc info
+!
+! ********* Reset "i-offset-ok-to-use-coef-jj" array:
+!
+ ioffok(:)=1
+!
+ if (mi.eq.2) then
+ ioffok(3)=0;
+ endif
+!
+ if (mi.eq.nrm1) then
+ ioffok(5)=0;
+ endif
+!
+ if (mj.eq.2) then
+ ioffok(2)=0;
+ endif
+!
+ if (mj.eq.ntm1) then
+ ioffok(6)=0;
+ endif
+!
+! ********* Eliminate periodic coefs in the case nproc_p>1
+!
+ if (nproc_p.gt.1) then
+ if (mk.eq.2) then
+ ioffok(1)=0
+ endif
+ if (mk.eq.npm1) then
+ ioffok(7)=0
+ endif
+ endif
+!
+! ********* To handle periodicity of phi in nproc_p=1 case:
+! We want CSR matrix to be in order so
+! have to sweep three times to avoid sorting:
+!
+! ********* Add periodic coefs of "right side":
+!
+ do jj=1,IDIAG
+ if (ioffok(jj).eq.1) then
+ j=i+ioff(jj)-x
+ if (j.gt.N-x) then
+ j=j-N
+ Acsr(ko)=real(Adia(i,jj),r_typ_pc)
+ JA(ko)=j
+ ko=ko+1
+ endif
+ endif
+ enddo
+!
+! ********* Now do non-periodic coefs:
+!
+ do jj=1,IDIAG
+ if (ioffok(jj).eq.1) then
+ j=i+ioff(jj)-x
+ if (j.ge.1.and.j.le.N-x) then
+! Store pointer to diagonal elements in A:
+ if (jj.eq.4) Adptr(i)=ko
+ Acsr(ko)=real(Adia(i,jj),r_typ_pc)
+ JA(ko)=j
+ ko=ko+1
+ endif
+ endif
+ enddo
+!
+! ********* Now do periodic coefs of "left side":
+!
+ do jj=1,IDIAG
+ if (ioffok(jj).eq.1) then
+ j=i+ioff(jj)-x
+ if (j.lt.1) then
+ j=N+j
+ Acsr(ko)=real(Adia(i,jj),r_typ_pc)
+ JA(ko)=j
+ ko=ko+1
+ endif
+ endif
+ enddo
+!
+! ********* Set row offset:
+!
+ IA(i+1)=ko-x
+ enddo
+ enddo
+ enddo
+!
+end subroutine
+!#######################################################################
+subroutine getM (N, ioff, M)
+!
+!-----------------------------------------------------------------------
+!
+! *** This routine computes the number of non-zeros in the
+!     solver matrix for use with allocating the matrices.
+!     See diacsr() for description of inputs.
+!
+! Output: M # of nonzeros.
+!
+!-----------------------------------------------------------------------
+!
+ use mpidefs
+ use local_dims
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ integer, parameter :: IDIAG=7
+ integer :: N,M,i,j,jj,ko,mi,mj,mk,x
+ integer :: ioff(IDIAG)
+ integer :: ioffok(IDIAG)
+!
+ x=0
+!
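+! *** Sweep the grid exactly as diacsr() does, counting the
+! *** coefficients that survive the boundary/periodicity masks
+! *** instead of storing them, so that M matches diacsr().
+!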
+ ko=1
+ i=0
+!
+ do mk=2,npm1
+ do mj=2,ntm1
+ do mi=2,nrm1
+!
+ ioffok(:)=1
+!
+ if (mi.eq.2) then
+ ioffok(3)=0;
+ endif
+!
+ if (mi.eq.nrm1) then
+ ioffok(5)=0;
+ endif
+!
+ if (mj.eq.2) then
+ ioffok(2)=0;
+ endif
+!
+ if (mj.eq.ntm1) then
+ ioffok(6)=0;
+ endif
+!
+! ********* Eliminate periodic coefs in the case nproc_p>1
+!
+ if (nproc_p.gt.1) then
+ if (mk.eq.2) then
+ ioffok(1)=0
+ endif
+ if (mk.eq.npm1) then
+ ioffok(7)=0
+ endif
+ endif
+!
+ do jj=1,IDIAG
+ if (ioffok(jj).eq.1) then
+ j=i+ioff(jj)-x
+ if (j.gt.N-x) then
+ ko=ko+1
+ endif
+ endif
+ enddo
+!
+ do jj=1,IDIAG
+ if (ioffok(jj).eq.1) then
+ j=i+ioff(jj)-x
+ if (j.ge.1.and.j.le.N-x) then
+ ko=ko+1
+ endif
+ endif
+ enddo
+!
+ do jj=1,IDIAG
+ if (ioffok(jj).eq.1) then
+ j=i+ioff(jj)-x
+ if (j.lt.1) then
+ ko=ko+1
+ endif
+ endif
+ enddo
+ enddo
+ enddo
+ enddo
+!
+! *** Save number of non-zeros of matrix:
+!
+ M=ko-1
+!
+end subroutine
+!#######################################################################
+subroutine ax (x,y,N)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Set y = A * x.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use local_dims
+ use fields, ONLY : x_ax
+ use timing
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ), parameter :: zero=0._r_typ
+!
+!-----------------------------------------------------------------------
+!
+ integer :: N
+ real(r_typ), dimension(N) :: x,y
+!
+!-----------------------------------------------------------------------
+!
+! ****** Expand X array to allow for boundary and seam values.
+!
+ call unpack_scalar (x_ax,x)
+!
+! ****** Set the boundary values of X.
+!
+ call set_boundary_points (x_ax,zero)
+!
+! ****** Seam along edges between processors.
+!
+ call seam_gen (x_ax,nr,nt,np)
+!
+! ****** Get the matrix-vector product.
+!
+ call timer_on
+!
+ call delsq (x_ax,y)
+!
+ call timer_off (t_ax)
+!
+end subroutine
+!#######################################################################
+subroutine prec_inv (x)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Apply preconditioner: x := M(inv) * x.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use number_types_pc
+ use cgcom
+ use solve_params
+ use matrix_storage_pot3d_solve
+ use, intrinsic :: iso_c_binding
+ use cusparse_interface
+ use timing
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ), dimension(N) :: x
+ real(r_typ_pc), dimension(N) :: x_32
+ integer :: i
+!
+!-----------------------------------------------------------------------
+!
+ if (ifprec.eq.0) return
+!
+ call timer_on
+!
+ if (ifprec.eq.1) then
+!
+! ****** Point-Jacobi (diagonal scaling):
+!
+ do concurrent (i=1:N)
+ x(i)=a_i(i)*x(i)
+ enddo
+!
+ elseif (ifprec.eq.2) then
+!
+! ****** ILU0 Partial-Block-Jacobi:
+!
+#ifdef CUSPARSE
+!$omp target data use_device_ptr(x)
+ call lusol_cusparse(C_LOC(x(1)))
+!$omp end target data
+!
+#else
+!$omp target update from(x)
+!
+! ****** Convert input array to single precision.
+!
+ do i=1,N
+ x_32(i) = real(x(i),r_typ_pc)
+ enddo
+!
+ call lusol (N,M,x_32,lu_csr,lu_csr_ja,a_N1,a_N2,a_csr_d)
+!
+! 
****** Convert result back to double precision.
+!
+ do i=1,N
+ x(i) = real(x_32(i),r_typ)
+ enddo
+!
+!$omp target update to(x)
+#endif
+ endif
+!
+ call timer_off (t_pc)
+!
+end subroutine
+!#######################################################################
+subroutine lusol (N,M,x,LU,LU_ja,N1,N2,LUd_i)
+!
+!-----------------------------------------------------------
+!
+! Performs a forward and a backward solve for the sparse system
+! (LU) x=y where LU is in an optimized custom CSR format
+! (see lu2luopt())
+!
+! For use where LU is an ILU or SSOR/SGS factorization.
+!
+! Author of original lusol: Yousef Saad
+! Iterative Methods for Sparse Linear Systems 2nd Ed. pg. 299
+!
+! Modified by RM Caplan to include optimized memory access
+! as described in
+! B. Smith, H. Zhang Inter. J. of High Perf. Comp. Appl.
+! Vol. 25 #4 pg. 386-391 (2011)
+!
+!-----------------------------------------------------------
+! PARAMETERS:
+! N     : Dimension of problem
+! x     : At input, x is rhs (y), at output x is the solution.
+! LU    : Values of the LU matrix. L and U are stored together in
+!         order of access in this routine.
+! LU_ja : Column indices of elements in LU.
+! N1    : Per-row counts (minus one) of strictly-lower (L)
+!         entries, from lu2luopt().
+! N2    : Per-row counts (minus one) of strictly-upper (U)
+!         entries, from lu2luopt().
+! LUd_i : Inverse diagonal elements of U
+!------------------------------------------------------------
+!
+ use number_types
+ use number_types_pc
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ_pc) :: x(N),LUd_i(N),LU(M)
+ integer :: N1(N),N2(N),LU_ja(M)
+ integer :: N,M
+!
+!-----------------------------------------------------------------------
+!
+ integer :: i,k,k1,k2
+!
+!-----------------------------------------------------------------------
+!
+! ****** FORWARD SOLVE: Solve L x'=y
+!
+ k2=0
+ do i=1,N
+! Compute x(i) := x(i) - sum L(i,j) * x(j)
+ k1=k2+1
+ k2=k1+N1(i)
+ do k=k1,k2
+ x(i)=x(i)-LU(k)*x(LU_ja(k))
+ enddo
+! Diagonal is always 1 for L so no division here is necessary.
+ enddo
+!
+! ****** BACKWARD SOLVE: Solve U x=x'
+!
+ do i=N,1,-1
+! Compute x(i) := x(i) - sum U(i,j) * x(j)
+ k1=k2+1
+ k2=k1+N2(i)
+ do k=k1,k2
+ x(i)=x(i)-LU(k)*x(LU_ja(k))
+ enddo
+! Compute x(i) := x(i) / U(i,i)
+ x(i)=x(i)*LUd_i(i)
+ enddo
+!
+end subroutine
+!#######################################################################
+subroutine unpack_scalar (s,x)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Unpack the solution vector x into the
+! ****** three-dimensional array s, leaving room for boundaries.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use local_dims
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ), dimension(nr,nt,np) :: s
+ real(r_typ), dimension(2:nrm1,2:ntm1,2:npm1) :: x
+!
+!-----------------------------------------------------------------------
+!
+ integer :: i,j,k
+!
+!-----------------------------------------------------------------------
+!
+ do concurrent (k=2:npm1, j=2:ntm1, i=2:nrm1)
+ s(i,j,k)=x(i,j,k)
+ enddo
+!
+end subroutine
+!#######################################################################
+subroutine delsq (x,y)
+!
+!-----------------------------------------------------------------------
+!
+! 
****** Set Y = - (dV * del-squared X) at the internal points. +! +!----------------------------------------------------------------------- +! + use number_types + use local_dims + use matrix_storage_pot3d_solve +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), dimension(nr,nt,np) :: x + real(r_typ), dimension(2:nrm1,2:ntm1,2:npm1) :: y +! +!----------------------------------------------------------------------- +! + integer :: i,j,k +! +!----------------------------------------------------------------------- +! + do concurrent (k=2:npm1, j=2:ntm1, i=2:nrm1) + y(i,j,k)=a(i,j,k,1)*x(i ,j ,k-1) & + +a(i,j,k,2)*x(i ,j-1,k ) & + +a(i,j,k,3)*x(i-1,j ,k ) & + +a(i,j,k,4)*x(i ,j ,k ) & + +a(i,j,k,5)*x(i+1,j ,k ) & + +a(i,j,k,6)*x(i ,j+1,k ) & + +a(i,j,k,7)*x(i ,j ,k+1) + enddo +! +end subroutine +!####################################################################### +subroutine set_boundary_points (x,vmask) +! +!----------------------------------------------------------------------- +! +! ****** Set boundary points of X at the physical boundaries. +! +!----------------------------------------------------------------------- +! + use number_types + use global_mesh + use local_dims + use local_mesh + use fields + use solve_params + use vars, ONLY : validation_run +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), dimension(nr,nt,np) :: x + real(r_typ) :: vmask,temp_sum0 +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: two=2._r_typ +! +!----------------------------------------------------------------------- +! + integer :: i,j,k +! +!----------------------------------------------------------------------- +! +! ****** Set X at the radial boundaries. +! + if (rb0) then + do concurrent (k=2:npm1, j=2:ntm1) + x( 1,j,k)=x(2,j,k)-vmask*br0(j,k)*dr1 + enddo + end if +! + if (rb1) then + if (validation_run) then + do concurrent (k=2:npm1, j=2:ntm1) + x(nr,j,k)=x(nrm1,j,k)+vmask*br1(j,k)*drn + enddo + else + do concurrent (k=2:npm1, j=2:ntm1) + x(nr,j,k)=pm_r1*x(nrm1,j,k) + enddo + end if + end if +! +! ****** If this processor does not contain any points at the +! ****** pole, return. +! + if (.not.(tb0.or.tb1)) return +! +! ****** Get the m=0 component of X at the poles. +! + if (tb0) then + do concurrent (i=1:nr) + sum0(i)=0 + enddo +!$omp target teams loop + do i=1,nr + temp_sum0=0. +!$omp loop reduction(+:temp_sum0) + do k=2,npm1 + temp_sum0=temp_sum0+x(i,2,k)*dph(k)*pl_i + enddo + sum0(i)=temp_sum0 + enddo + end if +! + if (tb1) then + do concurrent (i=1:nr) + sum1(i)=0 + enddo +!$omp target teams loop + do i=1,nr + temp_sum0=0. +!$omp loop reduction(+:temp_sum0) + do k=2,npm1 + temp_sum0=temp_sum0+x(i,ntm1,k)*dph(k)*pl_i + enddo + sum1(i)=temp_sum0 + enddo + end if +! +! ****** Sum over all processors. +! + call sum_over_phi (nr,sum0,sum1) +! +! ****** Set X to have only an m=0 component at the poles. +! + if (tb0) then + do concurrent (k=2:npm1, i=1:nr) + x(i,1,k)=two*sum0(i)-x(i,2,k) + enddo + end if +! + if (tb1) then + do concurrent (k=2:npm1, i=1:nr) + x(i,nt,k)=two*sum1(i)-x(i,ntm1,k) + enddo + end if +! + return +end subroutine +!####################################################################### +subroutine sum_over_phi (n,a0,a1) +! 
+!-----------------------------------------------------------------------
+!
+! ****** Sum the contribution over all processors in the phi
+! ****** dimension (only for processors with points on the poles).
+!
+! ****** The sum is performed for all N points in the vectors
+! ****** SUM0(N) and SUM1(N), at the North and South pole,
+! ****** respectively.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use local_dims
+ use mpidefs
+ use timing
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ integer :: n
+ real(r_typ), dimension(n) :: a0,a1
+!
+!-----------------------------------------------------------------------
+!
+! ****** MPI error return.
+!
+ integer :: ierr
+!
+!-----------------------------------------------------------------------
+!
+ call timer_on
+!
+!$omp target data use_device_ptr(a0,a1)
+ if (tb0) then
+ call MPI_Allreduce (MPI_IN_PLACE,a0,n,ntype_real, &
+ MPI_SUM,comm_phi,ierr)
+ end if
+!
+ if (tb1) then
+ call MPI_Allreduce (MPI_IN_PLACE,a1,n,ntype_real, &
+ MPI_SUM,comm_phi,ierr)
+ end if
+!$omp end target data
+!
+ call timer_off (c_sumphi)
+!
+end subroutine
+!#######################################################################
+subroutine zero_boundary_points (x)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Set the boundary points at the physical boundaries
+! ****** of X to zero.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use local_dims
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ), dimension(nr,nt,np) :: x
+!
+!-----------------------------------------------------------------------
+!
+ if (rb0) x( 1,:,:)=0.
+ if (rb1) x(nr,:,:)=0.
+ if (tb0) x(:, 1,:)=0.
+ if (tb1) x(:,nt,:)=0.
+!
+end subroutine
+!#######################################################################
+function cgdot (x,y,N)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Get the dot product of the vectors X and Y.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ integer :: N,i
+ real(r_typ) :: cgdot
+ real(r_typ), dimension(N) :: x,y
+!
+!-----------------------------------------------------------------------
+!
+ cgdot=0.
+!
+!$omp target teams loop reduction(+:cgdot)
+ do i=1,N
+ cgdot=cgdot+x(i)*y(i)
+ enddo
+!
+! ****** Sum over all the processors.
+!
+ call global_sum (cgdot)
+!
+ return
+end function
+!#######################################################################
+subroutine global_sum (x)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Overwrite X by its sum over all processors.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use mpidefs
+ use timing
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ) :: x
+!
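+! ****** On entry, X holds this rank's partial value (e.g. the
+! ****** local dot product from CGDOT); on exit, the global sum.
+!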
+!----------------------------------------------------------------------- +! +! ****** MPI error return. +! + integer :: ierr +! +!----------------------------------------------------------------------- +! + call timer_on +! +! ****** Take the sum over all the processors. +! + call MPI_Allreduce (MPI_IN_PLACE,x,1,ntype_real, & + MPI_SUM,comm_all,ierr) +! + call timer_off (c_cgdot) +! +end subroutine +!####################################################################### +subroutine seam_setup +! +!----------------------------------------------------------------------- +! +! ****** Allocate seam buffers. +! ****** Doing this here avoids GPU allocate/deallocates. +! +!----------------------------------------------------------------------- +! + use number_types + use cgcom + use local_mesh + use local_dims +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + allocate(sbuf_rt1(nr,nt)) + allocate(sbuf_rt2(nr,nt)) + allocate(sbuf_tp1(nt,np)) + allocate(sbuf_tp2(nt,np)) + allocate(sbuf_rp1(nr,np)) + allocate(sbuf_rp2(nr,np)) +! + allocate(rbuf_rt1(nr,nt)) + allocate(rbuf_rt2(nr,nt)) + allocate(rbuf_tp1(nt,np)) + allocate(rbuf_tp2(nt,np)) + allocate(rbuf_rp1(nr,np)) + allocate(rbuf_rp2(nr,np)) +! +!$omp target enter data map(alloc:sbuf_rt1,sbuf_rt2) +!$omp target enter data map(alloc:sbuf_tp1,sbuf_tp2) +!$omp target enter data map(alloc:sbuf_rp1,sbuf_rp2) +!$omp target enter data map(alloc:rbuf_rt1,rbuf_rt2) +!$omp target enter data map(alloc:rbuf_tp1,rbuf_tp2) +!$omp target enter data map(alloc:rbuf_rp1,rbuf_rp2) +! +end subroutine +!####################################################################### +subroutine seam_hhh (a) +! +!----------------------------------------------------------------------- +! +! ****** Seam the boundary points of 3D (r,t,p) array A between +! ****** adjacent processors. +! +! ****** This routine assumes that there is a two-point +! ****** overlap between processors in each dimension. +! +!----------------------------------------------------------------------- +! +! ****** This version uses non-blocking MPI sends and receives +! ****** whenever possible in order to overlap communications. +! +!----------------------------------------------------------------------- +! + use number_types + use mpidefs + use timing + use local_mesh + use local_dims + use cgcom +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), dimension(nr,nt,np) :: a +! +!----------------------------------------------------------------------- +! +! ****** MPI error return. +! + integer :: ierr +! +! ****** MPI tag for MPI_ISEND and MPI_IRECV (not tagged). +! + integer :: tag=0 +! +!----------------------------------------------------------------------- +! + integer :: lbuf,i,j + integer :: reqs(4) +! +!----------------------------------------------------------------------- +! + call timer_on +! +! ****** Seam the third (periodic) dimension. +! ****** Since halo data is stride-1, no need for buffers. +! + lbuf=nr*nt +! +!$omp target data use_device_ptr(a) + call MPI_Isend (a(:,:,np-1),lbuf,ntype_real,iproc_pp,tag, & + comm_all,reqs(1),ierr) +! + call MPI_Isend (a(:,:, 2),lbuf,ntype_real,iproc_pm,tag, & + comm_all,reqs(2),ierr) +! + call MPI_Irecv (a(:,:, 1),lbuf,ntype_real,iproc_pm,tag, & + comm_all,reqs(3),ierr) +! 
+ call MPI_Irecv (a(:,:,np),lbuf,ntype_real,iproc_pp,tag, & + comm_all,reqs(4),ierr) +! + call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) +!$omp end target data +! +! ****** Seam the first dimension. +! + if (nproc_r.gt.1) then +! + lbuf=nt*np +! + do concurrent (j=1:np, i=1:nt) + sbuf_tp1(i,j)=a(nr-1,i,j) + sbuf_tp2(i,j)=a( 2,i,j) + enddo +! +!$omp target data use_device_ptr(sbuf_tp1,sbuf_tp2,rbuf_tp1,rbuf_tp2) + call MPI_Isend (sbuf_tp1,lbuf,ntype_real,iproc_rp,tag, & + comm_all,reqs(1),ierr) +! + call MPI_Isend (sbuf_tp2,lbuf,ntype_real,iproc_rm,tag, & + comm_all,reqs(2),ierr) +! + call MPI_Irecv (rbuf_tp1,lbuf,ntype_real,iproc_rm,tag, & + comm_all,reqs(3),ierr) +! + call MPI_Irecv (rbuf_tp2,lbuf,ntype_real,iproc_rp,tag, & + comm_all,reqs(4),ierr) +! + call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) +!$omp end target data +! + if (iproc_rm.ne.MPI_PROC_NULL) then + do concurrent (j=1:np, i=1:nt) + a( 1,i,j)=rbuf_tp1(i,j) + enddo + end if +! + if (iproc_rp.ne.MPI_PROC_NULL) then + do concurrent (j=1:np, i=1:nt) + a(nr,i,j)=rbuf_tp2(i,j) + enddo + end if + end if +! +! ****** Seam the second dimension. +! + if (nproc_t.gt.1) then +! + lbuf=nr*np +! + do concurrent (j=1:np, i=1:nr) + sbuf_rp1(i,j)=a(i,nt-1,j) + sbuf_rp2(i,j)=a(i, 2,j) + enddo +! +!$omp target data use_device_ptr(sbuf_rp1,sbuf_rp2,rbuf_rp1,rbuf_rp2) + call MPI_Isend (sbuf_rp1,lbuf,ntype_real,iproc_tp,tag, & + comm_all,reqs(1),ierr) +! + call MPI_Isend (sbuf_rp2,lbuf,ntype_real,iproc_tm,tag, & + comm_all,reqs(2),ierr) +! + call MPI_Irecv (rbuf_rp1,lbuf,ntype_real,iproc_tm,tag, & + comm_all,reqs(3),ierr) +! + call MPI_Irecv (rbuf_rp2,lbuf,ntype_real,iproc_tp,tag, & + comm_all,reqs(4),ierr) +! + call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) +!$omp end target data +! + if (iproc_tm.ne.MPI_PROC_NULL) then + do concurrent (j=1:np, i=1:nr) + a(i, 1,j)=rbuf_rp1(i,j) + enddo + end if +! + if (iproc_tp.ne.MPI_PROC_NULL) then + do concurrent (j=1:np, i=1:nr) + a(i,nt,j)=rbuf_rp2(i,j) + enddo + end if +! + end if +! + call timer_off (c_seam) +! +end subroutine +!####################################################################### +subroutine seam_gen (a,n1,n2,n3) +! +!----------------------------------------------------------------------- +! +! ****** Seam the boundary points of 3D (r,t,p) array A between +! ****** adjacent processors. +! +! ****** This routine assumes that there is a two-point +! ****** overlap between processors in each dimension. +! +!----------------------------------------------------------------------- +! +! ****** This version uses non-blocking MPI sends and receives +! ****** whenever possible in order to overlap communications. +! +!----------------------------------------------------------------------- +! + use number_types + use mpidefs + use timing +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), dimension(n1,n2,n3) :: a +! +!----------------------------------------------------------------------- +! + real(r_typ), dimension(n2,n3) :: sbuf11,rbuf11 + real(r_typ), dimension(n2,n3) :: sbuf12,rbuf12 + real(r_typ), dimension(n1,n3) :: sbuf21,rbuf21 + real(r_typ), dimension(n1,n3) :: sbuf22,rbuf22 + real(r_typ), dimension(n1,n2) :: sbuf31,rbuf31 + real(r_typ), dimension(n1,n2) :: sbuf32,rbuf32 +! +!----------------------------------------------------------------------- +! +! ****** MPI error return. +! + integer :: ierr +! +! 
****** MPI tag for MPI_ISEND and MPI_IRECV (not tagged). +! + integer :: tag=0 +! +!----------------------------------------------------------------------- +! + integer :: lbuf,i,j + integer :: n1,n2,n3 + integer :: reqs(4) +! +!----------------------------------------------------------------------- +! + call timer_on +! +! ****** Seam the third (periodic) dimension. +! +!$omp target enter data map(alloc:sbuf31,sbuf32,rbuf31,rbuf32) +! + lbuf=n1*n2 +! + do concurrent (j=1:n2, i=1:n1) + sbuf31(i,j)=a(i,j,n3-1) + sbuf32(i,j)=a(i,j, 2) + enddo +! +!$omp target update from(sbuf31,sbuf32) + call MPI_Isend (sbuf31,lbuf,ntype_real,iproc_pp,tag, & + comm_all,reqs(1),ierr) +! + call MPI_Isend (sbuf32,lbuf,ntype_real,iproc_pm,tag, & + comm_all,reqs(2),ierr) +! + call MPI_Irecv (rbuf31,lbuf,ntype_real,iproc_pm,tag, & + comm_all,reqs(3),ierr) +! + call MPI_Irecv (rbuf32,lbuf,ntype_real,iproc_pp,tag, & + comm_all,reqs(4),ierr) +! + call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) +!$omp target update to(rbuf31,rbuf32) +! + do concurrent (j=1:n2, i=1:n1) + a(i,j, 1)=rbuf31(i,j) + a(i,j,n3)=rbuf32(i,j) + enddo +! +!$omp target exit data map(delete:sbuf31,sbuf32,rbuf31,rbuf32) +! +! ****** Seam the first dimension. +! + if (nproc_r.gt.1) then +! +!$omp target enter data map(alloc:sbuf11,sbuf12,rbuf11,rbuf12) +! + lbuf=n2*n3 +! + do concurrent (j=1:n3, i=1:n2) + sbuf11(i,j)=a(n1-1,i,j) + sbuf12(i,j)=a( 2,i,j) + enddo +! +!$omp target update from(sbuf11,sbuf12) + call MPI_Isend (sbuf11,lbuf,ntype_real,iproc_rp,tag, & + comm_all,reqs(1),ierr) +! + call MPI_Isend (sbuf12,lbuf,ntype_real,iproc_rm,tag, & + comm_all,reqs(2),ierr) +! + call MPI_Irecv (rbuf11,lbuf,ntype_real,iproc_rm,tag, & + comm_all,reqs(3),ierr) +! + call MPI_Irecv (rbuf12,lbuf,ntype_real,iproc_rp,tag, & + comm_all,reqs(4),ierr) +! + call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) +!$omp target update to(rbuf11,rbuf12) +! + if (iproc_rm.ne.MPI_PROC_NULL) then + do concurrent (j=1:n3, i=1:n2) + a( 1,i,j)=rbuf11(i,j) + enddo + end if +! + if (iproc_rp.ne.MPI_PROC_NULL) then + do concurrent (j=1:n3, i=1:n2) + a(n1,i,j)=rbuf12(i,j) + enddo + end if +! +!$omp target exit data map(delete:sbuf11,sbuf12,rbuf11,rbuf12) + end if +! +! ****** Seam the second dimension. +! + if (nproc_t.gt.1) then +! +!$omp target enter data map(alloc:sbuf21,sbuf22,rbuf21,rbuf22) +! + lbuf=n1*n3 +! + do concurrent (j=1:n3, i=1:n1) + sbuf21(i,j)=a(i,n2-1,j) + sbuf22(i,j)=a(i, 2,j) + enddo +! +!$omp target update from(sbuf21,sbuf22) + call MPI_Isend (sbuf21,lbuf,ntype_real,iproc_tp,tag, & + comm_all,reqs(1),ierr) +! + call MPI_Isend (sbuf22,lbuf,ntype_real,iproc_tm,tag, & + comm_all,reqs(2),ierr) +! + call MPI_Irecv (rbuf21,lbuf,ntype_real,iproc_tm,tag, & + comm_all,reqs(3),ierr) +! + call MPI_Irecv (rbuf22,lbuf,ntype_real,iproc_tp,tag, & + comm_all,reqs(4),ierr) +! + call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) +!$omp target update to(rbuf21,rbuf22) +! + if (iproc_tm.ne.MPI_PROC_NULL) then + do concurrent (j=1:n3, i=1:n1) + a(i, 1,j)=rbuf21(i,j) + enddo + end if +! + if (iproc_tp.ne.MPI_PROC_NULL) then + do concurrent (j=1:n3, i=1:n1) + a(i,n2,j)=rbuf22(i,j) + enddo + end if +! +!$omp target exit data map(delete:sbuf21,sbuf22,rbuf21,rbuf22) + end if +! + call timer_off (c_seam) +! +end subroutine +!####################################################################### +subroutine write_solution +! +!----------------------------------------------------------------------- +! +! ****** Write the global solution. +! 
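+! ****** Each field is gathered onto processor IPROC0 with
+! ****** assemble_array() and then written out with wrhdf_3d().
+!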
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use global_dims
+ use global_mesh
+ use fields
+ use vars
+ use solve_params
+ use mpidefs
+ use decomposition
+ use assemble_array_interface
+ use timing
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+! ****** Declaration for the global arrays.
+! ****** These arrays are only allocated on processor IPROC0.
+!
+ real(r_typ), dimension(:,:,:), allocatable :: phi_g
+ real(r_typ), dimension(:,:,:), allocatable :: br_g
+ real(r_typ), dimension(:,:,:), allocatable :: bt_g
+ real(r_typ), dimension(:,:,:), allocatable :: bp_g
+!
+!-----------------------------------------------------------------------
+!
+ integer :: ierr
+ character(256) :: fname
+!
+!-----------------------------------------------------------------------
+!
+ call timer_on
+!
+! ****** Potential.
+!
+ if (phifile.ne.'') then
+!$omp target update from(phi)
+!
+! ****** Allocate the global array PHI_G (on processor IPROC0).
+!
+ if (iamp0) then
+ allocate (phi_g(nr_g,nt_g,np_g))
+ else
+ allocate (phi_g(1,1,1))
+ end if
+!
+! ****** Assemble the global PHI array.
+!
+ call assemble_array (map_rh,map_th,map_ph,phi,phi_g)
+!
+ fname=phifile
+!
+! ****** Write out the potential to a file.
+!
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### COMMENT from WRITE_SOLUTION:'
+ write (*,*)
+ write (*,*) 'Writing the potential to file: ',trim(fname)
+ call wrhdf_3d (fname,.true.,nr_g,nt_g,np_g, &
+ phi_g,rh_g,th_g,ph_g,hdf32,ierr)
+ end if
+!
+ deallocate (phi_g)
+!
+ end if
+!
+! ****** Br.
+!
+ if (brfile.ne.'') then
+!$omp target update from(br)
+!
+ fname=brfile
+!
+ if (iamp0) then
+ allocate (br_g(nrm1_g,nt_g,np_g))
+ else
+ allocate (br_g(1,1,1))
+ end if
+!
+! ****** Assemble the global BR array.
+!
+ call assemble_array (map_rm,map_th,map_ph,br,br_g)
+!
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### COMMENT from WRITE_SOLUTION:'
+ write (*,*)
+ write (*,*) 'Writing Br to file: ',trim(fname)
+ call wrhdf_3d (fname,.true.,nrm1_g,nt_g,np_g, &
+ br_g,r_g,th_g,ph_g,hdf32,ierr)
+ end if
+!
+ deallocate (br_g)
+!
+ end if
+!
+! ****** Bt.
+!
+ if (btfile.ne.'') then
+!$omp target update from(bt)
+!
+ fname=btfile
+!
+ if (iamp0) then
+ allocate (bt_g(nr_g,ntm1_g,np_g))
+ else
+ allocate (bt_g(1,1,1))
+ end if
+!
+! ****** Assemble the global BT array.
+!
+ call assemble_array (map_rh,map_tm,map_ph,bt,bt_g)
+!
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### COMMENT from WRITE_SOLUTION:'
+ write (*,*)
+ write (*,*) 'Writing Bt to file: ',trim(fname)
+ call wrhdf_3d (fname,.true.,nr_g,ntm1_g,np_g, &
+ bt_g,rh_g,t_g,ph_g,hdf32,ierr)
+!
+ end if
+!
+ deallocate (bt_g)
+!
+ end if
+!
+! ****** Bp.
+!
+ if (bpfile.ne.'') then
+!$omp target update from(bp)
+!
+ fname=bpfile
+!
+ if (iamp0) then
+ allocate (bp_g(nr_g,nt_g,npm1_g))
+ else
+ allocate (bp_g(1,1,1))
+ end if
+!
+! ****** Assemble the global BP array.
+!
+ call assemble_array (map_rh,map_th,map_pm,bp,bp_g)
+!
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### COMMENT from WRITE_SOLUTION:'
+ write (*,*)
+ write (*,*) 'Writing Bp to file: ',trim(fname)
+ call wrhdf_3d (fname,.true.,nr_g,nt_g,npm1_g, &
+ bp_g,rh_g,th_g,p_g,hdf32,ierr)
+!
+ end if
+!
+ deallocate (bp_g)
+!
+ end if
+!
+ call timer_off (t_io)
+!
+end subroutine
+!#######################################################################
+subroutine getb
+!
+!-----------------------------------------------------------------------
+!
+! ****** Calculate B from grad-phi.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use global_dims
+ use global_mesh
+ use vars
+ use fields
+ use local_dims
+ use local_mesh
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ integer :: i,j,k
+!
+!-----------------------------------------------------------------------
+!
+!$omp target enter data map(alloc:br,bt,bp)
+!
+! ****** Get Br.
+!
+ do concurrent (k=1:np, j=1:nt, i=1:nrm1)
+ br(i,j,k)=(phi(i+1,j,k)-phi(i,j,k))/dr(i)
+ enddo
+!
+! ****** Get Bt.
+!
+ do concurrent (k=1:np, j=1:ntm1, i=1:nr)
+ bt(i,j,k)=(phi(i,j+1,k)-phi(i,j,k))/(rh(i)*dt(j))
+ enddo
+!
+! ****** Get Bp.
+!
+ do concurrent (k=1:npm1, j=1:nt, i=1:nr)
+ bp(i,j,k)=(phi(i,j,k+1)-phi(i,j,k))/(rh(i)*sth(j)*dp(k))
+ enddo
+!
+end subroutine
+!#######################################################################
+subroutine magnetic_energy
+!
+!-----------------------------------------------------------------------
+!
+! ****** Calculate magnetic energy from B.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use global_dims
+ use global_mesh
+ use vars
+ use fields
+ use mpidefs
+ use local_dims
+ use local_mesh
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ), parameter :: half=.5_r_typ
+!
+!-----------------------------------------------------------------------
+!
+ integer :: i,j,k,ierr
+ real(r_typ) :: brav,btav,bpav,dv
+ real(r_typ) :: wr,wt,wp
+ real(r_typ), dimension(3) :: w
+ character(32) :: fmtstr
+!
+!-----------------------------------------------------------------------
+!
+ if (hdf32) then
+ fmtstr="(A, ES14.8)"
+ else
+ fmtstr="(A,ES22.16)"
+ end if
+!
+ wr=0.
+ wt=0.
+ wp=0.
+!$omp target teams loop collapse(3) reduction(+:wr,wt,wp)
+ do k=2,npm1
+ do j=2,ntm1
+ do i=2,nrm1
+ dv=rh(i)**2*drh(i)*dth(j)*sth(j)*dph(k)
+ brav=half*(br(i,j,k)+br(i-1,j,k))
+ btav=half*(bt(i,j,k)+bt(i,j-1,k))
+ bpav=half*(bp(i,j,k)+bp(i,j,k-1))
+ wr=wr+half*brav**2*dv
+ wt=wt+half*btav**2*dv
+ wp=wp+half*bpav**2*dv
+ enddo
+ enddo
+ enddo
+!
+! ****** Sum up all processors into final values and print.
+!
+ w(1)=wr
+ w(2)=wt
+ w(3)=wp
+ call MPI_Allreduce(MPI_IN_PLACE,w,3,ntype_real, &
+ MPI_SUM,comm_all,ierr)
+!
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### COMMENT from MAGNETIC_ENERGY:'
+ write (*,*) '### Magnetic energy diagnostic:'
+ write (*,*)
+ write (*,trim(fmtstr)) 'Energy in Br**2 = ',w(1)
+ write (*,trim(fmtstr)) 'Energy in Bt**2 = ',w(2)
+ write (*,trim(fmtstr)) 'Energy in Bp**2 = ',w(3)
+ write (*,trim(fmtstr)) 'Magnetic energy = ',SUM(w)
+ write (9,*)
+ write (9,*) '### COMMENT from MAGNETIC_ENERGY:'
+ write (9,*) '### Magnetic energy diagnostic:'
+ write (9,*)
+ write (9,trim(fmtstr)) 'Energy in Br**2 = ',w(1)
+ write (9,trim(fmtstr)) 'Energy in Bt**2 = ',w(2)
+ write (9,trim(fmtstr)) 'Energy in Bp**2 = ',w(3)
+ write (9,trim(fmtstr)) 'Magnetic energy = ',SUM(w)
+ end if
+!
+end subroutine
+!#######################################################################
+subroutine assemble_array (map_r,map_t,map_p,a,a_g)
+!
+!-----------------------------------------------------------------------
+!
+! 
****** Assemble a global array (into A_G) on processor IPROC0 by
+! ****** fetching the local sections (A) from all the processors.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use decomposition
+ use mpidefs
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ type(map_struct), dimension(0:nproc-1) :: map_r,map_t,map_p
+ real(r_typ), dimension(:,:,:) :: a,a_g
+!
+!-----------------------------------------------------------------------
+!
+! ****** Storage for the buffers.
+!
+ integer :: lbuf,lsbuf
+ real(r_typ), dimension(:), allocatable :: sbuf
+ real(r_typ), dimension(:), allocatable :: rbuf
+!
+!-----------------------------------------------------------------------
+!
+ integer :: tag=0
+ integer :: irank,l1,l2,l3,i,j,k,ii
+ integer :: i0,j0,k0,i1,j1,k1
+ integer :: i0g,j0g,k0g
+ integer :: ierr
+!
+!-----------------------------------------------------------------------
+!
+ l1=map_r(iproc)%n
+ l2=map_t(iproc)%n
+ l3=map_p(iproc)%n
+ lsbuf=l1*l2*l3
+!
+ i0=map_r(iproc)%i0
+ i1=map_r(iproc)%i1
+ j0=map_t(iproc)%i0
+ j1=map_t(iproc)%i1
+ k0=map_p(iproc)%i0
+ k1=map_p(iproc)%i1
+!
+! ****** Extract 1D array of non-overlapping values from local array.
+!
+ allocate (sbuf(lsbuf))
+!
+ sbuf=reshape(a(i0:i1,j0:j1,k0:k1),(/lsbuf/))
+!
+! ****** If proc0, receive/store local arrays into global array.
+!
+ if (iamp0) then
+ do irank=0,nproc-1
+!
+ l1=map_r(irank)%n
+ l2=map_t(irank)%n
+ l3=map_p(irank)%n
+ lbuf=l1*l2*l3
+!
+ i0g=map_r(irank)%offset
+ j0g=map_t(irank)%offset
+ k0g=map_p(irank)%offset
+!
+! ****** If proc0 is the current rank in loop, simply copy local array.
+ if (iproc==irank) then
+ do k=1,l3
+ do j=1,l2
+ do i=1,l1
+ ii=l2*l1*(k-1)+l1*(j-1)+i
+ a_g(i0g+i-1,j0g+j-1,k0g+k-1)=sbuf(ii)
+ enddo
+ enddo
+ enddo
+! ****** Otherwise receive data:
+ else
+ allocate (rbuf(lbuf))
+ call MPI_Recv (rbuf,lbuf,ntype_real,irank,tag, &
+ comm_all,MPI_STATUS_IGNORE,ierr)
+ do k=1,l3
+ do j=1,l2
+ do i=1,l1
+ ii=l2*l1*(k-1)+l1*(j-1)+i
+ a_g(i0g+i-1,j0g+j-1,k0g+k-1)=rbuf(ii)
+ enddo
+ enddo
+ enddo
+ deallocate(rbuf)
+ end if
+ enddo
+ else
+!
+! ****** Send local array to iproc0.
+!
+ call MPI_Ssend (sbuf,lsbuf,ntype_real,iproc0,tag,comm_all,ierr)
+!
+ end if
+ deallocate (sbuf)
+!
+end subroutine
+!#######################################################################
+subroutine timer_on
+!
+!-----------------------------------------------------------------------
+!
+! ****** Push an entry onto the timing stack and initialize
+! ****** a timing event.
+!
+!-----------------------------------------------------------------------
+!
+! ****** This routine can be called in a nested way to measure
+! ****** multiple timing events. Calls to TIMER_ON and TIMER_OFF
+! ****** need to be nested like do-loops in FORTRAN.
+!
+!-----------------------------------------------------------------------
+!
+ use mpidefs
+ use timer
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ if (istack.ge.nstack) then
+ write (*,*)
+ write (*,*) '### WARNING from TIMER_ON:'
+ write (*,*) '### Timing stack depth exceeded.'
+ write (*,*) 'This may signal an incorrect nesting of '// &
+ 'TIMER_ON/TIMER_OFF calls.'
+ write (*,*) 'Timing information will not be valid.'
+ return
+ else
+ istack=istack+1
+ end if
+!
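+! ****** Record the start time for this stack level; the matching
+! ****** TIMER_OFF call pops it and accumulates the elapsed time.
+!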
+ tstart(istack)=MPI_Wtime() +! +end subroutine +!####################################################################### +subroutine timer_off (tused) +! +!----------------------------------------------------------------------- +! +! ****** Increment the CPU time used since the call to TIMER_ON +! ****** in variable TUSED, and pop an entry off the timing +! ****** stack. +! +!----------------------------------------------------------------------- +! +! ****** This routine can be called in a nested way to measure +! ****** multiple timing events. Calls to TIMER_ON and TIMER_OFF +! ****** need to be nested like do-loops in FORTRAN. +! +!----------------------------------------------------------------------- +! + use number_types + use mpidefs + use timer +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ) :: tused +! +!----------------------------------------------------------------------- +! + if (istack.le.0) then + write (*,*) + write (*,*) '### WARNING from TIMER_OFF:' + write (*,*) '### Timing stack cannot be popped.' + write (*,*) 'This may signal an incorrect nesting of '// & + 'TIMER_ON/TIMER_OFF calls.' + write (*,*) 'Timing information will not be valid.' + return + else + istack=istack-1 + end if +! + tused=tused+MPI_Wtime()-tstart(istack+1) +! +end subroutine +!####################################################################### +subroutine write_timing +! +!----------------------------------------------------------------------- +! +! ****** Write out the timing info. +! +!----------------------------------------------------------------------- +! + use number_types + use mpidefs + use timing +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! +! ****** Timing buffers. +! + integer, parameter :: lbuf=10 + real(r_typ), dimension(lbuf) :: sbuf + real(r_typ), dimension(lbuf,0:nproc-1) :: tbuf +! +! ****** Timing statistics. +! + real(r_typ), dimension(lbuf) :: tmin,tmax,tavg,tsdev +! +!----------------------------------------------------------------------- +! + integer :: ierr,irank + real(r_typ) :: t_tot_avg,c_tot_avg,c_tot +! + character(80) :: tfile='timing.out' +! +!----------------------------------------------------------------------- +! +! ****** Gather the timing information for all processors into TBUF. +! + sbuf(1)=t_solve + sbuf(2)=t_startup + sbuf(3)=t_io + sbuf(4)=t_ax + sbuf(5)=t_pc_load + sbuf(6)=t_pc + sbuf(7)=c_seam + sbuf(8)=c_cgdot + sbuf(9)=c_sumphi + sbuf(10)=t_wall +! + call MPI_Allgather (sbuf,lbuf,ntype_real, & + tbuf,lbuf,ntype_real,comm_all,ierr) +! +! ****** Calculate the timing statistics. +! + tavg=sum(tbuf,dim=2)/nproc + tmin=minval(tbuf,dim=2) + tmax=maxval(tbuf,dim=2) +! + tsdev(:)=0. + do irank=0,nproc-1 + tsdev(:)=tsdev(:)+(tbuf(:,irank)-tavg(:))**2 + enddo + tsdev(:)=sqrt(tsdev(:)/nproc) +! + t_tot_avg=tavg(10) + c_tot_avg=tavg(7)+tavg(8)+tavg(9) +! + if (iamp0) then +! + call ffopen (1,tfile,'rw',ierr) +! + if (ierr.ne.0) then + write (*,*) + write (*,*) '### WARNING from WRITE_TIMING:' + write (*,*) '### Could not create the timing file.' + write (*,*) 'File name: ',trim(tfile) + end if +! + do irank=0,nproc-1 + c_tot=tbuf(7,irank)+tbuf(8,irank)+tbuf(9,irank) + write (1,*) + write (1,100) + write (1,*) + write (1,*) 'Processor id = ',irank + write (1,*) + write (1,200) 'Comm. 
time in SEAM = ',tbuf(7,irank) + write (1,200) 'Comm. time in CGDOT = ',tbuf(8,irank) + write (1,200) 'Comm. time in SUMPHI = ',tbuf(9,irank) + write (1,*) '------------------------------------' + write (1,200) 'Total comm. time = ',c_tot + write (1,*) + write (1,200) 'Time used in start-up = ',tbuf(2,irank) + write (1,200) 'Time used in i/o = ',tbuf(3,irank) + write (1,200) 'Time used in POTFLD = ',tbuf(1,irank) + write (1,200) 'Time used in AX = ',tbuf(4,irank) + write (1,200) 'Time used in PCLOAD = ',tbuf(5,irank) + write (1,200) 'Time used in PC = ',tbuf(6,irank) + write (1,*) '------------------------------------' + write (1,200) 'Total time used = ',tbuf(10,irank) + 100 format (80('-')) + 200 format (1x,a,f12.6) + enddo + write (1,*) + write (1,100) +! + write (1,*) + write (1,*) 'Average times:' + write (1,*) '-------------' + write (1,*) + write (1,300) ' Avg Min Max S. Dev' + write (1,300) ' --- --- --- ------' + write (1,400) 'Avg comm. time in SEAM = ', & + tavg(7),tmin(7),tmax(7),tsdev(7) + write (1,400) 'Avg comm. time in CGDOT = ', & + tavg(8),tmin(8),tmax(8),tsdev(8) + write (1,400) 'Avg comm. time in SUMPHI = ', & + tavg(9),tmin(9),tmax(9),tsdev(9) + write (1,400) 'Avg time used in start-up = ', & + tavg(2),tmin(2),tmax(2),tsdev(2) + write (1,400) 'Avg time used in i/o = ', & + tavg(3),tmin(3),tmax(3),tsdev(3) + write (1,400) 'Avg time used in POTFLD = ', & + tavg(1),tmin(1),tmax(1),tsdev(1) + write (1,400) 'Avg time used in AX = ', & + tavg(4),tmin(4),tmax(4),tsdev(4) + write (1,400) 'Avg time used in PCLOAD = ', & + tavg(5),tmin(5),tmax(5),tsdev(5) + write (1,400) 'Avg time used in PC = ', & + tavg(6),tmin(6),tmax(6),tsdev(6) + write (1,400) 'Avg total time = ', & + tavg(10),tmin(10),tmax(10),tsdev(10) + 300 format (1x,33x,a) + 400 format (1x,a,4f12.3) +! + write (1,*) + write (1,200) 'Average time used per proc = ',t_tot_avg + write (1,200) 'Average comm. time per proc = ',c_tot_avg + write (1,*) + write (1,100) + write (1,*) +! + close (1) +! + write(*,*) + write(*,200) 'Total time: ',tmax(10) + write(*,200) 'Max Comm/Imbalance time: ', & + tmax(7)+tmax(8)+tmax(9) + write(*,*) +! + end if +! +end subroutine +!####################################################################### +subroutine readbr (fname,br0_g,ierr) +! +!----------------------------------------------------------------------- +! +! ****** Read in the radial magnetic field at the photosphere +! ****** and interpolate it into array BR0_G. +! +! ****** FNAME is the name of the file to read. +! +!----------------------------------------------------------------------- +! + use number_types + use global_dims + use global_mesh + use vars + use rdhdf_2d_interface +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + character(*) :: fname + real(r_typ), dimension(nt_g,np_g) :: br0_g + integer :: ierr +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: two=2._r_typ +! +!----------------------------------------------------------------------- +! +! ****** Br array read in and its scales. +! + real(r_typ), dimension(:,:), pointer :: bn + real(r_typ), dimension(:), pointer :: tn,pn +! +!----------------------------------------------------------------------- +! + integer :: ntn,npn,j,k + logical :: scale + real(r_typ) :: sum0,sum1,area,fluxp,fluxm,da,br00err +! +!----------------------------------------------------------------------- +! 
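+!
+! ****** Processing steps: read the Br file, interpolate it onto
+! ****** the code mesh, enforce phi periodicity and the m=0 pole
+! ****** conditions, and (unless disabled) remove the residual
+! ****** monopole component so that the net flux is zero.
+!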
+ ierr=0
+!
+! ****** Read in the normal field.
+!
+ write (*,*)
+ write (*,*) '### COMMENT from READBR:'
+ write (*,*) '### Reading Br file: ',trim(fname)
+!
+ call rdhdf_2d (fname,scale,ntn,npn,bn,tn,pn,ierr)
+!
+ if (ierr.ne.0) then
+ write (*,*)
+ write (*,*) '### ERROR in READBR:'
+ write (*,*) '### The flux file has the wrong format.'
+ write (*,*) 'IERR (from RDHDF_2D) = ',ierr
+ write (*,*) 'File name: ',trim(fname)
+ ierr=1
+ return
+ end if
+!
+! ****** Check that the array has scales.
+!
+ if (.not.scale) then
+ write (*,*)
+ write (*,*) '### ERROR in READBR:'
+ write (*,*) '### The flux file does not have scales.'
+ write (*,*) 'File name: ',trim(fname)
+ ierr=2
+ return
+ end if
+!
+! ****** Interpolate the field to the code mesh (into array BR0_G).
+!
+ call intrp2d (ntn,npn,tn,pn,bn, &
+ nt_g-2,np_g-2,th_g(2:ntm1_g),ph_g(2:npm1_g), &
+ br0_g(2:ntm1_g,2:npm1_g),ierr)
+!
+ if (ierr.ne.0) then
+ write (*,*)
+ write (*,*) '### ERROR in READBR:'
+ write (*,*) '### The scales in the Br file are invalid.'
+ write (*,*) 'File name: ',trim(fname)
+ ierr=3
+ return
+ end if
+!
+! ****** De-allocate the memory for the BN array and its scales.
+!
+ deallocate (bn)
+ deallocate (tn)
+ deallocate (pn)
+!
+! ****** Set Br to be periodic.
+!
+ br0_g(:,1)=br0_g(:,npm1_g)
+ br0_g(:,np_g)=br0_g(:,2)
+!
+! ****** Set BCs at the poles.
+!
+! ****** Br has only an m=0 component there.
+!
+ sum0=sum(br0_g( 2,2:npm1_g)*dph_g(2:npm1_g))*pl_i
+ sum1=sum(br0_g(ntm1_g,2:npm1_g)*dph_g(2:npm1_g))*pl_i
+!
+ br0_g(1 ,:)=two*sum0-br0_g( 2,:)
+ br0_g(nt_g,:)=two*sum1-br0_g(ntm1_g,:)
+!
+! ****** Calculate the total flux.
+!
+ area=0.
+ fluxp=0.
+ fluxm=0.
+ do j=2,ntm1_g
+ do k=2,npm1_g
+ da=sth_g(j)*dth_g(j)*dph_g(k)
+ if (br0_g(j,k).gt.0.) then
+ fluxp=fluxp+br0_g(j,k)*da
+ else
+ fluxm=fluxm+br0_g(j,k)*da
+ end if
+ area=area+da
+ enddo
+ enddo
+!
+ write (*,*)
+ write (*,*) '### COMMENT from READBR:'
+ write (*,*) '### Computed flux balance:'
+ write (*,*)
+ write (*,*) 'Positive flux = ',fluxp
+ write (*,*) 'Negative flux = ',fluxm
+!
+! ****** Fix the magnetic field so that the total flux is zero
+! ****** (unless flux balancing has been disabled for 'ss' or
+! ****** 'open' runs).
+!
+ if (.not.((option.eq.'ss'.or.option.eq.'open') &
+ .and.do_not_balance_flux)) then
+!
+ br00err=(fluxp+fluxm)/area
+!
+ do k=1,np_g
+ do j=1,nt_g
+ br0_g(j,k)=br0_g(j,k)-br00err
+ enddo
+ enddo
+!
+ write (*,*)
+ write (*,*) '### COMMENT from READBR:'
+ write (*,*) '### Flux balance correction:'
+ write (*,*)
+ write (*,*) 'BR00 (monopole Br field magnitude) = ',br00err
+!
+ end if
+!
+end subroutine
+!#######################################################################
+subroutine intrp2d (nxi,nyi,xi,yi,fi,nx,ny,x,y,f,ierr)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Interpolate a 2D field from array FI(NXI,NYI), defined
+! ****** on the mesh XI(NXI) x YI(NYI), into the array F(NX,NY),
+! ****** defined on the mesh X(NX) x Y(NY).
+!
+! ****** Note that if a data point is outside the bounds of
+! ****** the XI x YI mesh, IERR=2 is returned.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
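+! ****** The interpolation is bilinear: for a target point (xv,yv)
+! ****** in cell (ii,jj), with weights (ax,ay) returned by INTERP,
+!
+! ******   f = (1-ax)*(1-ay)*fi(ii  ,jj) + (1-ax)*ay*fi(ii  ,jj+1)
+! ******     +    ax *(1-ay)*fi(ii+1,jj) +    ax *ay*fi(ii+1,jj+1)
+!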
+ integer :: nxi,nyi
+ real(r_typ), dimension(nxi) :: xi
+ real(r_typ), dimension(nyi) :: yi
+ real(r_typ), dimension(nxi,nyi) :: fi
+ integer :: nx,ny
+ real(r_typ), dimension(nx) :: x
+ real(r_typ), dimension(ny) :: y
+ real(r_typ), dimension(nx,ny) :: f
+ integer :: ierr
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ), parameter :: zero=0._r_typ
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ), external :: flint
+!
+!-----------------------------------------------------------------------
+!
+ integer :: i,j,ii,jj
+ real(r_typ) :: dum(1)=0.
+ real(r_typ) :: dummy,xv,yv,ax,ay
+!
+!-----------------------------------------------------------------------
+!
+ ierr=0
+!
+! ****** Check that the scales XI and YI are monotonic.
+!
+ dummy=flint(zero,nxi,xi,dum,1,ierr)
+ if (ierr.ne.0) go to 900
+!
+ dummy=flint(zero,nyi,yi,dum,1,ierr)
+ if (ierr.ne.0) go to 900
+!
+! ****** Interpolate the data.
+!
+ do j=1,ny
+ yv=y(j)
+ if (yv.lt.yi(1).or.yv.gt.yi(nyi)) then
+ go to 910
+ end if
+ call interp (yi,nyi,yv,jj,ay)
+ do i=1,nx
+ xv=x(i)
+ if (xv.lt.xi(1).or.xv.gt.xi(nxi)) then
+ go to 910
+ end if
+ call interp (xi,nxi,xv,ii,ax)
+ f(i,j)= (1.-ax)*((1.-ay)*fi(ii ,jj )+ay*fi(ii ,jj+1)) &
+ + ax *((1.-ay)*fi(ii+1,jj )+ay*fi(ii+1,jj+1))
+ enddo
+ enddo
+!
+ return
+!
+! ****** Error exit: invalid scales.
+!
+ 900 continue
+!
+ write (*,*)
+ write (*,*) '### ERROR in INTRP2D:'
+ write (*,*) '### Scales are not monotonically increasing.'
+ ierr=1
+!
+ return
+!
+! ****** Error exit: data outside range of scales.
+!
+ 910 continue
+!
+ write (*,*)
+ write (*,*) '### ERROR in INTRP2D:'
+ write (*,*) '### An interpolation was attempted outside the'// &
+ ' range of the defined scales.'
+ ierr=2
+!
+end subroutine
+!#######################################################################
+function flint (x,n,xn,fn,icheck,ierr)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Interpolate a function linearly.
+!
+!-----------------------------------------------------------------------
+!
+! ****** The function is defined at N nodes, with values given by
+! ****** FN(N) at positions XN(N). The function value returned is
+! ****** the linear interpolant at X.
+!
+! ****** Note that if X.lt.XN(1), the function value returned
+! ****** is FN(1), and if X.gt.XN(N), the function value returned
+! ****** is FN(N).
+!
+! ****** Call once with ICHECK.ne.0 to check that the values
+! ****** in XN(N) are monotonically increasing. In this mode
+! ****** the array XN(N) is checked, and X and FN(N) are not
+! ****** accessed. If the check is passed, IERR=0 is returned.
+! ****** Otherwise, IERR=1 is returned.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ) :: flint
+ real(r_typ) :: x
+ integer :: n
+ real(r_typ), dimension(n) :: xn,fn
+ integer :: icheck
+ integer :: ierr
+!
+!-----------------------------------------------------------------------
+!
+ integer :: i,j
+ real(r_typ) :: x1,x2,alpha
+!
+!-----------------------------------------------------------------------
+!
+ ierr=0
+ flint=0.
+!
+! ****** If ICHECK.ne.0, check the abscissa table.
+!
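+! ****** Usage sketch (this mirrors the calls made in INTRP2D;
+! ****** DUMMY, DUM, and FVAL stand for caller-side locals):
+!
+!          dummy=flint(zero,n,xn,dum,1,ierr)   ! check XN once
+!          if (ierr.eq.0) fval=flint(xv,n,xn,fn,0,ierr)
+!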
+ if (icheck.ne.0) then
+ if (n.le.0) then
+ write (*,*)
+ write (*,*) '### ERROR in FLINT:'
+ write (*,*) '### Bad dimension of abscissa table.'
+ write (*,*) 'N = ',n
+ ierr=1
+ return
+ end if
+ do 100 i=1,n-1
+ if (xn(i+1).le.xn(i)) then
+ write (*,*)
+ write (*,*) '### ERROR in FLINT:'
+ write (*,*) '### Bad data in abscissa table.'
+ write (*,*) 'N = ',n
+ write (*,*) 'XN = '
+ write (*,*) (xn(j),j=1,n)
+ ierr=1
+ return
+ end if
+ 100 continue
+ return
+ end if
+!
+! ****** Get the interpolated value.
+!
+ if (x.le.xn(1)) then
+ flint=fn(1)
+ else if (x.gt.xn(n)) then
+ flint=fn(n)
+ else
+ do 200 i=1,n-1
+ if (x.ge.xn(i).and.x.lt.xn(i+1)) go to 300
+ 200 continue
+ 300 continue
+ x1=xn(i)
+ x2=xn(i+1)
+ alpha=(x-x1)/(x2-x1)
+ flint=fn(i)*(1.-alpha)+fn(i+1)*alpha
+ end if
+!
+ return
+end function
+!#######################################################################
+subroutine interp (x,n,xv,i,alpha)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Get interpolation factor ALPHA and table index I.
+!
+! ****** This routine does not do the actual interpolation. Use the
+! ****** returned values of I and ALPHA to get the interpolated value.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ integer :: n
+ real(r_typ), dimension(n) :: x
+ real(r_typ) :: xv
+ integer :: i
+ real(r_typ) :: alpha
+!
+!-----------------------------------------------------------------------
+!
+ do 100 i=1,n-1
+ if (xv.ge.x(i).and.xv.le.x(i+1)) then
+ alpha=(xv-x(i))/(x(i+1)-x(i))
+ go to 200
+ end if
+ 100 continue
+!
+! ****** Value not found --- signal error and stop.
+!
+ write (*,*)
+ write (*,*) '### ERROR in INTERP:'
+ write (*,*) '### Value not found in table.'
+ write (*,*) 'Value requested = ',xv
+ write (*,*) 'Min table value = ',x(1)
+ write (*,*) 'Max table value = ',x(n)
+ call endrun (.true.)
+!
+ 200 continue
+!
+end subroutine
+!#######################################################################
+!
+! ****** Revision history:
+!
+! ### Version 1.00, 03/02/2006, file pot3d.f, modified by ZM:
+!
+! - Cleaned up the previous version of POT3D.
+!
+! ### Version 1.01, 03/06/2006, file pot3d.f, modified by ZM:
+!
+! - Added the ability to do a "source-surface plus
+! current-sheet" solution. Select this by setting
+! OPTION='ss+cs'.
+!
+! ### Version 1.02, 06/18/2007, file pot3d.f, modified by ZM:
+!
+! - Fixed a bug that caused the code to hang when an error
+! was encountered (when running a parallel job).
+!
+! ### Version 1.03, 03/17/2009, file pot3d.f, modified by ZM:
+!
+! - Added the ability to write the boundary flux before the
+! sign flip for current-sheet solutions (i.e., OPTION='open').
+! Set the variable BR_PHOTO_ORIGINAL_FILE to the desired
+! file name to request this.
+!
+! ### Version 1.50, 01/25/2016, file pot3d.f, modified by RC:
+!
+! - Added new (much faster) BILU0 preconditioner to CG solver.
+! To activate, set ifprec=2 in the pot3d.dat file.
+! - Modified CG solve to use 1D arrays
+! for SAXPY and DOT operations.
+!
+! ### Version 2.00, 06/06/2017, file pot3d.f, modified by RC:
+!
+! - Added OpenACC support to the code.
+! - OpenACC support is currently ONLY on 'standard'
+! pot3d runs (not inner-outer iterative mode)
+! and is only efficient on GPUs when using ifprec=1.
+! - OpenACC adds support for running the code on
+! Nvidia GPUs/Intel KNL/x86-multicore/OpenPower.
+! - To use OpenACC, simply compile the code with a compiler
+! that supports OpenACC with the correct flags activated.
+! - Multi-GPU support is included by using the new
+! ngpus_per_node input parameter. This should be set
+! to the number of GPUs available per node.
+! The number of MPI ranks per node should match the
+! number of GPUs per node. This can be launched with
+! "mpiexec -np <...> -ntasks-per-node <...>".
+! - The GPU features of the code are fully portable, i.e.,
+! the code can be compiled/used as before on CPUs with no
+! changes in compilation or run-time.
+! - Modified some routines to be "nicer" for OpenACC
+! and optimized some MPI communications.
+! - Added wall-clock timer and corrected placement of
+! MPI_Finalize(). The wall clock timer now reflects the
+! true runtime.
+!
+! ### Version 2.01, 10/02/2017, file pot3d.f, modified by RC:
+!
+! - Optimized OpenACC.
+! - Renamed cgsolv() to cgsolve().
+!
+! ### Version 2.10, 01/15/2018, file pot3d.f, modified by ZM+RC:
+!
+! - Added the ability to skip the balancing of the flux
+! when doing a PFSS or OPEN field. To invoke this, set
+! DO_NOT_BALANCE_FLUX=.true..
+! - Changed some pointers to allocatables for better
+! vectorization.
+!
+! ### Version 2.11, 03/19/2018, file pot3d.f, modified by RC:
+!
+! - Added 'fmt' input parameter to set output file type.
+! Set fmt='h5' to write out HDF5 (default is 'hdf').
+!
+! ### Version 2.12, 10/08/2018, file pot3d.f, modified by RC:
+!
+! - COMPATIBILITY CHANGE! Renamed gpus_per_node to gpn.
+! gpn defaults to 0, which sets gpn to the number of
+! MPI ranks in the local node.
+! Setting gpn manually is not recommended and only used for
+! oversubscribing the GPUs.
+! - Added MPI shared communicator for automatically setting gpn.
+! This requires an MPI-3 capable MPI library.
+! - Changed layout of matrix coefficient arrays to be more
+! vector-friendly instead of cache-friendly.
+!
+! ### Version 2.13, 11/19/2018, file pot3d.f, modified by RC:
+!
+! - Small modifications to polar boundary condition calculations.
+! - Updated array ops and ACC clauses to be F2003 optimized.
+!
+! ### Version 2.20, 01/09/2019, file pot3d.f, modified by RC:
+!
+! - Added double precision output option.
+! Set hdf32=.false. to activate 64-bit output.
+! - Updated magnetic field computation. B is now computed
+! in parallel. 3D output fields now gathered to rank 0
+! using Sends and Receives instead of Gatherv in order
+! to allow very large resolutions.
+! - Added automatic topology. Now, nprocs is optional.
+! One can specify one or more topology dimensions and
+! use the flag value "-1" to indicate dimensions to auto-set.
+! It is recommended to simply delete nprocs from input files.
+! - Added output file flushing so CG iterations can be monitored.
+! - Added new MPI rank diagnostics including
+! estimated load imbalance.
+! - Processor topology and magnetic energy output now written to
+! pot3d.out as well as the terminal.
+!
+! ### Version 2.21, 01/11/2019, file pot3d.f, modified by RC:
+!
+! - Small updates to magnetic_energy routine.
+!
+! ### Version 2.22, 11/27/2019, file pot3d.f, modified by RC:
+!
+! - Optimized some OpenACC directives. Expanded some
+! array-syntax lines to full loops.
+!
+! ### Version 2.23, 08/11/2020, file pot3d.f, modified by RC:
+!
+! - Small bug fix for default output file names and
+! format option fmt.
+!
+! ### Version 3.0.0, 02/10/2021, file pot3d.f, modified by RC:
+!
+! - Changed version number scheme to semantic versioning.
+!
+! ### Version 3.1.0, 08/11/2020, file pot3d.f, modified by RC:
+!
+! - Removed ss+cs inner-outer iterative mode.
+! It had not been used in many years and never worked
+! quite right. The algorithm is preserved in previous
+! releases on GitHub and SVN.
+! - Added a validation run mode. To activate, set:
+! VALIDATION_RUN=.TRUE.
+! This will set up a tilted dipole exact solution, write it
+! out, and then solve for it. It will ignore input br files.
+! To change the angle of the dipole, set DIPOLE_ANGLE
+! (default is pi/4).
+! - Added the NCGFLUSH parameter: if NCGHIST>0, the output
+! file is flushed every NCGFLUSH solver iterations
+! (default 25). This is useful for tracking the progress
+! of the solver.
+!
+! ### Version 3.1.1, 11/02/2021, modified by RC:
+!
+! - Fixed default output name mechanics. The default names
+! were ignoring "fmt", making "fmt" almost useless.
+!
+! ### Version 3.2.0, 12/21/2021, modified by RC:
+!
+! - GPN is now deprecated. Now, # of GPUs per node is assumed
+! to be equal to # of MPI ranks per node.
+! This removes the ability to oversubscribe GPUs, but this
+! was never being used, so no great loss.
+! This change was done to allow setting the GPU device
+! number before any allocations so that it works correctly
+! when using NVIDIA unified managed memory.
+!
+! ### Version 3.3.0, 02/14/2022, modified by RC:
+!
+! - Added new timers for AX, PC, PCLOAD, and modified some timers
+! to be more useful.
+! - Updated ILU0 PC2 preconditioner to use single precision.
+! This speeds up the solve both on CPUs and GPUs.
+!
+! ### Version 4.0.0, 03/02/2022, modified by RC/MS:
+!
+! - Added cuSparse library calls for ifprec=2 on NVIDIA GPUs.
+! - Added #ifdef preprocessor directives which compile either
+! a cuSparse GPU version of the code or a CPU-compatible
+! version with no cuSparse calls and no C code references.
+!
+! ### Version 4.0.1, 03/31/2022, modified by RC:
+!
+! - Added 'target' attribute to arrays used with C_LOC.
+! This fixes a compiler error with GFORTRAN when
+! using cuSparse.
+!
+! ### Version 4.1.0, 10/24/2022, modified by RC:
+!
+! - Removed NCGFLUSH input parameter. The output is
+! flushed at every write now.
+! - Changed behavior of solver residual reporting
+! to also output to terminal.
+! - Removed flush_output_file() routine in favor of Fortran 2003
+! intrinsic FLUSH.
+!
+! ### Version 4.2.0, 08/21/2024, modified by MS/RC:
+!
+! - Replaced OpenACC with OpenMP Target directives.
+! There remains a single OpenACC directive for device selection
+! in order to work with nvfortran.
+!
+! ### Version 4.2.1, 09/19/2024, modified by RC:
+!
+! - Changed array reduction loops to use
+! OpenMP reductions instead of atomics.
+! - Updated OpenMP target directives to use the
+! more streamlined "loop" syntax.
+!
+! ### Version 4.2.2, 12/05/2024, modified by RC:
+!
+! - Fixed bug with OpenMP target loops.
+! Was using 'target loop' instead of 'target teams loop';
+! nvfortran seemed to allow this, but ifx does not.
+!
+! ### Version 4.3.0, 12/05/2024, modified by RC:
+!
+! - Converted code to free-format and renamed file from
+! pot3d.F to pot3d.F90.
+!
+! ### Version 4.3.1, 12/05/2024, modified by RC:
+!
+! - Code cleanup of modules.
+! - Made solver seam buffers global to avoid too many
+! allocates and deallocates for GPU.
+!
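+! ****** For reference, the 4.3.1 seam-buffer change in outline
+! ****** (a simplified sketch with one buffer pair, not the actual
+! ****** code; see the new SEAM_SETUP and SEAM_HHH routines):
+!
+! ****** Once, at startup:
+!
+!          allocate (sbuf_tp1(nt,np),rbuf_tp1(nt,np))
+!
+! ****** Then, on every seam call, the same buffers are reused
+! ****** with no allocation:
+!
+!          sbuf_tp1(:,:)=a(nr-1,:,:)
+!          call MPI_Isend (sbuf_tp1,nt*np,ntype_real,iproc_rp,...)
+!          call MPI_Irecv (rbuf_tp1,nt*np,ntype_real,iproc_rm,...)
+!          call MPI_Waitall (...)
+!          if (iproc_rm.ne.MPI_PROC_NULL) a(1,:,:)=rbuf_tp1(:,:)
+!
+! ****** Previously these buffers were allocated and deallocated
+! ****** inside every call.
+!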
+!####################################################################### diff --git a/src/pot3d.F90 b/src/pot3d.F90 index 1db8f4a..d23c271 100644 --- a/src/pot3d.F90 +++ b/src/pot3d.F90 @@ -52,7 +52,7 @@ module ident !----------------------------------------------------------------------- ! character(*), parameter :: idcode='POT3D' - character(*), parameter :: vers ='4.3.0' + character(*), parameter :: vers ='4.3.1' character(*), parameter :: update='12/05/2024' ! end module @@ -129,65 +129,38 @@ module global_dims ! end module !####################################################################### -module local_dims_r +module local_dims ! !----------------------------------------------------------------------- -! ****** Local number of mesh points and indices in the r direction -! ****** for the radial mesh. +! ****** Local number of mesh points. !----------------------------------------------------------------------- ! implicit none ! -! ****** Local mesh size. -! - integer :: nr,nrm1 -! -! ****** Dimensions of arrays on the "main" mesh. -! - integer :: nrm -! -! ****** Indices of start and end points in the global mesh -! ****** belonging to this processor. -! - integer :: i0_g,i1_g -! ! ****** Flags to indicate whether this processor has points ! ****** on the physical boundaries. ! - logical :: rb0,rb1 -! -end module -!####################################################################### -module local_dims_tp -! -!----------------------------------------------------------------------- -! ****** Local number of mesh points and indices in the theta -! ****** and phi dimensions. -!----------------------------------------------------------------------- -! - implicit none + logical :: rb0,rb1,tb0,tb1 ! ! ****** Local mesh size. ! + integer :: nr,nrm1 integer :: nt,ntm1 integer :: np,npm1 ! ! ****** Dimensions of arrays on the "main" mesh. ! + integer :: nrm integer :: ntm integer :: npm ! ! ****** Indices of start and end points in the global mesh ! ****** belonging to this processor. ! + integer :: i0_g,i1_g integer :: j0_g,j1_g integer :: k0_g,k1_g ! -! ****** Flags to indicate whether this processor has points -! ****** on the physical boundaries. -! - logical :: tb0,tb1 -! end module !####################################################################### module global_mesh @@ -225,10 +198,10 @@ module global_mesh ! end module !####################################################################### -module local_mesh_r +module local_mesh ! !----------------------------------------------------------------------- -! ****** Local mesh for the r dimension. +! ****** Local mesh. !----------------------------------------------------------------------- ! use number_types @@ -237,23 +210,6 @@ module local_mesh_r ! real(r_typ), dimension(:), allocatable :: r,r2,rh,dr,drh real(r_typ) :: dr1,drn -! -! ****** Inverse quantities (for efficiency). -! - real(r_typ), dimension(:), allocatable :: r_i,rh_i - real(r_typ), dimension(:), allocatable :: dr_i,drh_i -! -end module -!####################################################################### -module local_mesh_tp -! -!----------------------------------------------------------------------- -! ****** Local mesh for the theta and phi dimensions. -!----------------------------------------------------------------------- -! - use number_types -! - implicit none ! real(r_typ), dimension(:), allocatable :: t,th,dt,dth real(r_typ), dimension(:), allocatable :: p,ph,dp,dph @@ -263,6 +219,8 @@ module local_mesh_tp ! ! 
****** Inverse quantities (for efficiency). ! + real(r_typ), dimension(:), allocatable :: r_i,rh_i + real(r_typ), dimension(:), allocatable :: dr_i,drh_i real(r_typ), dimension(:), allocatable :: dt_i,dth_i real(r_typ), dimension(:), allocatable :: st_i,sth_i real(r_typ), dimension(:), allocatable :: dp_i,dph_i @@ -479,6 +437,16 @@ module cgcom integer :: ncg real(r_typ) :: epsn ! +! ****** Seam buffers. +! + real(r_typ), allocatable, dimension(:,:) :: sbuf_rt1,sbuf_rt2 + real(r_typ), allocatable, dimension(:,:) :: sbuf_tp1,sbuf_tp2 + real(r_typ), allocatable, dimension(:,:) :: sbuf_rp1,sbuf_rp2 +! + real(r_typ), allocatable, dimension(:,:) :: rbuf_rt1,rbuf_rt2 + real(r_typ), allocatable, dimension(:,:) :: rbuf_tp1,rbuf_tp2 + real(r_typ), allocatable, dimension(:,:) :: rbuf_rp1,rbuf_rp2 +! end module !####################################################################### module vars @@ -763,6 +731,10 @@ program POT3D ! call decomp_diags ! +! ****** Set up seam for solver. +! + call seam_setup +! ! ****** Initialize the flux and if validating, write analytic solution. ! if (validation_run) then @@ -1646,7 +1618,7 @@ subroutine decompose_mesh_r !----------------------------------------------------------------------- ! use global_dims - use local_dims_r + use local_dims use decomposition use solve_params use mpidefs @@ -1779,7 +1751,7 @@ subroutine decompose_mesh_tp !----------------------------------------------------------------------- ! use global_dims - use local_dims_tp + use local_dims use decomposition use mpidefs ! @@ -2209,10 +2181,8 @@ subroutine decomp_diags ! use global_dims use global_mesh - use local_dims_r - use local_mesh_r - use local_dims_tp - use local_mesh_tp + use local_dims + use local_mesh use mpidefs use solve_params use debug @@ -2386,9 +2356,8 @@ subroutine allocate_local_arrays_r ! !----------------------------------------------------------------------- ! - use local_dims_r - use local_mesh_r - use local_dims_tp + use local_dims + use local_mesh use fields ! !----------------------------------------------------------------------- @@ -2447,8 +2416,8 @@ subroutine allocate_local_arrays_tp ! !----------------------------------------------------------------------- ! - use local_dims_tp - use local_mesh_tp + use local_dims + use local_mesh use fields use vars use solve_params @@ -2645,8 +2614,8 @@ subroutine set_local_mesh_r use number_types use global_dims use global_mesh - use local_dims_r - use local_mesh_r + use local_dims + use local_mesh ! !----------------------------------------------------------------------- ! @@ -2699,8 +2668,8 @@ subroutine set_local_mesh_tp use number_types use global_dims use global_mesh - use local_dims_tp - use local_mesh_tp + use local_dims + use local_mesh ! !----------------------------------------------------------------------- ! @@ -3689,10 +3658,8 @@ subroutine set_flux use number_types use global_dims use global_mesh - use local_dims_r - use local_mesh_r - use local_dims_tp - use local_mesh_tp + use local_dims + use local_mesh use fields use vars use mpidefs @@ -3801,10 +3768,8 @@ subroutine potfld !----------------------------------------------------------------------- ! use number_types - use local_dims_r - use local_mesh_r - use local_dims_tp - use local_mesh_tp + use local_dims + use local_mesh use fields use cgcom use solve_params @@ -3866,7 +3831,7 @@ subroutine potfld ! ****** boundary conditions (i.e., the inhomogenous part). ! 
call set_boundary_points (x_ax,one) - call seam (x_ax,nr,nt,np) + call seam_hhh (x_ax) call delsq (x_ax,rhs_cg) ! ! ****** Original rhs is zero so just use negative of boundary @@ -3893,7 +3858,7 @@ subroutine potfld call unpack_scalar (phi,x_cg) ! call set_boundary_points (phi,one) - call seam (phi,nr,nt,np) + call seam_hhh (phi) ! !$omp target exit data map(delete:rhs_cg,x_cg,a,a_i) call dealloc_pot3d_matrix_coefs @@ -3914,10 +3879,8 @@ subroutine set_validation_flux use number_types use global_dims use global_mesh - use local_dims_r - use local_mesh_r - use local_dims_tp - use local_mesh_tp + use local_dims + use local_mesh use fields use vars use mpidefs @@ -4006,10 +3969,8 @@ subroutine write_validation_solution use number_types use global_dims use global_mesh - use local_dims_r - use local_mesh_r - use local_dims_tp - use local_mesh_tp + use local_dims + use local_mesh use fields use vars use mpidefs @@ -4038,7 +3999,7 @@ subroutine write_validation_solution enddo ! call set_boundary_points (phi,one) - call seam (phi,nr,nt,np) + call seam_hhh (phi) ! !$omp target enter data map(alloc:br,bt,bp) ! @@ -4065,9 +4026,9 @@ subroutine write_validation_solution ! ****** Due to half mesh trig issues on periodic domain, we need ! ****** to seam these even though seaming is not needed in getb(). ! - call seam (br,nrm,nt,np) - call seam (bt,nr,ntm,np) - call seam (bp,nr,nt,npm) + call seam_gen (br,nrm,nt,np) + call seam_gen (bt,nr,ntm,np) + call seam_gen (bp,nr,nt,npm) ! ! ****** Backup fnames and set new ones so we can reuse write routine. ! @@ -4421,8 +4382,7 @@ subroutine alloc_pot3d_matrix_coefs ! use matrix_storage_pot3d_solve use cgcom - use local_dims_r - use local_dims_tp + use local_dims ! !----------------------------------------------------------------------- ! @@ -4500,10 +4460,8 @@ subroutine load_matrix_pot3d_solve ! use number_types use matrix_storage_pot3d_solve - use local_dims_r - use local_mesh_r - use local_dims_tp - use local_mesh_tp + use local_dims + use local_mesh ! !----------------------------------------------------------------------- ! @@ -4553,8 +4511,7 @@ subroutine load_preconditioner_pot3d_solve use number_types use matrix_storage_pot3d_solve use cgcom - use local_dims_r - use local_dims_tp + use local_dims use, intrinsic :: iso_c_binding use cusparse_interface use timing @@ -4816,8 +4773,7 @@ subroutine diacsr (N,M,Adia,ioff,Acsr,JA,IA,Adptr) ! use number_types use number_types_pc - use local_dims_r - use local_dims_tp + use local_dims use mpidefs ! !----------------------------------------------------------------------- @@ -4962,8 +4918,7 @@ subroutine getM (N, ioff, M) !----------------------------------------------------------------------- ! use mpidefs - use local_dims_r - use local_dims_tp + use local_dims ! !----------------------------------------------------------------------- ! @@ -5059,8 +5014,7 @@ subroutine ax (x,y,N) !----------------------------------------------------------------------- ! use number_types - use local_dims_r - use local_dims_tp + use local_dims use fields, ONLY : x_ax use timing ! @@ -5089,7 +5043,7 @@ subroutine ax (x,y,N) ! ! ****** Seam along edges between processors. ! - call seam (x_ax,nr,nt,np) + call seam_hhh (x_ax) ! ! ****** Get the matrix-vector product. ! @@ -5263,8 +5217,7 @@ subroutine unpack_scalar (s,x) !----------------------------------------------------------------------- ! use number_types - use local_dims_r - use local_dims_tp + use local_dims ! 
!----------------------------------------------------------------------- ! @@ -5296,8 +5249,7 @@ subroutine delsq (x,y) !----------------------------------------------------------------------- ! use number_types - use local_dims_r - use local_dims_tp + use local_dims use matrix_storage_pot3d_solve ! !----------------------------------------------------------------------- @@ -5337,10 +5289,8 @@ subroutine set_boundary_points (x,vmask) ! use number_types use global_mesh - use local_dims_r - use local_mesh_r - use local_dims_tp - use local_mesh_tp + use local_dims + use local_mesh use fields use solve_params use vars, ONLY : validation_run @@ -5456,7 +5406,7 @@ subroutine sum_over_phi (n,a0,a1) !----------------------------------------------------------------------- ! use number_types - use local_dims_tp + use local_dims use mpidefs use timing ! @@ -5505,8 +5455,7 @@ subroutine zero_boundary_points (x) !----------------------------------------------------------------------- ! use number_types - use local_dims_r - use local_dims_tp + use local_dims ! !----------------------------------------------------------------------- ! @@ -5600,7 +5549,207 @@ subroutine global_sum (x) ! end subroutine !####################################################################### -subroutine seam (a,n1,n2,n3) +subroutine seam_setup +! +!----------------------------------------------------------------------- +! +! ****** Allocate seam buffers. +! ****** Doing this here avoids GPU allocate/deallocates. +! +!----------------------------------------------------------------------- +! + use number_types + use cgcom + use local_mesh + use local_dims +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + allocate(sbuf_rt1(nr,nt)) + allocate(sbuf_rt2(nr,nt)) + allocate(sbuf_tp1(nt,np)) + allocate(sbuf_tp2(nt,np)) + allocate(sbuf_rp1(nr,np)) + allocate(sbuf_rp2(nr,np)) +! + allocate(rbuf_rt1(nr,nt)) + allocate(rbuf_rt2(nr,nt)) + allocate(rbuf_tp1(nt,np)) + allocate(rbuf_tp2(nt,np)) + allocate(rbuf_rp1(nr,np)) + allocate(rbuf_rp2(nr,np)) +! +!$omp target enter data map(alloc:sbuf_rt1,sbuf_rt2) +!$omp target enter data map(alloc:sbuf_tp1,sbuf_tp2) +!$omp target enter data map(alloc:sbuf_rp1,sbuf_rp2) +!$omp target enter data map(alloc:rbuf_rt1,rbuf_rt2) +!$omp target enter data map(alloc:rbuf_tp1,rbuf_tp2) +!$omp target enter data map(alloc:rbuf_rp1,rbuf_rp2) +! +end subroutine +!####################################################################### +subroutine seam_hhh (a) +! +!----------------------------------------------------------------------- +! +! ****** Seam the boundary points of 3D (r,t,p) array A between +! ****** adjacent processors. +! +! ****** This routine assumes that there is a two-point +! ****** overlap between processors in each dimension. +! +!----------------------------------------------------------------------- +! +! ****** This version uses non-blocking MPI sends and receives +! ****** whenever possible in order to overlap communications. +! +!----------------------------------------------------------------------- +! + use number_types + use mpidefs + use timing + use local_mesh + use local_dims + use cgcom +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), dimension(nr,nt,np) :: a +! 
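+! ****** The exchange below follows the same pattern in each
+! ****** decomposed dimension (an outline only, not executable):
+!
+! ******   pack the halo into sbuf_* with DO CONCURRENT (on device),
+! ******   post MPI_Isend/MPI_Irecv on the device buffers inside an
+! ******   OMP TARGET DATA USE_DEVICE_PTR region, MPI_Waitall, then
+! ******   unpack rbuf_* (skipped where the neighbor is
+! ******   MPI_PROC_NULL, i.e., at physical boundaries).
+!
+! ****** The phi halos are contiguous in memory (stride-1), so they
+! ****** are sent directly from A without packing.
+!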
+!----------------------------------------------------------------------- +! +! ****** MPI error return. +! + integer :: ierr +! +! ****** MPI tag for MPI_ISEND and MPI_IRECV (not tagged). +! + integer :: tag=0 +! +!----------------------------------------------------------------------- +! + integer :: lbuf,i,j + integer :: reqs(4) +! +!----------------------------------------------------------------------- +! + call timer_on +! +! ****** Seam the third (periodic) dimension. +! ****** Since halo data is stride-1, no need for buffers. +! + lbuf=nr*nt +! +!$omp target data use_device_ptr(a) + call MPI_Isend (a(:,:,np-1),lbuf,ntype_real,iproc_pp,tag, & + comm_all,reqs(1),ierr) +! + call MPI_Isend (a(:,:, 2),lbuf,ntype_real,iproc_pm,tag, & + comm_all,reqs(2),ierr) +! + call MPI_Irecv (a(:,:, 1),lbuf,ntype_real,iproc_pm,tag, & + comm_all,reqs(3),ierr) +! + call MPI_Irecv (a(:,:,np),lbuf,ntype_real,iproc_pp,tag, & + comm_all,reqs(4),ierr) +! + call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) +!$omp end target data +! +! ****** Seam the first dimension. +! + if (nproc_r.gt.1) then +! + lbuf=nt*np +! + do concurrent (j=1:np, i=1:nt) + sbuf_tp1(i,j)=a(nr-1,i,j) + sbuf_tp2(i,j)=a( 2,i,j) + enddo +! +!$omp target data use_device_ptr(sbuf_tp1,sbuf_tp2,rbuf_tp1,rbuf_tp2) + call MPI_Isend (sbuf_tp1,lbuf,ntype_real,iproc_rp,tag, & + comm_all,reqs(1),ierr) +! + call MPI_Isend (sbuf_tp2,lbuf,ntype_real,iproc_rm,tag, & + comm_all,reqs(2),ierr) +! + call MPI_Irecv (rbuf_tp1,lbuf,ntype_real,iproc_rm,tag, & + comm_all,reqs(3),ierr) +! + call MPI_Irecv (rbuf_tp2,lbuf,ntype_real,iproc_rp,tag, & + comm_all,reqs(4),ierr) +! + call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) +!$omp end target data +! + if (iproc_rm.ne.MPI_PROC_NULL) then + do concurrent (j=1:np, i=1:nt) + a( 1,i,j)=rbuf_tp1(i,j) + enddo + end if +! + if (iproc_rp.ne.MPI_PROC_NULL) then + do concurrent (j=1:np, i=1:nt) + a(nr,i,j)=rbuf_tp2(i,j) + enddo + end if + end if +! +! ****** Seam the second dimension. +! + if (nproc_t.gt.1) then +! + lbuf=nr*np +! + do concurrent (j=1:np, i=1:nr) + sbuf_rp1(i,j)=a(i,nt-1,j) + sbuf_rp2(i,j)=a(i, 2,j) + enddo +! +!$omp target data use_device_ptr(sbuf_rp1,sbuf_rp2,rbuf_rp1,rbuf_rp2) + call MPI_Isend (sbuf_rp1,lbuf,ntype_real,iproc_tp,tag, & + comm_all,reqs(1),ierr) +! + call MPI_Isend (sbuf_rp2,lbuf,ntype_real,iproc_tm,tag, & + comm_all,reqs(2),ierr) +! + call MPI_Irecv (rbuf_rp1,lbuf,ntype_real,iproc_tm,tag, & + comm_all,reqs(3),ierr) +! + call MPI_Irecv (rbuf_rp2,lbuf,ntype_real,iproc_tp,tag, & + comm_all,reqs(4),ierr) +! + call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) +!$omp end target data +! + if (iproc_tm.ne.MPI_PROC_NULL) then + do concurrent (j=1:np, i=1:nr) + a(i, 1,j)=rbuf_rp1(i,j) + enddo + end if +! + if (iproc_tp.ne.MPI_PROC_NULL) then + do concurrent (j=1:np, i=1:nr) + a(i,nt,j)=rbuf_rp2(i,j) + enddo + end if +! + end if +! + call timer_off (c_seam) +! +end subroutine +!####################################################################### +subroutine seam_gen (a,n1,n2,n3) ! !----------------------------------------------------------------------- ! @@ -5954,10 +6103,8 @@ subroutine getb use global_mesh use vars use fields - use local_dims_r - use local_dims_tp - use local_mesh_r - use local_mesh_tp + use local_dims + use local_mesh ! !----------------------------------------------------------------------- ! 
@@ -6005,10 +6152,8 @@ subroutine magnetic_energy use vars use fields use mpidefs - use local_dims_r - use local_dims_tp - use local_mesh_r - use local_mesh_tp + use local_dims + use local_mesh ! !----------------------------------------------------------------------- ! @@ -6386,8 +6531,8 @@ subroutine write_timing write (1,*) 'Average times:' write (1,*) '-------------' write (1,*) - write (1,300) 'Avg Min Max S. Dev' - write (1,300) '--- --- --- ------' + write (1,300) ' Avg Min Max S. Dev' + write (1,300) ' --- --- --- ------' write (1,400) 'Avg comm. time in SEAM = ', & tavg(7),tmin(7),tmax(7),tsdev(7) write (1,400) 'Avg comm. time in CGDOT = ', & @@ -7048,4 +7193,10 @@ subroutine interp (x,n,xv,i,alpha) ! - Converted code to free-format and renamed file from ! pot3d.F to pot3d.F90. ! +! ### Version 4.3.1, 12/05/2024, modified by RC: +! +! - Code cleanup of modules. +! - Made solver seam buffers global to avoid too many +! allocates and deallocates for GPU. +! !####################################################################### diff --git a/src/stdpar/pot3d.F b/src/stdpar/pot3d.F deleted file mode 100644 index 5c85e58..0000000 --- a/src/stdpar/pot3d.F +++ /dev/null @@ -1,6970 +0,0 @@ -c####################################################################### -c _____ ____ _______ ____ _____ -c | __ \ / __ \__ __|___ \| __ \ -c | |__) | | | | | | __) | | | | -c | ___/| | | | | | |__ <| | | | -c | | | |__| | | | ___) | |__| | -c |_| \____/ |_| |____/|_____/ -c -c ****** POT3D: 3D potential magnetic field outside a sphere. -c -c ****** This program can find the classical potential field, the -c ****** fully open field, the source-surface field, and the -c ****** source-surface plus current-sheet field. -c -c Authors: Zoran Mikic -c Ronald M. Caplan -c Jon A. Linker -c Roberto Lionello -c Miko Stulajter -c -c Predictive Science Inc. -c www.predsci.com -c San Diego, California, USA 92121 -c -c####################################################################### -c Copyright 2021 Predictive Science Inc. -c -c Licensed under the Apache License, Version 2.0 (the "License"); -c you may not use this file except in compliance with the License. -c You may obtain a copy of the License at -c -c http://www.apache.org/licenses/LICENSE-2.0 -c -c Unless required by applicable law or agreed to in writing, software -c distributed under the License is distributed on an "AS IS" BASIS, -c WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -c implied. -c See the License for the specific language governing permissions and -c limitations under the License. -c####################################################################### -c -c####################################################################### -c - module ident -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c ****** Code name. -c----------------------------------------------------------------------- -c - character(*), parameter :: idcode='POT3D' - character(*), parameter :: vers ='4.2.0_STDPAR' - character(*), parameter :: update='08/21/2024' -c - end module -c####################################################################### - module number_types -c -c----------------------------------------------------------------------- -c ****** Basic number types. -c ****** This module is used to set the default precision for REALs. 
-c----------------------------------------------------------------------- -c - use iso_fortran_env -c -c----------------------------------------------------------------------- -c - implicit none -c - integer, parameter :: KIND_REAL_4=REAL32 - integer, parameter :: KIND_REAL_8=REAL64 - integer, parameter :: KIND_REAL_16=max(REAL128,REAL64) -c - integer, parameter :: r_typ=KIND_REAL_8 -c - end module -c####################################################################### - module number_types_pc -c -c----------------------------------------------------------------------- -c - use number_types - use iso_fortran_env -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer, parameter :: r_typ_pc=REAL32 -c - end module -c####################################################################### - module constants -c -c----------------------------------------------------------------------- -c - use number_types -c -c----------------------------------------------------------------------- -c - implicit none -c - real(r_typ), parameter :: pi=3.1415926535897932_r_typ -c - end module -c####################################################################### - module global_dims -c -c----------------------------------------------------------------------- -c ****** Global number of mesh points. -c----------------------------------------------------------------------- -c - implicit none -c -c ****** Global mesh size. -c - integer :: nr_g,nrm1_g - integer :: nt_g,ntm1_g - integer :: np_g,npm1_g -c -c ****** Flag to indicate an axisymmetric run. -c - logical :: axisymmetric=.false. -c - end module -c####################################################################### - module local_dims_r -c -c----------------------------------------------------------------------- -c ****** Local number of mesh points and indices in the r direction -c ****** for the radial mesh. -c----------------------------------------------------------------------- -c - implicit none -c -c ****** Local mesh size. -c - integer :: nr,nrm1 -c -c ****** Dimensions of arrays on the "main" mesh. -c - integer :: nrm -c -c ****** Indices of start and end points in the global mesh -c ****** belonging to this processor. -c - integer :: i0_g,i1_g -c -c ****** Flags to indicate whether this processor has points -c ****** on the physical boundaries. -c - logical :: rb0,rb1 -c - end module -c####################################################################### - module local_dims_tp -c -c----------------------------------------------------------------------- -c ****** Local number of mesh points and indices in the theta -c ****** and phi dimensions. -c----------------------------------------------------------------------- -c - implicit none -c -c ****** Local mesh size. -c - integer :: nt,ntm1 - integer :: np,npm1 -c -c ****** Dimensions of arrays on the "main" mesh. -c - integer :: ntm - integer :: npm -c -c ****** Indices of start and end points in the global mesh -c ****** belonging to this processor. -c - integer :: j0_g,j1_g - integer :: k0_g,k1_g -c -c ****** Flags to indicate whether this processor has points -c ****** on the physical boundaries. -c - logical :: tb0,tb1 -c - end module -c####################################################################### - module global_mesh -c -c----------------------------------------------------------------------- -c ****** Global mesh. 
-c----------------------------------------------------------------------- -c - use number_types - use constants -c - implicit none -c - real(r_typ), dimension(:), allocatable :: r_g,rh_g,dr_g,drh_g - real(r_typ), dimension(:), allocatable :: t_g,th_g,dt_g,dth_g - real(r_typ), dimension(:), allocatable :: p_g,ph_g,dp_g,dph_g -c - real(r_typ), dimension(:), allocatable :: st_g,ct_g,sth_g,cth_g - real(r_typ), dimension(:), allocatable :: sp_g,cp_g,sph_g,cph_g -c -c ****** Physical mesh size. -c - real(r_typ), parameter, private :: one=1._r_typ - real(r_typ), parameter, private :: two=2._r_typ -c - real(r_typ) :: r0=1._r_typ - real(r_typ) :: r1=30._r_typ - real(r_typ), parameter :: t0=0. - real(r_typ), parameter :: t1=pi - real(r_typ), parameter :: p0=0. - real(r_typ), parameter :: p1=two*pi -c - real(r_typ), parameter :: pl=p1-p0 - real(r_typ), parameter :: pl_i=one/pl -c - end module -c####################################################################### - module local_mesh_r -c -c----------------------------------------------------------------------- -c ****** Local mesh for the r dimension. -c----------------------------------------------------------------------- -c - use number_types -c - implicit none -c - real(r_typ), dimension(:), allocatable :: r,r2,rh,dr,drh - real(r_typ) :: dr1,drn -c -c ****** Inverse quantities (for efficiency). -c - real(r_typ), dimension(:), allocatable :: r_i,rh_i - real(r_typ), dimension(:), allocatable :: dr_i,drh_i -c - end module -c####################################################################### - module local_mesh_tp -c -c----------------------------------------------------------------------- -c ****** Local mesh for the theta and phi dimensions. -c----------------------------------------------------------------------- -c - use number_types -c - implicit none -c - real(r_typ), dimension(:), allocatable :: t,th,dt,dth - real(r_typ), dimension(:), allocatable :: p,ph,dp,dph -c - real(r_typ), dimension(:), allocatable :: st,ct,sth,cth - real(r_typ), dimension(:), allocatable :: sp,cp,sph,cph -c -c ****** Inverse quantities (for efficiency). -c - real(r_typ), dimension(:), allocatable :: dt_i,dth_i - real(r_typ), dimension(:), allocatable :: st_i,sth_i - real(r_typ), dimension(:), allocatable :: dp_i,dph_i -c - end module -c####################################################################### - module mpidefs -c -c----------------------------------------------------------------------- -c ****** MPI variables, processor topology, and processor information. -c----------------------------------------------------------------------- -c - use mpi -c - implicit none -c -c ****** Total number of processors. -c - integer :: nproc -c -c ****** Total number of processors per node. -c - integer :: nprocsh -c -c ****** Processor rank of this process in communicator -c ****** MPI_COMM_WORLD. -c - integer :: iprocw -c -c ****** Processor rank of this process in communicator -c ****** comm_shared. -c - integer :: iprocsh -c -c ****** Flag to designate that this is the processor with -c ****** rank 0 in communicator MPI_COMM_WORLD. -c - logical :: iamp0 -c -c ****** Communicator over all processors in the Cartesian topology. -c - integer :: comm_all -c -c ****** Processor rank of this process in communicator -c ****** COMM_ALL. -c - integer :: iproc -c -c ****** Processor rank in communicator COMM_ALL for the -c ****** processor that has rank 0 in MPI_COMM_WORLD. -c - integer :: iproc0 -c -c ****** Communicators over all processors in the phi dimension. 
-c - integer :: comm_phi -c -c ****** Communicator over all shared processors on the node. -c - integer :: comm_shared -c -c ****** Communicators over all processors in the r dimension. -c - integer :: comm_r -c -c ****** Processor coordinate indices of this process -c ****** in the Cartesian topology. -c - integer :: iproc_r,iproc_t,iproc_p -c -c ****** Processor coordinate indices of the neighboring -c ****** processors in the Cartesian topology. -c - integer :: iproc_rm,iproc_rp - integer :: iproc_tm,iproc_tp - integer :: iproc_pm,iproc_pp -c -c ****** Number of processors along r, theta, and phi. -c - integer :: nproc_r,nproc_t,nproc_p -c -c ****** Number type for REALs to be used in MPI calls. -c - integer :: ntype_real -c -c ****** Total number of GPUs/node (DEPRICATED). -c - integer :: gpn=0 -c - end module -c####################################################################### - module decomposition_params -c -c----------------------------------------------------------------------- -c ****** Input parameters that define the domain decomposition -c ****** among processors. -c----------------------------------------------------------------------- -c - implicit none -c -c ****** Number of processors per dimension. -c - integer, dimension(3) :: nprocs=(/-1,-1,-1/) -c - end module -c####################################################################### - module decomposition -c -c----------------------------------------------------------------------- -c ****** Information defining the domain decomposition. -c----------------------------------------------------------------------- -c - implicit none -c -c ****** Define the structure type for mapping local arrays -c ****** into global arrays. -c - type :: map_struct - integer :: n - integer :: i0 - integer :: i1 - integer :: offset - end type -c -c ****** Mapping structures for the different mesh types. -c - type(map_struct), dimension(:), pointer :: map_rh - type(map_struct), dimension(:), pointer :: map_rm - type(map_struct), dimension(:), pointer :: map_th - type(map_struct), dimension(:), pointer :: map_tm - type(map_struct), dimension(:), pointer :: map_ph - type(map_struct), dimension(:), pointer :: map_pm -c - end module -c####################################################################### - module meshdef -c -c----------------------------------------------------------------------- -c ****** Variables that define the mesh-point distributions. -c----------------------------------------------------------------------- -c - use number_types -c - implicit none -c - integer, parameter :: nmseg=30 -c - real(r_typ), dimension(nmseg) :: drratio=0. - real(r_typ), dimension(nmseg) :: dtratio=0. - real(r_typ), dimension(nmseg) :: dpratio=0. - real(r_typ), dimension(nmseg) :: rfrac=0. - real(r_typ), dimension(nmseg) :: tfrac=0. - real(r_typ), dimension(nmseg) :: pfrac=0. -c - integer :: nfrmesh=0 - integer :: nftmesh=0 - integer :: nfpmesh=0 -c - real(r_typ) :: phishift=0. -c - end module -c####################################################################### - module fields -c -c----------------------------------------------------------------------- -c ****** Local field arrays. -c----------------------------------------------------------------------- -c - use number_types -c - implicit none -c -c ****** Potential solution array and cg temp array. -c - real(r_typ), dimension(:,:,:), allocatable :: phi - real(r_typ), dimension(:,:,:), allocatable :: x_ax -c -c ****** Boundary radial magnetic field arrays. 
-c - real(r_typ), dimension(:,:), allocatable :: br0 - real(r_typ), dimension(:,:), allocatable :: br1 -c -c ****** Arrays used in polar boundary conditions. -c - real(r_typ), dimension(:), allocatable :: sum0,sum1 -c -c ****** Arrays used for final magnetic field. -c - real(r_typ), dimension(:,:,:), allocatable :: br,bt,bp -c - end module -c####################################################################### - module cgcom -c - use number_types -c - implicit none -c -c----------------------------------------------------------------------- -c ****** Number of equations to solve in the CG solver. -c----------------------------------------------------------------------- -c - integer :: ncgeq -c -c----------------------------------------------------------------------- -c ****** CG field solver parameters. -c----------------------------------------------------------------------- -c - integer :: ifprec=1 - integer :: ncgmax=1000000 - integer :: ncghist=100 - real(r_typ) :: epscg=1.e-9 -c -c----------------------------------------------------------------------- -c ****** CG field solver variables. -c----------------------------------------------------------------------- -c - integer :: ncg - real(r_typ) :: epsn -c - end module -c####################################################################### - module vars -c -c----------------------------------------------------------------------- -c ****** Miscellaneous input variables. -c----------------------------------------------------------------------- -c - use number_types -c - implicit none -c - character(256) :: outfile='pot3d.out' - character(256) :: phifile='default' - character(256) :: br0file='default' - character(256) :: brfile='default' - character(256) :: btfile='default' - character(256) :: bpfile='default' - character(256) :: br_photo_file='default' - character(256) :: br_photo_original_file='default' -c -c ****** Type of field solution. -c ****** Select between 'potential', 'open', and 'source-surface'. -c - character(16) :: option='potential' -c -c ****** Interval at which to dump diagonstics during the -c ****** iteration for the source-surface plus current-sheet -c ****** solution. -c - integer :: ndump=0 -c -c ****** Flag to skip the balancing of the flux (for PFSS and -c ****** OPEN field options only). - - logical :: do_not_balance_flux=.false. -c -c ****** Set format for output files. -c - character(3) :: fmt='h5' -c - logical :: hdf32=.true. -c -c ***** Validation run (tilted dipole). -c - logical :: validation_run=.false. -c - real(r_typ) :: dipole_angle=0.7853981633974483_r_typ -c - end module -c####################################################################### - module solve_params -c -c----------------------------------------------------------------------- -c ****** Parameters used in the solver. -c----------------------------------------------------------------------- -c - use number_types -c - implicit none -c -c ****** Boundary condition switch at r=R1. -c - real(r_typ) :: pm_r1 -c - end module -c####################################################################### - module timer -c -c----------------------------------------------------------------------- -c ****** Timer stack. -c----------------------------------------------------------------------- -c - use number_types -c - implicit none -c - integer, parameter :: nstack=10 - integer :: istack=0 - real(r_typ), dimension(nstack) :: tstart=0. 
-c - end module -c####################################################################### - module timing -c -c----------------------------------------------------------------------- -c ****** Timing variables. -c----------------------------------------------------------------------- -c - use number_types -c - implicit none -c - real(r_typ) :: t_startup=0. - real(r_typ) :: t_solve=0. - real(r_typ) :: t_pc_load=0. - real(r_typ) :: t_pc=0. - real(r_typ) :: t_ax=0. - real(r_typ) :: t_io=0. - real(r_typ) :: c_seam=0. - real(r_typ) :: c_cgdot=0. - real(r_typ) :: c_sumphi=0. - real(r_typ) :: t_wall=0. -c - end module -c####################################################################### - module debug -c -c----------------------------------------------------------------------- -c ****** Debugging level. -c----------------------------------------------------------------------- -c - implicit none -c - integer :: idebug=0 -c - end module -c####################################################################### - module assemble_array_interface - interface - subroutine assemble_array (map_r,map_t,map_p,a,a_g) - use number_types - use decomposition - use mpidefs - implicit none - type(map_struct), dimension(0:nproc-1) :: map_r,map_t,map_p - real(r_typ), dimension(:,:,:) :: a,a_g - end subroutine - end interface - end module -c####################################################################### - module cusparse_interface -c - use, intrinsic :: iso_c_binding -c -#ifdef CUSPARSE - interface - subroutine load_lusol_cusparse(CSR_A,CSR_AI,CSR_AJ,N,M) - & BIND(C, name="load_lusol_cusparse") - use, intrinsic :: iso_c_binding - implicit none - integer(C_INT), value :: N,M - type(C_PTR), value :: CSR_A,CSR_AI,CSR_AJ - end subroutine load_lusol_cusparse -c - subroutine lusol_cusparse(x) - & BIND(C, name="lusol_cusparse") - use, intrinsic :: iso_c_binding - implicit none - type(C_PTR), value :: x - end subroutine lusol_cusparse -c - subroutine unload_lusol_cusparse() - & BIND(C, name="unload_lusol_cusparse") - end subroutine unload_lusol_cusparse - end interface -#endif -c - integer(c_int) :: cN,cM -c - end module -c####################################################################### - module matrix_storage_pot3d_solve -c -c----------------------------------------------------------------------- -c ****** Storage for the matrix/preconditioners of the solve. 
-c----------------------------------------------------------------------- -c - use number_types - use number_types_pc -c - implicit none -c - real(r_typ), dimension(:,:,:,:), allocatable :: a - real(r_typ_pc), dimension(:), allocatable :: a_i -c - integer, dimension(7) :: a_offsets - - integer :: N,M - real(r_typ_pc), dimension(:), allocatable, target :: a_csr - real(r_typ_pc), dimension(:), allocatable :: lu_csr - real(r_typ_pc), dimension(:), allocatable :: a_csr_d - integer, dimension(:), allocatable :: lu_csr_ja - integer, dimension(:), allocatable, target :: a_csr_ia - integer, dimension(:), allocatable, target :: a_csr_ja - integer, dimension(:), allocatable :: a_N1 - integer, dimension(:), allocatable :: a_N2 - integer, dimension(:), allocatable :: a_csr_dptr -c - end module -c####################################################################### - program POT3D -c -c----------------------------------------------------------------------- -c - use ident - use mpidefs - use vars - use solve_params - use timing -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: ierr -c -c----------------------------------------------------------------------- -c -c ****** Initialize MPI. -c - call init_mpi -c -c ****** Start the wall-clock timer. -c - call timer_on -c -c ****** Write the code name and version. -c - if (iamp0) then - write (*,*) - write (*,*) idcode,' Version ',vers,', updated on ',update - end if -c - call timer_on -c -c ****** Read the input file. -c - call read_input_file -c -c ****** Create the output file. -c - if (iamp0) then - call ffopen (9,outfile,'rw',ierr) - if (ierr.ne.0) then - write (*,*) - write (*,*) '### ERROR in POT3D:' - write (*,*) '### Could not create the output file.' - write (*,*) 'File name: ',trim(outfile) - end if - end if - call check_error_on_p0 (ierr) -c -c ****** Check the input parameters. -c - call check_input -c -c ****** Check the processor topology. -c - call check_proc_topology -c -c ****** Decompose the domain. -c - call decompose_domain -c -c ****** Allocate global arrays. -c - call allocate_global_arrays -c -c ****** Set the global meshes. -c - call set_global_mesh -c -c ****** Decompose the mesh. -c - call decompose_mesh_r - call decompose_mesh_tp -c -c ****** Allocate local arrays. -c - call allocate_local_arrays_r - call allocate_local_arrays_tp -c -c ****** Set the local meshes. -c - call set_local_mesh_r - call set_local_mesh_tp -c -c ****** Print decomposition diagnostics. -c - call decomp_diags -c -c ****** Initialize the flux and if validating, write analytic solution. -c - if (validation_run) then - if (iamp0) then - write (*,*) - write (*,*) '### COMMENT from POT3D:' - write (*,*) '### Validation run requested.' - write (*,*) '### Ignoring br input file,' - write (*,*) '### setting HDF32=.FALSE.,' - write (*,*) '### and overriding output filenames.' - write (9,*) - write (9,*) '### COMMENT from POT3D:' - write (9,*) '### Validation run requested.' - write (9,*) '### Ignoring br input file' - write (9,*) '### setting HDF32=.FALSE.,' - write (9,*) '### and overriding output filenames.' - end if - hdf32=.false. - brfile='br_solved.'//trim(fmt) - btfile='bt_solved.'//trim(fmt) - bpfile='bp_solved.'//trim(fmt) - phifile='phi_solved.'//trim(fmt) - call set_validation_flux - call write_validation_solution - else - call set_flux - end if -c - call timer_off (t_startup) -c -c ****** Find the solution. 
-c - if (iamp0) then - write (*,*) - write (*,*) '### COMMENT from POT3D:' - write (*,*) '### Starting PCG solve.' - call FLUSH(OUTPUT_UNIT) - write (9,*) - write (9,*) '### COMMENT from POT3D:' - write (9,*) '### Starting PCG solve.' - call FLUSH(9) - end if -c - call timer_on -c - call potfld -c - call timer_off (t_solve) -c -c ****** Compute B. -c - call getb -c -c ****** Write solution to file. -c - call write_solution -c -c ****** Magnetic energy diagnostics. -c - call magnetic_energy -c - call MPI_Barrier(MPI_COMM_WORLD,ierr) - call timer_off (t_wall) -c - call write_timing -c - call endrun (.false.) -c - end -c####################################################################### - subroutine read_input_file -c -c----------------------------------------------------------------------- -c -c ****** Read the input file. -c -c----------------------------------------------------------------------- -c - use global_dims - use global_mesh - use mpidefs - use meshdef - use cgcom - use debug - use vars - use decomposition_params -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c -c ****** Values for the global mesh size. -c ****** Since these names conflict with those in LOCAL_DIMS*, it is -c ****** important not to use these modules here. -c - integer :: nr=0 - integer :: nt=0 - integer :: np=0 -c -c----------------------------------------------------------------------- -c - namelist /topology/ nprocs,nr,nt,np,gpn -c -c----------------------------------------------------------------------- -c - namelist /inputvars/ r0,r1,fmt, - & drratio,dtratio,dpratio, - & rfrac,tfrac,pfrac, - & nfrmesh,nftmesh,nfpmesh, - & phishift, - & ifprec,ncgmax,ncghist,epscg, - & idebug,br0file,phifile, - & brfile,btfile,bpfile,br_photo_file, - & br_photo_original_file, - & option, - & do_not_balance_flux,hdf32, - & validation_run,dipole_angle -c -c----------------------------------------------------------------------- -c - integer :: ierr - character(80) :: infile='pot3d.dat' -c -c----------------------------------------------------------------------- -c -c ****** Read the input file. -c - call ffopen (8,infile,'r',ierr) -c - if (ierr.ne.0) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in READ_INPUT_FILE:' - write (*,*) '### Could not open the input file.' - write (*,*) 'File name: ',trim(infile) - end if - call endrun (.true.) - end if -c - read (8,topology) -c - read (8,inputvars) -c - close (8) -c - if (trim(fmt).ne.'h5') then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in READ_INPUT_FILE:' - write (*,*) '### "fmt" must be "h5".' - write (*,*) 'fmt: ',trim(fmt) - end if - call endrun (.true.) - end if -c -c ****** Check if output names were overwritten. -c ****** If not, set default names with format fmt. 
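! [Editor's note] read_input_file above pulls two namelists, &topology
! and &inputvars, from pot3d.dat.  A hypothetical minimal input file
! showing the expected layout (every value here is illustrative, not a
! documented default; nprocs entries of -1 ask the code to autoset
! that dimension, and fmt must be 'h5'):
!
!   &topology
!     nprocs = -1, -1, -1
!     nr = 133
!     nt = 181
!     np = 361
!   /
!   &inputvars
!     r0 = 1.0
!     r1 = 2.5
!     fmt = 'h5'
!     option = 'potential'
!     br0file = 'br_input.h5'
!   /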
-c - if (trim(phifile).eq.'default') then - phifile='phi.'//trim(fmt) - end if - if (trim(br0file).eq.'default') then - br0file='br0.'//trim(fmt) - end if - if (trim(brfile).eq.'default') then - brfile='br.'//trim(fmt) - end if - if (trim(btfile).eq.'default') then - btfile='bt.'//trim(fmt) - end if - if (trim(bpfile).eq.'default') then - bpfile='bp.'//trim(fmt) - end if - if (trim(br_photo_file).eq.'default') then - br_photo_file='br_photo.'//trim(fmt) - end if - if (trim(br_photo_original_file).eq.'default') then - br_photo_original_file='br_photo_original.'//trim(fmt) - end if -c - nr_g=nr - nt_g=nt - np_g=np -c -c ****** Check if the specified mesh dimensions are valid. -c - call check_mesh_dimensions (nr_g,nt_g,np_g) -c - nrm1_g=nr_g-1 - ntm1_g=nt_g-1 - npm1_g=np_g-1 -c - end subroutine -c####################################################################### - subroutine check_error_on_p0 (ierr0) -c -c----------------------------------------------------------------------- -c -c ****** Check if the error flag IERR0 on processor 0 in -c ****** MPI_COMM_WORLD (i.e., processor IPROC0 in COMM_ALL) -c ****** indicates that the code should exit. -c -c ****** If IERR0 is non-zero, all the processors are directed -c ****** to call ENDRUN to terminate the code. -c -c----------------------------------------------------------------------- -c - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: ierr0 -c -c----------------------------------------------------------------------- -c -c ****** MPI error return. -c - integer :: ierr -c -c----------------------------------------------------------------------- -c -c ****** Broadcast IERR0 to all processors. -c - call MPI_Bcast (ierr0,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr) -c -c ****** Call ENDRUN if IERR0 is non-zero. -c - if (ierr0.ne.0) then - call endrun (.true.) - end if -c - return - end -c####################################################################### - subroutine endrun (ifstop) -c -c----------------------------------------------------------------------- -c -c ****** End the run and exit the code. -c -c----------------------------------------------------------------------- -c - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - logical :: ifstop -c -c----------------------------------------------------------------------- -c -c ****** MPI error return. -c - integer :: ierr -c -c----------------------------------------------------------------------- -c -c ****** Close the output file. -c - if (iamp0) then - close (9) - end if -c -c ****** Exit MPI gracefully. -c - call MPI_Finalize (ierr) -c -c ****** Call the STOP statement if requested. -c - if (ifstop) then - stop - end if -c - return - end -c####################################################################### - subroutine init_mpi -c -c----------------------------------------------------------------------- -c -c ****** Initialize MPI. -c -c----------------------------------------------------------------------- -c - use number_types - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c -c ****** MPI error return. 
-c - integer :: ierr,tcheck -c -c----------------------------------------------------------------------- -c -c ****** Real number to determine the KIND of REALs. -c - real(r_typ) :: def_real -c -c----------------------------------------------------------------------- -c - call MPI_Init_thread (MPI_THREAD_FUNNELED,tcheck,ierr) -c -c ****** Get the total number of processors. -c - call MPI_Comm_size (MPI_COMM_WORLD,nproc,ierr) -c -c ****** Get the index (rank) of the local processor in -c ****** communicator MPI_COMM_WORLD in variable IPROCW. -c - call MPI_Comm_rank (MPI_COMM_WORLD,iprocw,ierr) -c -c ****** Create a shared communicator for all ranks in the node. -c - call MPI_Comm_split_type (MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,0, - & MPI_INFO_NULL,comm_shared,ierr) -c -c ****** Get the total number of processors in node. -c - call MPI_Comm_size (comm_shared,nprocsh,ierr) -c -c ****** Get the index (rank) of the local processor in the local node. -c - call MPI_Comm_rank (comm_shared,iprocsh,ierr) -c -c ****** Set the flag to designate whether this processor -c ****** has rank 0 in communicator MPI_COMM_WORLD. -c - if (iprocw.eq.0) then - iamp0=.true. - else - iamp0=.false. - end if -c -c ****** Set the number type for communicating REAL -c ****** numbers in MPI calls. -c - if (kind(def_real).eq.KIND_REAL_4) then - ntype_real=MPI_REAL4 - else if (kind(def_real).eq.KIND_REAL_8) then - ntype_real=MPI_REAL8 - else - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in INIT_MPI:' - write (*,*) '### Unrecognized default REAL number KIND:' - write (*,*) 'KIND(default_real) = ',kind(def_real) - write (*,*) 'This is a fatal error.' - end if - call endrun (.true.) - end if -c - end subroutine -c####################################################################### - subroutine check_input -c -c----------------------------------------------------------------------- -c -c ****** Check the validity of the input parameters. -c -c----------------------------------------------------------------------- -c - use number_types - use vars - use solve_params - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: one=1._r_typ -c -c----------------------------------------------------------------------- -c -c ****** Check that OPTION is valid and set the boundary condition -c ****** switches accordingly. -c - if (option.eq.'potential') then -c -c ****** For a potential field, set d(phi)/dr to zero at r=R1 -c ****** (i.e., the field is tangential to the boundary). -c - pm_r1=one -c - else if (option.eq.'open') then -c -c ****** For an open field, set phi to zero at r=R1 -c ****** (i.e., the field is radial there). -c - pm_r1=-one -c - else if (option.eq.'ss') then -c -c ****** For a source surface field, set phi to zero at r=R1 -c ****** (i.e., the field is radial there). -c - pm_r1=-one -c - else - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in CHECK_INPUT:' - write (*,*) '### Invalid OPTION:' - write (*,*) - write (*,*) 'OPTION = ',trim(option) - write (*,*) - write (*,*) 'The options allowed are:' - write (*,*) '''potential''' - write (*,*) '''open''' - write (*,*) '''ss''' - end if - call endrun (.true.) 
- end if -c - if (iamp0) then - write (*,*) - write (*,*) '### COMMENT from CHECK_INPUT:' - write (*,*) '### Field solve type:' - write (*,*) - write (*,*) 'OPTION = ',option - write (9,*) - write (9,*) '### COMMENT from CHECK_INPUT:' - write (9,*) '### Field solve type:' - write (9,*) - write (9,*) 'OPTION = ',option - end if -c - return - end -c####################################################################### - subroutine set_proc_topology -c -c----------------------------------------------------------------------- -c -c ****** Set the optimal values of the MPI rank topology -c ****** in dimensions not set by user. -c -c----------------------------------------------------------------------- -c - use mpidefs - use decomposition_params - use number_types - use global_dims -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: one=1.0_r_typ - real(r_typ), parameter :: zero=0.0_r_typ - real(r_typ), parameter :: bigval=HUGE(1.0_r_typ) -c -c----------------------------------------------------------------------- -c - integer, dimension(:), allocatable :: factors - integer, dimension(:,:), allocatable :: rank_factors - real(r_typ), dimension(:,:), allocatable :: nperrank - real(r_typ), dimension(:), allocatable :: penalty -c - integer :: i,j,k,num_fac,num_rank_fac,best_idx - integer :: fr=1,ft=1,fp=1 - real(r_typ) :: a12,a13,a23 -c -c----------------------------------------------------------------------- -c -c ****** Extract nproc values. A value of -1 indicates the dimension -c ****** should be autoset. -c - nproc_r=nprocs(1) - nproc_t=nprocs(2) - nproc_p=nprocs(3) -c -c ****** If no dimensions are to be autoset, return. -c - if(nproc_r.ne.-1.and.nproc_t.ne.-1.and.nproc_p.ne.-1) return -c -c ****** Get all factors of nproc and store them in factors array. -c - i=1 - num_fac=0 - do while(i.le.nproc) - if (MOD(nproc,i).eq.0) then - num_fac=num_fac+1 - endif - i=i+1 - enddo - allocate (factors(num_fac)) - i=1 - num_fac=0 - do while(i.le.nproc) - if (MOD(nproc,i).eq.0) then - num_fac=num_fac+1 - factors(num_fac)=i - endif - i=i+1 - enddo -c -c ****** Set penalty function parameters and any fixed dimensions -c ****** based on which dimensions are to be autoset. -c - a12=one - a13=one - a23=one -c - if (nproc_r.ne.-1) then - fr=nproc_r - a12=zero - a13=zero - end if - if (nproc_t.ne.-1) then - ft=nproc_t - a12=zero - a23=zero - end if - if (nproc_p.ne.-1) then - fp=nproc_p - a13=zero - a23=zero - end if -c -c ****** Loop over all combinations of factors and save those that -c ****** yield the correct number of MPI ranks into rank_factors array. -c - num_rank_fac=0 - do k=1,num_fac - do j=1,num_fac - do i=1,num_fac - if(nproc_r.eq.-1) fr=factors(i) - if(nproc_t.eq.-1) ft=factors(j) - if(nproc_p.eq.-1) fp=factors(k) - if (fr*ft*fp.eq.nproc) then - num_rank_fac=num_rank_fac+1 - end if - enddo - enddo - enddo -c - if (num_rank_fac.eq.0) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in SET_PROC_TOPOLOGY:' - write (*,*) '### Processor topology specification error.' - write (*,*) 'No valid topologies found for selected options.' - write (*,*) 'Number of MPI ranks = ',nproc - write (*,*) 'NPROC_R = ',nproc_r - write (*,*) 'NPROC_T = ',nproc_t - write (*,*) 'NPROC_P = ',nproc_p - end if - call endrun (.true.) 
- end if -c - allocate(rank_factors(num_rank_fac,3)) - allocate(nperrank(num_rank_fac,3)) - allocate(penalty(num_rank_fac)) -c - rank_factors(:,:)=-1 - penalty(:)=bigval -c - num_rank_fac=0 - do k=1,num_fac - do j=1,num_fac - do i=1,num_fac - if(nproc_r.eq.-1) fr=factors(i) - if(nproc_t.eq.-1) ft=factors(j) - if(nproc_p.eq.-1) fp=factors(k) - if (fr*ft*fp.eq.nproc) then - num_rank_fac=num_rank_fac+1 - rank_factors(num_rank_fac,1)=fr - rank_factors(num_rank_fac,2)=ft - rank_factors(num_rank_fac,3)=fp - end if - enddo - enddo - enddo -c -c ****** Get number of grid points per rank for each dimension. -c - nperrank(:,1)=real(nr_g)/rank_factors(:,1) - nperrank(:,2)=real(nt_g)/rank_factors(:,2) - nperrank(:,3)=real(np_g)/rank_factors(:,3) -c -c ****** Compute penalty function. -c - penalty(:)=a12*(nperrank(:,1)-nperrank(:,2))**2 - & +a23*(nperrank(:,2)-nperrank(:,3))**2 - & +a13*(nperrank(:,3)-nperrank(:,1))**2 -c -c ****** Eliminate any choices that yield less than a minimum number -c ****** of grid points per rank. -c - do i=1,num_rank_fac - if (nperrank(i,1).lt.4) penalty(i)=bigval - if (nperrank(i,2).lt.4) penalty(i)=bigval - if (nperrank(i,3).lt.3) penalty(i)=bigval - enddo -c -c ****** Find optimal topology. -c - best_idx=MINLOC(penalty,1) -c - if (penalty(best_idx).eq.bigval) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in SET_PROC_TOPOLOGY:' - write (*,*) '### Processor topology specification error.' - write (*,*) 'No valid topologies found for selected options' - write (*,*) 'with selected grid. ' - write (*,*) 'It is likely you are using too many MPI ranks.' - write (*,*) 'Number of MPI ranks = ',nproc - write (*,*) 'NPROC_R = ',nproc_r - write (*,*) 'NPROC_T = ',nproc_t - write (*,*) 'NPROC_P = ',nproc_p - write (*,*) 'NR = ',nr_g - write (*,*) 'NT = ',nt_g - write (*,*) 'NP = ',np_g - end if - call endrun (.true.) - end if -c -c ****** Set optimal topology. -c - nprocs(1)=rank_factors(best_idx,1) - nprocs(2)=rank_factors(best_idx,2) - nprocs(3)=rank_factors(best_idx,3) -c - deallocate(factors) - deallocate(rank_factors) - deallocate(nperrank) - deallocate(penalty) -c - end subroutine -c####################################################################### - subroutine check_proc_topology -c -c----------------------------------------------------------------------- -c -c ****** Check the validity of the requested processor topology. -c -c----------------------------------------------------------------------- -c - use mpidefs - use decomposition_params -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: i,nreq -c -c----------------------------------------------------------------------- -c -c ****** Check the processor topology. -c - do i=1,3 - if (nprocs(i).lt.1.and.nprocs(i).ne.-1) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in CHECK_PROC_TOPOLOGY:' - write (*,*) '### Processor topology specification error.' - write (*,*) 'Invalid number of processors requested.' - write (*,*) 'Dimension = ',i - write (*,*) 'Number of processors requested = ', - & nprocs(i) - end if - call endrun (.true.) - end if - enddo -c -c ****** Set the optimal values of the topology if requested. -c - call set_proc_topology -c -c ****** Check that the number of processors available -c ****** matches the number requested. 
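! [Editor's note] A worked sketch of the topology auto-selection
! above: enumerate factor triples fr*ft*fp = nproc and keep the one
! minimizing the spread in grid points per rank (the same penalty
! form as set_proc_topology with all weights equal to one; the
! minimum-points-per-rank cutoffs are omitted here for brevity).
      program topology_sketch
        implicit none
        integer, parameter :: nproc = 8
        integer, parameter :: nr = 128, nt = 128, np = 256
        integer :: fr, ft, fp, bestr, bestt, bestp
        real(8) :: n1, n2, n3, pen, best
        best = huge(1d0)
        do fp = 1, nproc
          do ft = 1, nproc
            do fr = 1, nproc
              if (fr*ft*fp /= nproc) cycle
              n1 = real(nr,8)/fr
              n2 = real(nt,8)/ft
              n3 = real(np,8)/fp
              pen = (n1-n2)**2 + (n2-n3)**2 + (n3-n1)**2
              if (pen < best) then
                best = pen
                bestr = fr;  bestt = ft;  bestp = fp
              end if
            end do
          end do
        end do
        write (*,*) 'chosen topology (r,t,p):', bestr, bestt, bestp
      end program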
-c
- nreq=nprocs(1)*nprocs(2)*nprocs(3)
-c
- if (nreq.ne.nproc) then
- if (iamp0) then
- write (*,*)
- write (*,*) '### ERROR in CHECK_PROC_TOPOLOGY:'
- write (*,*) '### Processor topology specification error.'
- write (*,*) 'The number of processors requested does not'//
- & ' equal the number available.'
- write (*,*) 'Number of processors requested = ',nreq
- write (*,*) 'Number of processors available = ',nproc
- end if
- call endrun (.true.)
- end if
-c
- return
- end
-c#######################################################################
- subroutine decompose_domain
-c
-c-----------------------------------------------------------------------
-c
-c ****** Decompose the domain into a Cartesian MPI topology.
-c
-c-----------------------------------------------------------------------
-c
- use mpidefs
- use decomposition_params
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- integer :: ierr
-c
-c-----------------------------------------------------------------------
-c
- integer, parameter :: ndim=3
- integer, dimension(ndim) :: coords
- logical, dimension(ndim) :: periodic
- logical :: reorder
- logical, dimension(ndim) :: keep_dim
-c
-c-----------------------------------------------------------------------
-c
-c ****** Create a communicator over all processors, COMM_ALL,
-c ****** that has a Cartesian topology.
-c
-c ****** Specify the periodicity of the coordinate system.
-c
- periodic(1)=.false.
- periodic(2)=.false.
- periodic(3)=.true.
-c
-c ****** Allow re-ordering in the Cartesian topology.
-c
- reorder=.true.
-c
- call MPI_Cart_create (MPI_COMM_WORLD,ndim,nprocs,
- & periodic,reorder,comm_all,ierr)
-c
-c ****** Get the index (rank) of the local processor in
-c ****** communicator COMM_ALL in variable IPROC.
-c
-c ****** IMPORTANT NOTE:
-c ****** If re-ordering was allowed in the Cartesian topology
-c ****** creation (above), then the rank of the local processor
-c ****** in communicator COMM_ALL may be different from its rank
-c ****** in communicator MPI_COMM_WORLD.
-c
- call MPI_Comm_rank (comm_all,iproc,ierr)
-c
-c ****** Set the processor rank IPROC0 in communicator COMM_ALL
-c ****** for the processor that has rank 0 in MPI_COMM_WORLD.
-c ****** This value is broadcast to all the processors.
-c
- if (iamp0) then
- iproc0=iproc
- end if
- call MPI_Bcast (iproc0,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)
-c
-c ****** Get the coordinate indices of this processor in the
-c ****** Cartesian MPI topology.
-c
- call MPI_Cart_coords (comm_all,iproc,ndim,coords,ierr)
-c
- iproc_r=coords(1)
- iproc_t=coords(2)
- iproc_p=coords(3)
-c
- nproc_r=nprocs(1)
- nproc_t=nprocs(2)
- nproc_p=nprocs(3)
-c
-c ****** Get the rank of the neighboring processors in the
-c ****** Cartesian MPI topology.
-c
- call MPI_Cart_shift (comm_all,0,1,iproc_rm,iproc_rp,ierr)
- call MPI_Cart_shift (comm_all,1,1,iproc_tm,iproc_tp,ierr)
- call MPI_Cart_shift (comm_all,2,1,iproc_pm,iproc_pp,ierr)
-c
-c ****** Create communicators for operations involving all
-c ****** processors in the phi dimension. These communicators
-c ****** are stored in COMM_PHI (and generally represent different
-c ****** communicators on different processors).
-c
- keep_dim(1)=.false.
- keep_dim(2)=.false.
- keep_dim(3)=.true.
-c
- call MPI_Cart_sub (comm_all,keep_dim,comm_phi,ierr)
-c
-c ****** Create communicators for operations involving
-c ****** all processors in the r dimension.
-c ****** These communicators are stored in COMM_R -c ****** (and generally represent different communicators on -c ****** different processors). -c - keep_dim(1)=.true. - keep_dim(2)=.false. - keep_dim(3)=.false. -c - call MPI_Cart_sub (comm_all,keep_dim,comm_r,ierr) -c - return - end -c####################################################################### - subroutine decompose_mesh_r -c -c----------------------------------------------------------------------- -c -c ****** Decompose the r mesh between processors. -c -c----------------------------------------------------------------------- -c - use global_dims - use local_dims_r - use decomposition - use solve_params - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: ierr,i,npts - integer :: i0_h,i1_h,i0_m,i1_m - integer, dimension(nproc_r) :: mp_r -c -c----------------------------------------------------------------------- -c -c ****** Decompose the r dimension. -c - npts=nr_g -c - call decompose_dimension (npts,nproc_r,mp_r,ierr) - if (ierr.ne.0) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in DECOMPOSE_MESH:' - write (*,*) '### Anomaly in decomposing the mesh'// - & ' between processors.' - write (*,*) '### Could not decompose the r mesh.' - write (*,*) 'Number of mesh points in r = ',npts - write (*,*) 'Number of processors along r = ',nproc_r - end if - call endrun (.true.) - end if -c -c ****** Check that the resulting mesh topology is valid. -c - call check_mesh_topology (nproc_r,mp_r,1,'r') -c -c ****** Compute the mapping between the processor decomposition -c ****** and the global mesh. -c -c ****** Note that there is a two-point overlap in the mesh -c ****** between adjacent processors in r. -c - i0_g=1 - do i=1,iproc_r - i0_g=i0_g+mp_r(i) - enddo - nr=mp_r(iproc_r+1)+2 - i1_g=i0_g+nr-1 -c - nrm1=nr-1 -c -c ****** Set the flags to indicate whether this processor has -c ****** points on the physical boundaries. -c - if (iproc_r.eq.0) then - rb0=.true. - else - rb0=.false. - end if -c - if (iproc_r.eq.nproc_r-1) then - rb1=.true. - else - rb1=.false. - end if -c -c ****** Set the dimensions of arrays for the "main" meshes -c ****** (i.e., the "m" mesh) for which normal derivatives are -c ****** needed (e.g., v). These vary on different processors, -c ****** depending if they are left-boundary, internal, or -c ****** right-boundary processors. -c - if (rb1) then - nrm=nrm1 - else - nrm=nr - end if -c -c ****** Store the mapping structure (for this processor). -c - allocate (map_rh(0:nproc-1)) - allocate (map_rm(0:nproc-1)) -c - if (rb0) then - i0_h=1 - else - i0_h=2 - end if - if (rb1) then - i1_h=nr - else - i1_h=nrm1 - end if -c - if (rb0) then - i0_m=1 - else - i0_m=2 - end if - i1_m=nrm1 -c - map_rh(iproc)%i0=i0_h - map_rh(iproc)%i1=i1_h -c - map_rm(iproc)%i0=i0_m - map_rm(iproc)%i1=i1_m -c - map_rh(iproc)%offset=i0_g+map_rh(iproc)%i0-1 - map_rh(iproc)%n=map_rh(iproc)%i1-map_rh(iproc)%i0+1 -c - map_rm(iproc)%offset=i0_g+map_rm(iproc)%i0-1 - map_rm(iproc)%n=map_rm(iproc)%i1-map_rm(iproc)%i0+1 -c -c ****** Gather the mapping information by communicating among -c ****** all processors. 
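! [Editor's note] A worked sketch of the two-point-overlap bookkeeping
! above: each rank's local r block is its share of interior points
! plus one overlap point on each side, so adjacent ranks share two
! global indices.  The interior counts mp are illustrative.
      program overlap_sketch
        implicit none
        integer, parameter :: nproc_r = 3
        integer :: mp(nproc_r) = [4, 3, 3]  ! interior points per rank
        integer :: irank, i, i0_g, nr_loc
        do irank = 0, nproc_r-1
          i0_g = 1
          do i = 1, irank
            i0_g = i0_g + mp(i)
          end do
          nr_loc = mp(irank+1) + 2          ! add the 2-point overlap
          write (*,'(a,i2,a,i3,a,i3)') ' rank', irank, &
            ': global indices', i0_g, '  to', i0_g + nr_loc - 1
        end do
      end program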
-c - call gather_mapping_info (map_rh) - call gather_mapping_info (map_rm) -c - return - end -c####################################################################### - subroutine decompose_mesh_tp -c -c----------------------------------------------------------------------- -c -c ****** Decompose the theta and phi mesh between processors. -c -c----------------------------------------------------------------------- -c - use global_dims - use local_dims_tp - use decomposition - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: ierr,j,k - integer :: j0_h,j1_h,j0_m,j1_m - integer :: k0_h,k1_h,k0_m,k1_m - integer, dimension(nproc_t) :: mp_t - integer, dimension(nproc_p) :: mp_p -c -c----------------------------------------------------------------------- -c -c ****** Decompose the t dimension. -c - call decompose_dimension (nt_g,nproc_t,mp_t,ierr) - if (ierr.ne.0) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in DECOMPOSE_MESH_TP:' - write (*,*) '### Anomaly in decomposing the mesh'// - & ' between processors.' - write (*,*) '### Could not decompose the theta mesh.' - write (*,*) 'Number of mesh points in theta = ',nt_g - write (*,*) 'Number of processors along theta = ',nproc_t - end if - call endrun (.true.) - end if -c -c ****** Decompose the p dimension. -c - call decompose_dimension (np_g,nproc_p,mp_p,ierr) - if (ierr.ne.0) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in DECOMPOSE_MESH_TP:' - write (*,*) '### Anomaly in decomposing the mesh'// - & ' between processors.' - write (*,*) '### Could not decompose the phi mesh.' - write (*,*) 'Number of mesh points in phi = ',np_g - write (*,*) 'Number of processors along phi = ',nproc_p - end if - call endrun (.true.) - end if -c -c ****** Check that the resulting mesh topology is valid. -c - call check_mesh_topology (nproc_t,mp_t,1,'theta') - call check_mesh_topology (nproc_p,mp_p,1,'phi') -c -c ****** Set the flag for an axisymmetric run (requested by -c ****** setting NP_G = 3). -c - if (np_g.eq.3) then - axisymmetric=.true. - else - axisymmetric=.false. - end if -c -c ****** Compute the mapping between the processor decomposition -c ****** and the global mesh. -c -c ****** Note that there is a two-point overlap in the mesh -c ****** between adjacent processors in theta and phi. -c - j0_g=1 - do j=1,iproc_t - j0_g=j0_g+mp_t(j) - enddo - nt=mp_t(iproc_t+1)+2 - j1_g=j0_g+nt-1 -c - k0_g=1 - do k=1,iproc_p - k0_g=k0_g+mp_p(k) - enddo - np=mp_p(iproc_p+1)+2 - k1_g=k0_g+np-1 -c - ntm1=nt-1 - npm1=np-1 -c -c ****** Set the flags to indicate whether this processor has -c ****** points on the physical boundaries. -c - if (iproc_t.eq.0) then - tb0=.true. - else - tb0=.false. - end if -c - if (iproc_t.eq.nproc_t-1) then - tb1=.true. - else - tb1=.false. - end if -c -c ****** Set the dimensions of arrays for the "main" meshes -c ****** (i.e., the "m" mesh) for which normal derivatives are -c ****** needed (e.g., v). These vary on different processors, -c ****** depending if they are left-boundary, internal, or -c ****** right-boundary processors. -c - if (tb1) then - ntm=ntm1 - else - ntm=nt - end if -c -c ****** Since the phi dimension is periodic, all processors -c ****** have the same mesh limits. -c - npm=np -c -c ****** Store the mapping structure (for this processor). 
-c - allocate (map_th(0:nproc-1)) - allocate (map_tm(0:nproc-1)) - allocate (map_ph(0:nproc-1)) - allocate (map_pm(0:nproc-1)) -c - if (tb0) then - j0_h=1 - else - j0_h=2 - end if - if (tb1) then - j1_h=nt - else - j1_h=ntm1 - end if -c - if (tb0) then - j0_m=1 - else - j0_m=2 - end if - j1_m=ntm1 -c - if (iproc_p.eq.0) then - k0_m=1 - else - k0_m=2 - end if - k1_m=npm1 -c - if (iproc_p.eq.0) then - k0_h=1 - else - k0_h=2 - end if - if (iproc_p.eq.nproc_p-1) then - k1_h=np - else - k1_h=npm1 - end if -c - map_th(iproc)%i0=j0_h - map_th(iproc)%i1=j1_h -c - map_tm(iproc)%i0=j0_m - map_tm(iproc)%i1=j1_m -c - map_ph(iproc)%i0=k0_h - map_ph(iproc)%i1=k1_h -c - map_pm(iproc)%i0=k0_m - map_pm(iproc)%i1=k1_m -c - map_th(iproc)%offset=j0_g+map_th(iproc)%i0-1 - map_th(iproc)%n=map_th(iproc)%i1-map_th(iproc)%i0+1 -c - map_tm(iproc)%offset=j0_g+map_tm(iproc)%i0-1 - map_tm(iproc)%n=map_tm(iproc)%i1-map_tm(iproc)%i0+1 -c - map_ph(iproc)%offset=k0_g+map_ph(iproc)%i0-1 - map_ph(iproc)%n=map_ph(iproc)%i1-map_ph(iproc)%i0+1 -c - map_pm(iproc)%offset=k0_g+map_pm(iproc)%i0-1 - map_pm(iproc)%n=map_pm(iproc)%i1-map_pm(iproc)%i0+1 -c -c ****** Gather the mapping information by communicating among -c ****** all processors. -c - call gather_mapping_info (map_th) - call gather_mapping_info (map_tm) - call gather_mapping_info (map_ph) - call gather_mapping_info (map_pm) -c - return - end -c####################################################################### - subroutine check_mesh_dimensions (nr_g,nt_g,np_g) -c -c----------------------------------------------------------------------- -c -c ****** Check that the requested (global) mesh dimensions are -c ****** valid. -c -c----------------------------------------------------------------------- -c - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: nr_g,nt_g,np_g -c -c----------------------------------------------------------------------- -c - if (nr_g.lt.4) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in CHECK_MESH_DIMENSIONS:' - write (*,*) '### Invalid number of r mesh points'// - & ' requested.' - write (*,*) '### The minimum number of mesh points is 4.' - write (*,*) - write (*,*) 'Number of mesh points requested = ',nr_g - end if - call endrun (.true.) - end if -c - if (nt_g.lt.4) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in CHECK_MESH_DIMENSIONS:' - write (*,*) '### Invalid number of theta mesh points'// - & ' requested.' - write (*,*) '### The minimum number of mesh points is 4.' - write (*,*) - write (*,*) 'Number of mesh points requested = ',nt_g - end if - call endrun (.true.) - end if -c - if (np_g.lt.3) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in CHECK_MESH_DIMENSIONS:' - write (*,*) '### Invalid number of phi mesh points'// - & ' requested.' - write (*,*) '### The minimum number of mesh points is 3.' - write (*,*) - write (*,*) 'Number of mesh points requested = ',np_g - end if - call endrun (.true.) - end if -c - return - end -c####################################################################### - subroutine decompose_dimension (nx,np,mp,ierr) -c -c----------------------------------------------------------------------- -c -c ****** Decompose the mesh points NX along NP processors. -c -c ****** The decomposed mesh points are returned in array MP. 
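! [Editor's note] decompose_dimension below assigns the nx-2 interior
! points as evenly as possible: every rank gets floor((nx-2)/np)
! points and the first mod(nx-2,np) ranks get one extra.  A tiny
! standalone check of that rule with illustrative sizes:
      program split_sketch
        implicit none
        integer, parameter :: nx = 103, np = 4
        integer :: mp(np), mpav, nrem
        mpav = (nx-2)/np               ! 101/4 = 25
        mp = mpav
        nrem = (nx-2) - mpav*np        ! remainder of 1
        mp(1:nrem) = mp(1:nrem) + 1    ! mp = [26,25,25,25]
        write (*,*) 'mp =', mp, ' sum =', sum(mp)
      end program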
-c -c----------------------------------------------------------------------- -c -c ****** This routine attempts to assign the mesh points as equally -c ****** as possible between the processors. -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: nx - integer :: np - integer, dimension(np) :: mp - integer :: ierr -c -c----------------------------------------------------------------------- -c - integer :: nxm2,mpav,nrem -c -c----------------------------------------------------------------------- -c - ierr=0 -c - nxm2=nx-2 -c - if (nxm2.le.0) then - ierr=1 - return - end if -c - if (np.le.0) then - ierr=2 - return - end if -c - mpav=nxm2/np -c - mp(:)=mpav -c - nrem=nxm2-mpav*np -c - mp(1:nrem)=mp(1:nrem)+1 -c - return - end -c####################################################################### - subroutine check_mesh_topology (np,mp,min_pts,coord) -c -c----------------------------------------------------------------------- -c -c ****** Check the validity of the requested mesh topology. -c -c----------------------------------------------------------------------- -c - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: np - integer, dimension(np) :: mp - integer :: min_pts - character(*) :: coord -c -c----------------------------------------------------------------------- -c - integer :: i -c -c----------------------------------------------------------------------- -c -c ****** Check that the number of mesh points on each processor -c ****** is valid. -c - do i=1,np - if (mp(i).lt.min_pts) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in CHECK_MESH_TOPOLOGY:' - write (*,*) '### Mesh topology specification error.' - write (*,*) 'Invalid number of ',coord, - & ' mesh points requested.' - write (*,*) 'Processor index = ',i - write (*,*) 'Number of mesh points requested = ',mp(i) - write (*,*) 'Minimum number of mesh points allowed = ', - & min_pts - end if - call endrun (.true.) - end if - enddo -c - return - end -c####################################################################### - subroutine gather_mapping_info (map) -c -c----------------------------------------------------------------------- -c -c ****** Gather a mapping information array by communicating -c ****** among all processors. -c -c----------------------------------------------------------------------- -c - use mpidefs - use decomposition -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - type(map_struct), dimension(0:nproc-1) :: map -c -c----------------------------------------------------------------------- -c -c ****** Buffers for packing the data. -c - integer, parameter :: lbuf=4 - integer, dimension(lbuf) :: sbuf - integer, dimension(lbuf,0:nproc-1) :: rbuf -c -c----------------------------------------------------------------------- -c - integer :: ierr,irank -c -c----------------------------------------------------------------------- -c -c ****** Put the local section of the mapping information -c ****** array into a buffer. -c - sbuf(1)=map(iproc)%n - sbuf(2)=map(iproc)%i0 - sbuf(3)=map(iproc)%i1 - sbuf(4)=map(iproc)%offset -c -c ****** Communicate among all processors. 
-c ****** After this call, all
-c ****** processors have the complete mapping information.
-c
- call MPI_Allgather (sbuf,lbuf,MPI_INTEGER,
- & rbuf,lbuf,MPI_INTEGER,comm_all,ierr)
-c
-c ****** Extract the mapping information from the buffer.
-c
- do irank=0,nproc-1
- map(irank)%n =rbuf(1,irank)
- map(irank)%i0 =rbuf(2,irank)
- map(irank)%i1 =rbuf(3,irank)
- map(irank)%offset=rbuf(4,irank)
- enddo
-c
- return
- end
-c#######################################################################
- subroutine decomp_diags
-c
-c-----------------------------------------------------------------------
-c
-c ****** Print diagnostics about the mesh decomposition.
-c
-c-----------------------------------------------------------------------
-c
- use global_dims
- use global_mesh
- use local_dims_r
- use local_mesh_r
- use local_dims_tp
- use local_mesh_tp
- use mpidefs
- use solve_params
- use debug
- use decomposition
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- integer :: ierr
- integer :: irank
- real(r_typ) :: n_per_grid_min,n_per_grid_max
-c
-c-----------------------------------------------------------------------
-c
- if (iamp0) then
-c
- n_per_grid_min=floor(real(nr_g)/nproc_r)
- & *floor(real(nt_g)/nproc_t)
- & *floor(real(np_g)/nproc_p)
-c
- n_per_grid_max=ceiling(real(nr_g)/nproc_r)
- & *ceiling(real(nt_g)/nproc_t)
- & *ceiling(real(np_g)/nproc_p)
-c
- write (*,*)
- write (*,*) 'Total number of MPI ranks = ',nproc
- write (*,*)
- write (*,*) 'Number of MPI ranks in r = ',nproc_r
- write (*,*) 'Number of MPI ranks in t = ',nproc_t
- write (*,*) 'Number of MPI ranks in p = ',nproc_p
- write (*,*)
- write (*,*) 'Global mesh dimension in r = ',nr_g
- write (*,*) 'Global mesh dimension in t = ',nt_g
- write (*,*) 'Global mesh dimension in p = ',np_g
- write (*,*)
- write (*,'(A,F6.1)') ' Average # of r mesh pts per rank = ',
- & real(nr_g)/nproc_r
- write (*,'(A,F6.1)') ' Average # of t mesh pts per rank = ',
- & real(nt_g)/nproc_t
- write (*,'(A,F6.1)') ' Average # of p mesh pts per rank = ',
- & real(np_g)/nproc_p
- write (*,*)
- write (*,'(A,F6.2,A)') ' Estimated load imbalance = ',
- & 100.0*(1.0-real(n_per_grid_min)/real(n_per_grid_max)),' %'
-c
- write (9,*)
- write (9,*) 'Total number of MPI ranks = ',nproc
- write (9,*)
- write (9,*) 'Number of MPI ranks in r = ',nproc_r
- write (9,*) 'Number of MPI ranks in t = ',nproc_t
- write (9,*) 'Number of MPI ranks in p = ',nproc_p
- write (9,*)
- write (9,*) 'Global mesh dimension in r = ',nr_g
- write (9,*) 'Global mesh dimension in t = ',nt_g
- write (9,*) 'Global mesh dimension in p = ',np_g
- write (9,*)
- write (9,'(A,F6.1)') ' Average # of r mesh pts per rank = ',
- & real(nr_g)/nproc_r
- write (9,'(A,F6.1)') ' Average # of t mesh pts per rank = ',
- & real(nt_g)/nproc_t
- write (9,'(A,F6.1)') ' Average # of p mesh pts per rank = ',
- & real(np_g)/nproc_p
- write (9,*)
- write (9,'(A,F6.2,A)') ' Estimated load imbalance = ',
- & 100.0*(1.0-real(n_per_grid_min)/real(n_per_grid_max)),' %'
-c
- end if
-c
- if (idebug.le.1) return
-c
- do irank=0,nproc-1
- call MPI_Barrier (comm_all,ierr)
- if (irank.eq.iproc) then
- write (*,*)
- write (*,*)
- write (*,*) 'Rank id = ',iproc
- write (*,*) 'nr = ',nr
- write (*,*) 'nt = ',nt
- write (*,*) 'np = ',np
- write (*,*) 'i0_g = ',i0_g
- write (*,*) 'i1_g = ',i1_g
- write (*,*) 'j0_g = ',j0_g
- write (*,*) 'j1_g = ',j1_g
- write (*,*) 'k0_g = ',k0_g
- write (*,*) 'k1_g = ',k1_g
- write (*,*) 'Rank index in r = ',iproc_r
- write (*,*) 'Rank index in t = ',iproc_t
- write (*,*) 'Rank index in p = ',iproc_p
- write (*,*) 'Rank to left in r = ',iproc_rm
- write (*,*) 'Rank to right in r = ',iproc_rp
- write (*,*) 'Rank to left in t = ',iproc_tm
- write (*,*) 'Rank to right in t = ',iproc_tp
- write (*,*) 'Rank to left in p = ',iproc_pm
- write (*,*) 'Rank to right in p = ',iproc_pp
- write (*,*)
- write (*,*) 'Rank in MPI_COMM_WORLD = ',iprocw
- write (*,*) 'Rank in COMM_ALL = ',iproc
- if (idebug.gt.2) then
- write (*,*)
- write (*,*) 'r mesh:'
- write (*,*) r
- write (*,*)
- write (*,*) 'theta mesh:'
- write (*,*) t
- write (*,*)
- write (*,*) 'phi mesh:'
- write (*,*) p
- end if
- end if
- enddo
-c
- return
- end
-c#######################################################################
- subroutine allocate_global_arrays
-c
-c-----------------------------------------------------------------------
-c
-c ****** Allocate global arrays.
-c
-c-----------------------------------------------------------------------
-c
- use global_dims
- use global_mesh
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
-c ****** Allocate global mesh arrays.
-c
- allocate (r_g (nrm1_g))
- allocate (dr_g(nrm1_g))
-c
- allocate (rh_g (nr_g))
- allocate (drh_g(nr_g))
-c
- allocate (t_g (ntm1_g))
- allocate (dt_g(ntm1_g))
-c
- allocate (th_g (nt_g))
- allocate (dth_g(nt_g))
-c
- allocate (p_g (np_g))
- allocate (dp_g (np_g))
- allocate (ph_g (np_g))
- allocate (dph_g(np_g))
-c
- allocate (st_g(ntm1_g))
- allocate (ct_g(ntm1_g))
-c
- allocate (sth_g(nt_g))
- allocate (cth_g(nt_g))
-c
- allocate (sp_g (np_g))
- allocate (cp_g (np_g))
- allocate (sph_g(np_g))
- allocate (cph_g(np_g))
-c
- return
- end
-c#######################################################################
- subroutine allocate_local_arrays_r
-c
-c-----------------------------------------------------------------------
-c
-c ****** Allocate local arrays for the r dimension and 3D arrays.
-c
-c-----------------------------------------------------------------------
-c
- use local_dims_r
- use local_mesh_r
- use local_dims_tp
- use fields
-c
-c-----------------------------------------------------------------------
-c
- implicit none
-c
-c-----------------------------------------------------------------------
-c
- allocate (r (nrm))
- allocate (r2 (nrm))
- allocate (dr(nrm))
-c
- allocate (rh (nr))
- allocate (drh(nr))
-c
-c ****** Allocate inverse quantities.
-c
- allocate (r_i (nrm))
- allocate (dr_i(nrm))
-c
- allocate (rh_i (nr))
- allocate (drh_i(nr))
-c
-c ****** Allocate the potential array and cg scratch array.
-c
- allocate (phi(nr,nt,np))
- allocate (x_ax(nr,nt,np))
- phi(:,:,:)=0.
- x_ax(:,:,:)=0.
-c
-c ****** Allocate polar boundary arrays.
-c
- allocate (sum0(nr))
- allocate (sum1(nr))
- sum0(:)=0.
- sum1(:)=0.
-c
-c ****** Allocate the local magnetic field arrays.
-c
- allocate (br(nrm,nt,np))
- allocate (bt(nr,ntm,np))
- allocate (bp(nr,nt,npm))
- br(:,:,:)=0.
- bt(:,:,:)=0.
- bp(:,:,:)=0.
-c
- end subroutine
-c#######################################################################
- subroutine allocate_local_arrays_tp
-c
-c-----------------------------------------------------------------------
-c
-c ****** Allocate local arrays for the theta and phi dimensions.
-c -c----------------------------------------------------------------------- -c - use local_dims_tp - use local_mesh_tp - use fields - use vars - use solve_params -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - allocate (t (ntm)) - allocate (dt(ntm)) -c - allocate (th (nt)) - allocate (dth(nt)) -c - allocate (p (np)) - allocate (dp(np)) -c - allocate (ph (np)) - allocate (dph(np)) -c - allocate (st(ntm)) - allocate (ct(ntm)) -c - allocate (sth(nt)) - allocate (cth(nt)) -c - allocate (sp (np)) - allocate (cp (np)) - allocate (sph(np)) - allocate (cph(np)) -c -c ****** Allocate inverse quantities. -c - allocate (dt_i(ntm)) - allocate (st_i(ntm)) -c - allocate (dth_i(nt)) - allocate (sth_i(nt)) -c - allocate (dp_i (np)) - allocate (dph_i(np)) -c -c ****** Allocate the boundary radial magnetic field array(s). -c - allocate (br0(nt,np)) - br0(:,:)=0. -c - if (validation_run) then - allocate (br1(nt,np)) - br1(:,:)=0. - end if -c - return - end -c####################################################################### - subroutine set_global_mesh -c -c----------------------------------------------------------------------- -c -c ****** Define the global mesh arrays. -c -c----------------------------------------------------------------------- -c - use number_types - use global_dims - use global_mesh - use meshdef - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: zero=0._r_typ - real(r_typ), parameter :: one=1._r_typ - real(r_typ), parameter :: half=.5_r_typ -c -c----------------------------------------------------------------------- -c - integer :: i,j,k -c -c----------------------------------------------------------------------- -c -c ****** Define the radial mesh. -c - call genmesh (9,'r',nrm1_g,r0,r1,nmseg,rfrac,drratio, - & nfrmesh,.false.,zero,r_g) -c - do i=2,nrm1_g - rh_g(i)=.5*(r_g(i)+r_g(i-1)) - drh_g(i)=r_g(i)-r_g(i-1) - enddo - rh_g(1)=rh_g(2)-drh_g(2) - rh_g(nr_g)=rh_g(nrm1_g)+drh_g(nrm1_g) - drh_g(1)=drh_g(2) - drh_g(nr_g)=drh_g(nrm1_g) -c - do i=1,nrm1_g - dr_g(i)=rh_g(i+1)-rh_g(i) - enddo -c -c ****** Define the theta mesh. -c - call genmesh (9,'t',ntm1_g,t0,t1,nmseg,tfrac,dtratio, - & nftmesh,.false.,zero,t_g) -c - do j=2,ntm1_g - th_g(j)=.5*(t_g(j)+t_g(j-1)) - dth_g(j)=t_g(j)-t_g(j-1) - enddo - th_g(1)=th_g(2)-dth_g(2) - th_g(nt_g)=th_g(ntm1_g)+dth_g(ntm1_g) - dth_g(1)=dth_g(2) - dth_g(nt_g)=dth_g(ntm1_g) -c - do j=1,ntm1_g - dt_g(j)=th_g(j+1)-th_g(j) - enddo -c -c ****** Define the periodic phi mesh. -c - call genmesh (9,'p',npm1_g,p0,p1,nmseg,pfrac,dpratio, - & nfpmesh,.true.,phishift,p_g) - p_g(np_g)=p_g(2)+pl -c - do k=2,np_g - ph_g(k)=half*(p_g(k)+p_g(k-1)) - dph_g(k)=p_g(k)-p_g(k-1) - enddo - ph_g(1)=ph_g(npm1_g)-pl - dph_g(1)=dph_g(npm1_g) -c - do k=1,npm1_g - dp_g(k)=ph_g(k+1)-ph_g(k) - enddo - dp_g(np_g)=dp_g(2) -c -c ****** Enforce exact periodicity to protect symmetry properties -c ****** from round-off errors (especially for axisymmetric cases). -c - dph_g(np_g)=dph_g(2) - dp_g(1)=dp_g(npm1_g) -c -c ****** Define global auxiliary mesh-related arrays. 
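! [Editor's note] The half-mesh construction in set_global_mesh above
! staggers rh between the main points r, with one linearly
! extrapolated ghost value at each end (the same rule is used for
! theta).  A standalone sketch with an illustrative 5-point main mesh:
      program halfmesh_sketch
        implicit none
        integer, parameter :: n = 6     ! half-mesh size (nrm1+1)
        real(8) :: r(n-1) = [1.0d0, 1.2d0, 1.5d0, 1.9d0, 2.4d0]
        real(8) :: rh(n)
        integer :: i
        do i = 2, n-1
          rh(i) = 0.5d0*(r(i) + r(i-1))       ! midpoints
        end do
        rh(1) = rh(2) - (r(2) - r(1))         ! ghost below r(1)
        rh(n) = rh(n-1) + (r(n-1) - r(n-2))   ! ghost above r(n-1)
        write (*,'(6f8.3)') rh                ! 0.900 1.100 ... 2.650
      end program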
-c - st_g(:)=sin(t_g(:)) - ct_g(:)=cos(t_g(:)) - sth_g(:)=sin(th_g(:)) - cth_g(:)=cos(th_g(:)) -c - sp_g(:)=sin(p_g(:)) - cp_g(:)=cos(p_g(:)) - sph_g(:)=sin(ph_g(:)) - cph_g(:)=cos(ph_g(:)) -c -c ****** For an axisymmetric case, set the exact values of -c ****** sin(phi) and cos(phi) to preserve symmetry properties -c ****** in the presence of round-off errors. -c - if (axisymmetric) then - sp_g(2)=0. - cp_g(2)=one - sph_g(2)=0. - cph_g(2)=-one - end if -c -c ****** Enforce exact periodicity to protect symmetry properties -c ****** from round-off errors (especially for axisymmetric cases). -c - sph_g(1)=sph_g(npm1_g) - sph_g(np_g)=sph_g(2) - cph_g(1)=cph_g(npm1_g) - cph_g(np_g)=cph_g(2) - sp_g(1)=sp_g(npm1_g) - sp_g(np_g)=sp_g(2) - cp_g(1)=cp_g(npm1_g) - cp_g(np_g)=cp_g(2) -c - return - end -c####################################################################### - subroutine set_local_mesh_r -c -c----------------------------------------------------------------------- -c -c ****** Define the local r mesh arrays. -c -c----------------------------------------------------------------------- -c - use number_types - use global_dims - use global_mesh - use local_dims_r - use local_mesh_r -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: one=1._r_typ -c -c----------------------------------------------------------------------- -c - integer :: i -c -c----------------------------------------------------------------------- -c -c ****** Define the local meshes. -c - do i=1,nrm - r(i)=r_g(i0_g+i-1) - dr(i)=dr_g(i0_g+i-1) - enddo -c - dr1=dr(1) - drn=dr(nrm) -c - do i=1,nr - rh(i)=rh_g(i0_g+i-1) - drh(i)=drh_g(i0_g+i-1) - enddo -c -c ****** Define local auxiliary mesh-related arrays. -c - r2(:)=r(:)**2 - r_i(:)=one/r(:) - dr_i(:)=one/dr(:) - rh_i(:)=one/rh(:) - drh_i(:)=one/drh(:) -c - return - end -c####################################################################### - subroutine set_local_mesh_tp -c -c----------------------------------------------------------------------- -c -c ****** Define the local theta and phi mesh arrays. -c -c----------------------------------------------------------------------- -c - use number_types - use global_dims - use global_mesh - use local_dims_tp - use local_mesh_tp -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: one=1._r_typ -c -c----------------------------------------------------------------------- -c - integer :: j,k,j0,j1 -c -c----------------------------------------------------------------------- -c -c ****** Define the local meshes. -c - do j=1,ntm - t(j)=t_g(j0_g+j-1) - dt(j)=dt_g(j0_g+j-1) - enddo -c - do j=1,nt - th(j)=th_g(j0_g+j-1) - dth(j)=dth_g(j0_g+j-1) - enddo -c - do k=1,npm - p(k)=p_g(k0_g+k-1) - dp(k)=dp_g(k0_g+k-1) - enddo -c - do k=1,np - ph(k)=ph_g(k0_g+k-1) - dph(k)=dph_g(k0_g+k-1) - enddo -c -c ****** Define local auxiliary mesh-related arrays. 
-c - do j=1,ntm - st(j)=st_g(j0_g+j-1) - ct(j)=ct_g(j0_g+j-1) - enddo -c - do j=1,nt - sth(j)=sth_g(j0_g+j-1) - cth(j)=cth_g(j0_g+j-1) - enddo -c - do k=1,npm - sp(k)=sp_g(k0_g+k-1) - cp(k)=cp_g(k0_g+k-1) - enddo -c - do k=1,np - sph(k)=sph_g(k0_g+k-1) - cph(k)=cph_g(k0_g+k-1) - enddo -c - dt_i(:)=one/dt(:) - dth_i(:)=one/dth(:) - sth_i(:)=one/sth(:) - dp_i(:)=one/dp(:) - dph_i(:)=one/dph(:) -c -c ****** Prevent division by zero at the poles for sin(theta). -c - if (tb0) then - j0=2 - else - j0=1 - end if - if (tb1) then - j1=ntm1-1 - else - j1=ntm1 - end if -c - st_i(:)=0. - do j=j0,j1 - st_i(j)=one/st(j) - enddo -c - return - end -c####################################################################### - subroutine genmesh (io,label,nc,c0,c1,nseg,frac,dratio, - & nfilt,periodic,c_shift,c) -c -c----------------------------------------------------------------------- -c -c ****** Generate a one-dimensional mesh. -c -c----------------------------------------------------------------------- -c -c ****** Input arguments: -c -c IO : [integer] -c Fortran file unit number to which to write -c mesh diagnostics. Set IO=0 if diagnostics -c are not of interest. It is assumed that -c unit IO has been connected to a file prior -c to calling this routine. -c -c LABEL : [character(*)] -c Name for the mesh coordinate (example: 'x'). -c -c NC : [integer] -c Number of mesh points to load. -c -c C0 : [real] -c The starting location for the coordinate. -c -c C1 : [real] -c The ending location for the coordinate. -c It is required that C1.gt.C0. -c -c NSEG : [integer] -c Maximum number of mesh segments. -c The mesh spacing in each segment varies -c exponentially with a uniform amplification -c factor. The actual number of mesh segments -c used is NSEG or less. It is obtained from the -c information in array FRAC. -c -c FRAC : [real array, dimension NSEG] -c The normalized positions of the mesh segment -c boundaries (as a fraction of the size of the -c domain). For a non-periodic mesh, the first -c value of FRAC specified must equal 0. and the -c last value must equal 1. For a periodic mesh, -c FRAC must not contain both 0. and 1., since -c these represent the same point. -c -c DRATIO : [real array, dimension NSEG] -c The ratio of the mesh spacing at the end of a -c segment to that at the beginning. -c -c NFILT : [integer] -c The number of times to filter the mesh-point -c distribution array. Set NFILT=0 if filtering -c is not desired. Filtering can reduce -c discontinuities in the derivative of the mesh -c spacing. -c -c PERIODIC: [logical] -c A flag to indicate whether the mesh to be -c generated represents a periodic coordinate. -c If the coordinate is specified as periodic, -c the range [C0,C1] should be the whole periodic -c interval; the first mesh point is set at C0 -c and the last mesh point, C(NC), is set at C1. -c -c C_SHIFT : [real] -c Amount by which to shift the periodic coordinate. -c C_SHIFT is only used when PERIODIC=.true., -c and is ignored otherwise. A positive C_SHIFT -c moves the mesh points to the right. -c -c ****** Output arguments: -c -c C : [real array, dimension NC] -c The locations of the mesh points. -c -c----------------------------------------------------------------------- -c -c ****** The arrays DRATIO and FRAC define the mesh as follows. -c -c ****** For example, suppose that a (non-periodic) mesh with three -c ****** segments is desired. Suppose the domain size is c=[0:2]. 
-c ****** In the first segment (with c between 0 and .5) the mesh -c ****** spacing is decreasing with c, such that DC at c=.5 is half -c ****** DC at c=0. From c=.5 to c=1, the mesh is uniform. From c=1 -c ****** to c=2, the mesh spacing is increasing with c such that DC at -c ****** c=2 is 10 times DC at c=1. This mesh would be specified by: -c ****** -c ****** FRAC=0.,.25,.5,1. -c ****** DRATIO=.5,1.,10. -c ****** -c ****** The variable C_SHIFT can be used to shift the mesh point -c ****** distribution for a periodic coordinate. For example, -c ****** suppose C represents mesh points in the interval [0,2*pi]. -c ****** C_SHIFT=.5*pi would move the distribution of mesh points -c ****** so that the original mesh point with C(1)=0. would be -c ****** close to .5*pi in the new mesh. -c -c----------------------------------------------------------------------- -c - use number_types - use mpidefs - use debug -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer, intent(in) :: io - character(*), intent(in) :: label - integer, intent(in) :: nc - real(r_typ), intent(in) :: c0,c1 - integer, intent(in) :: nseg - real(r_typ), dimension(nseg), intent(in) :: frac,dratio - integer, intent(in) :: nfilt - logical, intent(in) :: periodic - real(r_typ), intent(in) :: c_shift - real(r_typ), dimension(nc), intent(out) :: c -c -c----------------------------------------------------------------------- -c -c ****** Storage for the coordinate transformation. -c - integer :: ns - real(r_typ), dimension(:), allocatable :: xi,cs,a,r,f -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: zero=0._r_typ - real(r_typ), parameter :: one=1._r_typ - real(r_typ), parameter :: half=.5_r_typ - real(r_typ), parameter :: eps=1.e-5_r_typ -c -c----------------------------------------------------------------------- -c - integer :: i,j,nf,nr,ll,j0 - real(r_typ) :: alpha,dr,fac,d,dxi,xiv,cshft,xi_shift - real(r_typ), dimension(:), allocatable :: dc,rdc -c -c----------------------------------------------------------------------- -c -c ****** Check that the number of mesh points is valid. -c - if (nc.lt.2) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in GENMESH:' - write (*,*) '### Invalid number of mesh points specified.' - write (*,*) '### There must be at least two mesh points.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'Number of mesh points specified = ',nc - end if - call endrun (.true.) - end if -c -c ****** Check that a positive mesh interval has been specified. -c - if (c0.ge.c1) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in GENMESH:' - write (*,*) '### Invalid mesh interval specified.' - write (*,*) '### C1 must be greater than C0.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'C0 = ',c0 - write (*,*) 'C1 = ',c1 - end if - call endrun (.true.) - end if -c -c ****** Find the number of values of FRAC specified. -c - do nf=1,nseg-1 - if (frac(nf+1).eq.zero) exit - enddo -c -c ****** When no values have been specified (NF=1, the default), -c ****** a uniform mesh is produced. -c - if (nf.eq.1.and.frac(1).eq.zero) then - ns=1 - allocate (cs(ns+1)) - allocate (r(ns)) - cs(1)=c0 - cs(2)=c1 - r(1)=one - go to 100 - end if -c -c ****** Check that the specified values of FRAC are monotonically -c ****** increasing. 
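! [Editor's note] Within one genmesh segment the spacing varies
! geometrically: a segment of length L covered by m cells with an
! end-to-start spacing ratio R has per-cell growth q = R**(1/(m-1))
! and first cell size L*(q-1)/(q**m-1).  A standalone sketch of one
! such segment (not genmesh itself; all numbers illustrative):
      program segment_sketch
        implicit none
        integer, parameter :: m = 10         ! cells in the segment
        real(8), parameter :: seglen = 1d0, ratio = 10d0
        real(8) :: q, h, c(0:m)
        integer :: k
        q = ratio**(1d0/(m-1))
        h = seglen*(q - 1d0)/(q**m - 1d0)    ! first cell size
        c(0) = 0d0
        do k = 1, m
          c(k) = c(k-1) + h
          h = h*q                            ! amplify each cell
        end do
        write (*,*) 'last/first dc =', (c(m)-c(m-1))/(c(1)-c(0))
        write (*,*) 'end point     =', c(m)  ! recovers seglen
      end program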
-c - do i=2,nf - if (frac(i).lt.frac(i-1)) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in GENMESH:' - write (*,*) '### Invalid mesh specification.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'The values in FRAC must increase'// - & ' monotonically.' - write (*,*) 'FRAC = ',frac(1:nf) - end if - call endrun (.true.) - end if - enddo -c -c ****** Check the specified values of FRAC. -c - if (periodic) then -c -c ****** A periodic mesh requires the specified values of FRAC -c ****** to be in the range 0. to 1. -c - if (frac(1).lt.zero.or.frac(nf).gt.one) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in GENMESH:' - write (*,*) '### Invalid mesh specification.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'For a periodic coordinate, the values in'// - & ' FRAC must be between 0. and 1.' - write (*,*) 'FRAC = ',frac(1:nf) - end if - call endrun (.true.) - end if -c -c ****** A periodic mesh cannot contain both 0. and 1. in FRAC, -c ****** since these represent the same point. -c - if (frac(1).eq.zero.and.frac(nf).eq.one) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in GENMESH:' - write (*,*) '### Invalid mesh specification.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'For a periodic coordinate, FRAC must not'// - & ' contain both 0. and 1.' - write (*,*) 'FRAC = ',frac(1:nf) - end if - call endrun (.true.) - end if -c - else -c -c ****** A non-periodic mesh requires the first specified value -c ****** of FRAC to be 0., and the last value to equal 1. -c - if (frac(1).ne.zero) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in GENMESH:' - write (*,*) '### Invalid mesh specification.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'For a non-periodic coordinate, the first'// - & ' value of FRAC must equal 0.' - write (*,*) 'FRAC = ',frac(1:nf) - end if - call endrun (.true.) - end if -c - if (frac(nf).ne.one) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in GENMESH:' - write (*,*) '### Invalid mesh specification.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'For a non-periodic coordinate, the last'// - & ' value of FRAC must equal 1.' - write (*,*) 'FRAC = ',frac(1:nf) - end if - call endrun (.true.) - end if -c - end if -c -c ****** Check that the required values of DRATIO have been set, -c ****** and are positive. -c - if (periodic) then - nr=nf - else - nr=nf-1 - end if -c - do i=1,nr - if (dratio(i).le.zero) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in GENMESH:' - write (*,*) '### Invalid mesh specification.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'A required value in DRATIO has not been'// - & ' set or is not positive.' - write (*,*) 'DRATIO = ',dratio(1:nr) - end if - call endrun (.true.) - end if - enddo -c -c ****** Check that an inherently discontinuous mesh has not been -c ****** specified inadvertently. -c - if (periodic.and.nr.eq.1.and.dratio(1).ne.one) then - if (iamp0) then - write (*,*) - write (*,*) '### WARNING from GENMESH:' - write (*,*) '### Discontinuous mesh specification.' - write (*,*) 'Mesh coordinate: ',label - write (*,*) 'An inherently discontinuous mesh has been'// - & ' specified.' - write (*,*) 'FRAC = ',frac(1:nf) - write (*,*) 'DRATIO = ',dratio(1:nr) - end if - end if -c -c ****** Set the number of segments. -c - ns=nf-1 -c -c ****** For a periodic coordinate, add points at XI=0. and XI=1. -c ****** if they are not already present. 
-c - if (periodic) then - if (frac(1).ne.zero) ns=ns+1 - if (frac(nf).ne.one) ns=ns+1 - end if -c - allocate (cs(ns+1)) - allocate (r(ns)) -c -c ****** Set up the coordinate limits of the segments. -c - if (periodic) then - if (frac(1).ne.zero) then - cs(1)=c0 - cs(2:nf+1)=c0+(c1-c0)*frac(1:nf) - if (frac(nf).ne.one) then - alpha=(one-frac(nf))/(frac(1)+one-frac(nf)) - r(1)=dratio(nr)/(one+alpha*(dratio(nr)-one)) - r(2:nr+1)=dratio(1:nr) - cs(ns+1)=c1 - r(ns)=one+alpha*(dratio(nr)-one) - else - r(1)=dratio(nr) - r(2:nr)=dratio(1:nr-1) - end if - else - cs(1:nf)=c0+(c1-c0)*frac(1:nf) - r(1:nr)=dratio(1:nr) - cs(ns+1)=c1 - end if - else - cs(1:nf)=c0+(c1-c0)*frac(1:nf) - r(1:nr)=dratio(1:nr) - end if -c - 100 continue -c - allocate (xi(ns+1)) - allocate (a(ns)) - allocate (f(ns)) -c -c ****** Compute the XI values at the segment limits. -c - do i=1,ns - dr=r(i)-one - if (abs(dr).lt.eps) then - f(i)=(cs(i+1)-cs(i))*(one+half*dr) - else - f(i)=(cs(i+1)-cs(i))*log(r(i))/dr - end if - enddo -c - fac=zero - do i=ns,1,-1 - fac=fac/r(i)+f(i) - enddo -c - d=f(1)/fac - xi(1)=zero - do i=2,ns - xi(i)=xi(i-1)+d - if (i.lt.ns) d=d*f(i)/(f(i-1)*r(i-1)) - enddo - xi(ns+1)=one -c -c ****** Set the amplification factor for each segment. -c - do i=1,ns - a(i)=log(r(i))/(xi(i+1)-xi(i)) - enddo -c -c ****** For a periodic coordinate, find the XI shift corresponding -c ****** to a shift of C_SHIFT in the coordinate. -c ****** Note that a positive value of C_SHIFT moves the mesh -c ****** points to the right. -c - if (periodic) then - cshft=-c_shift - call map_c_to_xi (periodic,ns,xi,cs,a,r,cshft,xi_shift) - else - xi_shift=0. - end if -c -c ****** Compute the location of the mesh points in array C -c ****** by mapping from the XI values. -c - dxi=one/(nc-one) -c - c(1)=c0 - do j=2,nc-1 - xiv=(j-1)*dxi - call map_xi_to_c (periodic,ns,xi,cs,a,r, - & cshft,xi_shift,xiv,c(j)) - enddo - c(nc)=c1 -c -c ****** Filter the mesh if requested. -c - if (nfilt.gt.0) then - do i=1,nfilt - if (periodic) then - call filter_coord_periodic (c1-c0,nc,c) - else - call filter_coord (nc,c) - end if - enddo - end if -c -c ****** Write out the mesh information. 
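-c
-c ****** For reference when reading the table below: within segment
-c ****** I the mapping implemented in MAP_XI_TO_C is
-c ******
-c ******   c(xi) = CS(I) + (CS(I+1)-CS(I))
-c ******                  *(exp(A(I)*(xi-XI(I)))-1)/(R(I)-1),
-c ******
-c ****** with A(I)=log(R(I))/(XI(I+1)-XI(I)), so the mesh spacing
-c ****** varies exponentially in xi, and the ratio of the spacing at
-c ****** the end of a segment to that at its start is exactly R(I)
-c ****** (the "ratio" column printed below).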
-c - if (io.gt.0.and.iamp0) then -c - write (io,*) - write (io,*) '### COMMENT from GENMESH:' - write (io,*) '### Mesh information for coordinate ',label,':' -c - if (idebug.gt.0) then - write (io,*) - write (io,*) 'Flag to indicate a periodic mesh: ',periodic - write (io,*) 'Number of mesh points = ',nc - write (io,*) 'Lower mesh limit = ',c0 - write (io,*) 'Upper mesh limit = ',c1 - write (io,*) 'Number of times to filter the mesh = ',nfilt - if (periodic) then - write (io,*) 'Amount to shift the mesh = ',c_shift - end if - end if -c - write (io,*) - write (io,*) 'Number of mesh segments = ',ns -c - ll=len_trim(label) -c - write (io,900) 'Segment xi0 xi1'// - & repeat (' ',16-ll)//label//'0'// - & repeat (' ',16-ll)//label//'1'// - & ' ratio' - do i=1,ns - write (io,910) i,xi(i),xi(i+1),cs(i),cs(i+1),r(i) - enddo -c - allocate (dc(nc)) - allocate (rdc(nc)) -c - dc=c-cshift(c,-1) - if (periodic) dc(1)=dc(nc) - rdc=dc/cshift(dc,-1) - if (periodic) rdc(1)=rdc(nc) -c - write (io,*) - write (io,920) 'Mesh-point locations:' - write (io,920) ' i'// - & repeat (' ',18-ll)//label// - & repeat (' ',17-ll)//'d'//label// - & ' ratio' -c - if (periodic) then - j0=1 - else - j0=3 - write (io,930) 1,c(1) - write (io,930) 2,c(2),dc(2) - end if - do j=j0,nc - write (io,930) j,c(j),dc(j),rdc(j) - enddo -c - deallocate (dc) - deallocate (rdc) -c - end if -c - 900 format (/,tr1,a) - 910 format (tr1,i4,2x,2f10.6,4f17.8) - 920 format (tr1,a) - 930 format (tr1,i6,3f18.8) -c - deallocate (cs) - deallocate (r) - deallocate (xi) - deallocate (a) - deallocate (f) -c - return - end -c####################################################################### - subroutine map_xi_to_c (periodic,ns,xi,cs,a,r, - & cshft,xi_shift,xiv,cv) -c -c----------------------------------------------------------------------- -c -c ****** Get the mesh coordinate value CV for the specified -c ****** xi value XIV. -c -c ****** Set PERIODIC=.true. for a periodic coordinate. -c ****** NS is the number of segments in the mesh definition. -c ****** The arrays XI, CS, A, and R define the mesh mapping. -c -c ****** CSHFT represents the amount to shift a periodic coordinate. -c ****** XI_SHIFT represents the corresponding amount to shift xi. -c -c ****** This is a utility routine for GENMESH. -c -c----------------------------------------------------------------------- -c - use number_types - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - logical, intent(in) :: periodic - integer, intent(in) :: ns - real(r_typ), dimension(ns+1), intent(in) :: xi,cs - real(r_typ), dimension(ns), intent(in) :: a,r - real(r_typ), intent(in) :: cshft,xi_shift - real(r_typ), intent(in) :: xiv - real(r_typ), intent(out) :: cv -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: zero=0._r_typ - real(r_typ), parameter :: one=1._r_typ - real(r_typ), parameter :: half=.5_r_typ - real(r_typ), parameter :: eps=1.e-5_r_typ -c -c----------------------------------------------------------------------- -c - integer :: i - real(r_typ) :: xiv_p,d,d1,da,da1,fac -c -c----------------------------------------------------------------------- -c - real(r_typ), external :: fold -c -c----------------------------------------------------------------------- -c -c ****** Find the index of the segment to which XIV belongs. -c - if (periodic) then -c -c ****** Shift XIV by XI_SHIFT. 
-c - xiv_p=xiv+xi_shift -c -c ****** Fold XIV_P into the main interval. -c - xiv_p=fold(zero,one,xiv_p) -c - else -c - xiv_p=xiv -c - end if -c - do i=1,ns - if (xiv_p.ge.xi(i).and.xiv_p.le.xi(i+1)) exit - enddo -c - if (i.gt.ns) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in MAP_XI_TO_C:' - write (*,*) '### Error in finding the XI segment.' - write (*,*) '### Could not find XIV in the XI table.' - write (*,*) '[Utility routine for GENMESH.]' - write (*,*) '[This is an internal error.]' - write (*,*) 'XI = ',xi - write (*,*) 'XIV = ',xiv - write (*,*) 'XIV_P = ',xiv_p - end if - call endrun (.true.) - end if -c - d =xiv_p -xi(i) - d1=xi(i+1)-xi(i) -c - da =a(i)*d - da1=a(i)*d1 -c -c ****** Interpolate the mapping function at XIV_P. -c - if (abs(da1).lt.eps) then - fac=(d*(one+half*da))/(d1*(one+half*da1)) - else - fac=(exp(da)-one)/(r(i)-one) - end if -c - cv=cs(i)+(cs(i+1)-cs(i))*fac -c - if (periodic) then -c -c ****** Shift CV by the amount CSHFT. -c - cv=cv-cshft -c -c ****** Fold CV into the main interval. -c - cv=fold(cs(1),cs(ns+1),cv) -c - end if -c - return - end -c####################################################################### - subroutine map_c_to_xi (periodic,ns,xi,cs,a,r,cv,xiv) -c -c----------------------------------------------------------------------- -c -c ****** Get the xi value XIV for the specified coordinate value CV. -c -c ****** Set PERIODIC=.true. for a periodic coordinate. -c ****** NS is the number of segments in the mesh definition. -c ****** The arrays XI, CS, A, and R define the mesh mapping. -c -c ****** This is a utility routine for GENMESH. -c -c----------------------------------------------------------------------- -c - use number_types - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - logical, intent(in) :: periodic - integer, intent(in) :: ns - real(r_typ), dimension(ns+1), intent(in) :: xi,cs - real(r_typ), dimension(ns), intent(in) :: a,r - real(r_typ), intent(in) :: cv - real(r_typ), intent(out) :: xiv -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: one=1._r_typ - real(r_typ), parameter :: eps=1.e-5_r_typ -c -c----------------------------------------------------------------------- -c - integer :: i - real(r_typ) :: cv_p,d,da,fac -c -c----------------------------------------------------------------------- -c - real(r_typ), external :: fold -c -c----------------------------------------------------------------------- -c -c ****** Find the index of the segment to which CV belongs. -c - if (periodic) then -c -c ****** Fold CV_P into the main interval. -c - cv_p=fold(cs(1),cs(ns+1),cv) -c - else -c - cv_p=cv -c - end if -c - do i=1,ns - if (cv_p.ge.cs(i).and.cv_p.le.cs(i+1)) exit - enddo -c - if (i.gt.ns) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in MAP_C_TO_XI:' - write (*,*) '### Error in finding the CS segment.' - write (*,*) '### Could not find CV in the CS table.' - write (*,*) '[Utility routine for GENMESH.]' - write (*,*) '[This is an internal error.]' - write (*,*) 'CS = ',cs - write (*,*) 'CV = ',cv - write (*,*) 'CV_P = ',cv_p - end if - call endrun (.true.) - end if -c - d=(cv_p-cs(i))/(cs(i+1)-cs(i)) - da=(r(i)-one)*d -c -c ****** Interpolate the mapping function at XIV_P. 
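-c ****** (That is, evaluate the inverse of the exponential map at
-c ****** CV_P:
-c ******
-c ******   xiv = XI(I) + log(1+(R(I)-1)*D)/A(I),
-c ******
-c ****** where D is the fractional position of CV_P within segment I.
-c ****** When R(I) is close to 1 the log form degenerates, so the
-c ****** linear limit FAC=D*(XI(I+1)-XI(I)) is used instead.)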
-c - if (abs(da).lt.eps) then - fac=d*(xi(i+1)-xi(i)) - else - fac=log(da+one)/a(i) - end if -c - xiv=xi(i)+fac -c - return - end -c####################################################################### - subroutine filter_coord (n,f) -c -c----------------------------------------------------------------------- -c -c ****** Apply a "(1,2,1)/4" low-pass digital filter to a -c ****** 1D coordinate. -c -c ****** The end-points F(1) and F(N) are not changed. -c -c----------------------------------------------------------------------- -c - use number_types -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: n - real(r_typ), dimension(n) :: f -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: two=2._r_typ - real(r_typ), parameter :: quarter=.25_r_typ -c -c----------------------------------------------------------------------- -c - real(r_typ), dimension(n) :: ff -c -c----------------------------------------------------------------------- -c - integer :: i -c -c----------------------------------------------------------------------- -c -c ****** Make a copy of the function. -c - ff=f -c -c ****** Apply the filter. -c - do i=2,n-1 - f(i)=quarter*(ff(i-1)+two*ff(i)+ff(i+1)) - enddo -c - return - end -c####################################################################### - subroutine filter_coord_periodic (xl,n,f) -c -c----------------------------------------------------------------------- -c -c ****** Apply a "(1,2,1)/4" low-pass digital filter to a -c ****** periodic 1D coordinate. -c -c----------------------------------------------------------------------- -c -c ****** XL is the periodic interval for the coordinate. -c -c ****** The filtered coordinate is translated so that F(1) -c ****** is preserved. -c -c----------------------------------------------------------------------- -c - use number_types -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ) :: xl - integer :: n - real(r_typ), dimension(n) :: f -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: two=2._r_typ - real(r_typ), parameter :: quarter=.25_r_typ -c -c----------------------------------------------------------------------- -c - real(r_typ), dimension(0:n+1) :: ff -c -c----------------------------------------------------------------------- -c - integer :: i - real(r_typ) :: f1old,f1new -c -c----------------------------------------------------------------------- -c -c ****** Save the value of F(1). -c - f1old=f(1) -c -c ****** Make a periodic copy of the function. -c - ff(1:n)=f(:) -c - ff(0)=f(n-1)-xl - ff(n+1)=f(2)+xl -c -c ****** Apply the filter. -c - do i=1,n - f(i)=quarter*(ff(i-1)+two*ff(i)+ff(i+1)) - enddo -c -c ****** Translate F so that F(1) is preserved. -c - f1new=f(1) - do i=1,n - f(i)=f(i)-f1new+f1old - enddo -c - return - end -c####################################################################### - function fold (x0,x1,x) -c -c----------------------------------------------------------------------- -c -c ****** "Fold" X into the periodic interval [X0,X1]. -c -c ****** On return, X is such that X0.le.X.lt.X1. 
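-c
-c ****** For example, with [X0,X1]=[0.,1.]:
-c ****** FOLD(0.,1.,1.25)=0.25 and FOLD(0.,1.,-0.25)=0.75.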
-c -c----------------------------------------------------------------------- -c -c ****** It is assumed that X0 does not equal X1, as is physically -c ****** necessary. If X0 and X1 are equal, the routine just -c ****** returns with FOLD=X. -c -c----------------------------------------------------------------------- -c - use number_types -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ) :: fold - real(r_typ) :: x0,x1,x -c -c----------------------------------------------------------------------- -c - real(r_typ) :: xl -c -c----------------------------------------------------------------------- -c - fold=x -c - if (x0.eq.x1) return -c - xl=x1-x0 -c - fold=mod(x-x0,xl)+x0 -c - if (fold.lt.x0) fold=fold+xl - if (fold.ge.x1) fold=fold-xl -c - return - end -c####################################################################### - subroutine set_flux -c -c----------------------------------------------------------------------- -c -c ****** Set the radial magnetic field at the photosphere. -c -c----------------------------------------------------------------------- -c - use number_types - use global_dims - use global_mesh - use local_dims_r - use local_mesh_r - use local_dims_tp - use local_mesh_tp - use fields - use vars - use mpidefs - use timing -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c -c ****** Global flux array. -c - real(r_typ), dimension(:,:), allocatable :: br0_g -c -c----------------------------------------------------------------------- -c - integer :: j,k,ierr -c -c----------------------------------------------------------------------- -c - call timer_on -c - allocate (br0_g(nt_g,np_g)) -c -c ****** Define the global flux array. -c -c ****** Read the flux from file BR0FILE (only on processor IPROC0). -c - if (iamp0) then - call readbr (br0file,br0_g,ierr) - end if - call check_error_on_p0 (ierr) -c -c ****** Broadcast BR0_G to all the processors. -c - call MPI_Bcast (br0_g,nt_g*np_g,ntype_real,0,comm_all,ierr) -c -c ****** For a fully open field, reverse negative Br -c ****** (i.e., use the monopole trick). -c - if (option.eq.'open') then -c -c ****** Write the boundary flux (before the sign flip) to a file -c ****** if requested. -c - if (iamp0) then - if (br_photo_original_file.ne.'') then - write (*,*) - write (*,*) '### COMMENT from SET_FLUX:' - write (*,*) - write (*,*) 'Writing BR0 (before sign flip) to file: ', - & trim(br_photo_original_file) - write (9,*) - write (9,*) '### COMMENT from SET_FLUX:' - write (9,*) - write (9,*) 'Writing BR0 (before sign flip) to file: ', - & trim(br_photo_original_file) - call wrhdf_2d (br_photo_original_file, - & .true.,nt_g,np_g, - & br0_g,th_g,ph_g,hdf32,ierr) - end if - end if -c -c ****** Reverse Br. -c - br0_g(:,:)=abs(br0_g(:,:)) -c - end if -c -c ****** Write the boundary flux to a file if requested. -c - if (iamp0) then - if (br_photo_file.ne.' 
') then
-          write (*,*)
-          write (*,*) '### COMMENT from SET_FLUX:'
-          write (*,*)
-          write (*,*) 'Writing BR0 to file: ',trim(br_photo_file)
-          write (9,*)
-          write (9,*) '### COMMENT from SET_FLUX:'
-          write (9,*)
-          write (9,*) 'Writing BR0 to file: ',trim(br_photo_file)
-          call wrhdf_2d (br_photo_file,.true.,nt_g,np_g,
-     &                   br0_g,th_g,ph_g,hdf32,ierr)
-        end if
-      end if
-c
-      do j=1,nt
-        do k=1,np
-          br0(j,k)=br0_g(j0_g+j-1,k0_g+k-1)
-        enddo
-      enddo
-c
-      deallocate(br0_g)
-c
-      call timer_off (t_io)
-c
-      end subroutine
-c#######################################################################
-      subroutine potfld
-c
-c-----------------------------------------------------------------------
-c
-c ****** Find the potential field solution.
-c
-c-----------------------------------------------------------------------
-c
-      use number_types
-      use local_dims_r
-      use local_mesh_r
-      use local_dims_tp
-      use local_mesh_tp
-      use fields
-      use cgcom
-      use solve_params
-      use mpidefs
-      use debug
-      use timing
-      use matrix_storage_pot3d_solve
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      real(r_typ), parameter :: one=1._r_typ
-c
-c-----------------------------------------------------------------------
-c
-      integer :: ierr,nrm2,ntm2,npm2,i
-c
-      real(r_typ), dimension(:), allocatable :: rhs_cg,x_cg
-c
-c-----------------------------------------------------------------------
-c
-c ****** Load matrix and preconditioner.
-c
-      nrm2=nrm1-1
-      ntm2=ntm1-1
-      npm2=npm1-1
-c
-      a_offsets(1)=-nrm2*ntm2
-      a_offsets(2)=-nrm2
-      a_offsets(3)=-1
-      a_offsets(4)= 0
-      a_offsets(5)= 1
-      a_offsets(6)= nrm2
-      a_offsets(7)= nrm2*ntm2
-c
-c ****** Allocate cg 1D vectors.
-c
-      N=nrm2*ntm2*npm2
-c
-c ****** Prepare the guess and rhs for the solve.
-c
-      allocate(rhs_cg(N))
-      allocate(x_cg(N))
-      rhs_cg(:)=0.
-      x_cg(:)=0.
-c
-      call getM (N,a_offsets,M)
-      call alloc_pot3d_matrix_coefs
-      call load_matrix_pot3d_solve
-      call load_preconditioner_pot3d_solve
-c
-c ****** Use a trick to accumulate the contribution of the
-c ****** boundary conditions (i.e., the inhomogeneous part).
-c
-      call set_boundary_points (x_ax,one)
-      call seam (x_ax,nr,nt,np)
-      call delsq (x_ax,rhs_cg)
-c
-c ****** The original rhs is zero, so just use the negative of the
-c ****** boundary-trick contributions:
-c
-      do concurrent (i=1:N)
-        rhs_cg(i)=-rhs_cg(i)
-      enddo
-c
-c ****** Solve for the potential.
-c
-      if (idebug.gt.0.and.iamp0) then
-        write (*,*)
-        write (*,*) '### COMMENT from POTFLD:'
-        write (*,*) '### Doing a solution:'
-      end if
-c
-      call solve (x_cg,rhs_cg,N,ierr)
-c
-      if (ierr.ne.0) then
-        call endrun (.true.)
-      end if
-c
-      call unpack_scalar (phi,x_cg)
-c
-      call set_boundary_points (phi,one)
-      call seam (phi,nr,nt,np)
-c
-      call dealloc_pot3d_matrix_coefs
-      deallocate(rhs_cg)
-      deallocate(x_cg)
-c
-      return
-      end
-c#######################################################################
-      subroutine set_validation_flux
-c
-c-----------------------------------------------------------------------
-c
-c ****** Set the radial magnetic field at the photosphere.
-c ****** This uses the tilted dipole analytic solution for validation.
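-c
-c ****** For a dipole of unit strength tilted by DIPOLE_ANGLE from
-c ****** the rotation axis, the radial field imposed below is
-c ******
-c ******   Br(r,t,p) = (2/r**3)*( cos(t)*cos(DIPOLE_ANGLE)
-c ******                         +sin(t)*cos(p)*sin(DIPOLE_ANGLE)),
-c ******
-c ****** evaluated at r=R0 (BR0) and r=R1 (BR1).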
-c
-c-----------------------------------------------------------------------
-c
-      use number_types
-      use global_dims
-      use global_mesh
-      use local_dims_r
-      use local_mesh_r
-      use local_dims_tp
-      use local_mesh_tp
-      use fields
-      use vars
-      use mpidefs
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      real(r_typ), parameter :: two=2.0_r_typ
-c
-c-----------------------------------------------------------------------
-c
-c ****** Global Br boundary arrays.
-c
-      real(r_typ), dimension(:,:), allocatable :: br0_g
-      real(r_typ), dimension(:,:), allocatable :: br1_g
-c
-c-----------------------------------------------------------------------
-c
-      integer :: j,k
-      real(r_typ) :: sum00,sum11
-c
-c-----------------------------------------------------------------------
-c
-c ****** Define the global boundary condition arrays.
-c
-      allocate (br0_g(nt_g,np_g))
-      allocate (br1_g(nt_g,np_g))
-c
-c ****** Set the tilted dipole analytic Br.
-c
-      do j=1,nt_g
-        do k=1,np_g
-          br0_g(j,k)=(two/r0**3)*(cth_g(j)*cos(dipole_angle) +
-     &               sth_g(j)*cph_g(k)*sin(dipole_angle))
-          br1_g(j,k)=(two/r1**3)*(cth_g(j)*cos(dipole_angle) +
-     &               sth_g(j)*cph_g(k)*sin(dipole_angle))
-        enddo
-      enddo
-c
-c ****** Set Br to be periodic (should not be necessary?).
-c
-      br0_g(:,1)=br0_g(:,npm1_g)
-      br0_g(:,np_g)=br0_g(:,2)
-      br1_g(:,1)=br1_g(:,npm1_g)
-      br1_g(:,np_g)=br1_g(:,2)
-c
-c ****** Set BCs at the poles.
-c
-      sum00=sum(br0_g(     2,2:npm1_g)*dph_g(2:npm1_g))*pl_i
-      sum11=sum(br0_g(ntm1_g,2:npm1_g)*dph_g(2:npm1_g))*pl_i
-c
-      br0_g(1   ,:)=two*sum00-br0_g(     2,:)
-      br0_g(nt_g,:)=two*sum11-br0_g(ntm1_g,:)
-c
-      sum00=sum(br1_g(     2,2:npm1_g)*dph_g(2:npm1_g))*pl_i
-      sum11=sum(br1_g(ntm1_g,2:npm1_g)*dph_g(2:npm1_g))*pl_i
-c
-      br1_g(1   ,:)=two*sum00-br1_g(     2,:)
-      br1_g(nt_g,:)=two*sum11-br1_g(ntm1_g,:)
-c
-      do j=1,nt
-        do k=1,np
-          br0(j,k)=br0_g(j0_g+j-1,k0_g+k-1)
-          br1(j,k)=br1_g(j0_g+j-1,k0_g+k-1)
-        enddo
-      enddo
-c
-      deallocate(br0_g)
-      deallocate(br1_g)
-c
-      end subroutine
-c#######################################################################
-      subroutine write_validation_solution
-c
-c-----------------------------------------------------------------------
-c
-c ****** Write out the tilted dipole analytic solution for validation.
-c ****** This writes out phi, B, and the B computed from phi.
-c
-c-----------------------------------------------------------------------
-c
-      use number_types
-      use global_dims
-      use global_mesh
-      use local_dims_r
-      use local_mesh_r
-      use local_dims_tp
-      use local_mesh_tp
-      use fields
-      use vars
-      use mpidefs
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      real(r_typ), parameter :: one=1.0_r_typ
-      real(r_typ), parameter :: two=2.0_r_typ
-c
-c-----------------------------------------------------------------------
-c
-      integer :: i,j,k
-      character(256) :: fnamephi,fnamebr,fnamebt,fnamebp
-c
-c-----------------------------------------------------------------------
-c
-c ****** Set phi.
-c
-      do concurrent (k=1:np, j=1:nt, i=1:nr)
-        phi(i,j,k)=(-one/rh(i)**2)*(cth(j)*cos(dipole_angle)+
-     &             sth(j)*cph(k)*sin(dipole_angle))
-      enddo
-c
-      call set_boundary_points (phi,one)
-      call seam (phi,nr,nt,np)
-c
-c
-c ****** Set Br.
-c - do concurrent (k=1:np, j=1:nt, i=1:nrm1) - br(i,j,k)=(two/r(i)**3)*(cth(j)*cos(dipole_angle) + - & sth(j)*cph(k)*sin(dipole_angle)) - enddo -c -c ****** Set Bt. -c - do concurrent (k=1:np, j=1:ntm1, i=1:nr) - bt(i,j,k)=(one/rh(i)**3)*(st(j)*cos(dipole_angle) - - & ct(j)*cph(k)*sin(dipole_angle)) - enddo -c -c ****** Set Bp. -c - do concurrent (k=1:npm1, j=1:nt, i=1:nr) - bp(i,j,k)=(one/rh(i)**3)*(sph(k)*sin(dipole_angle)) - enddo -c -c ****** Due to half mesh trig issues on periodic domain, we need -c ****** to seam these even though seaming is not needed in getb(). -c - call seam (br,nrm,nt,np) - call seam (bt,nr,ntm,np) - call seam (bp,nr,nt,npm) -c -c ****** Backup fnames and set new ones so we can reuse write routine. -c - fnamephi=phifile - phifile='phi_exact.'//trim(fmt) - fnamebr=brfile - brfile='br_exact.'//trim(fmt) - fnamebt=btfile - btfile='bt_exact.'//trim(fmt) - fnamebp=bpfile - bpfile='bp_exact.'//trim(fmt) -c - call write_solution -c -c ****** Restore original output names. -c - phifile=fnamephi - brfile=fnamebr - btfile=fnamebt - bpfile=fnamebp -c -c ****** Reset phi. -c - phi(:,:,:)=0. -c - end subroutine -c####################################################################### - subroutine solve (x,rhs,N,ierr) -c -c----------------------------------------------------------------------- -c -c ****** Solve the implicit equations iteratively. -c -c----------------------------------------------------------------------- -c -c ****** Return IERR=0 if the iteration converges; otherwise, -c ****** IERR is set to a nonzero value. -c -c ****** X is the initial guess at the solution. -c ****** RHS is the right-hand side. -c -c----------------------------------------------------------------------- -c - use number_types - use cgcom - use mpidefs -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: N - real(r_typ), dimension(N) :: x,rhs - integer :: ierr -c -c----------------------------------------------------------------------- -c -c ****** Solve the equations using the CG method. -c - call cgsolve (x,rhs,N,ierr) -c -c ****** Check for convergence. -c - if (ierr.ne.0) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in SOLVE:' - write (*,*) '### The field solver did not converge.' - write (*,*) 'IERR = ',ierr - write (*,100) ncg,epsn - 100 format (1x,'N = ',i6,' EPSN = ',1pe13.6) - end if - else - if (iamp0) then - write (*,*) - write (*,*) '### COMMENT from SOLVE:' - write (*,*) '### The field solver converged.' - write (*,*) 'Number of iterations = ',ncg - write (9,*) - write (9,*) '### COMMENT from SOLVE:' - write (9,*) '### The field solver converged.' - write (9,*) 'Number of iterations = ',ncg - end if - end if -c - return - end -c####################################################################### - subroutine cgsolve (x,r,N,ierr) -c -c----------------------------------------------------------------------- -c -c ****** Solve the linear system: -c -c A * x = b -c -c ****** using the classical Conjugate Gradient method for symmetric -c ****** and positive-definite matrices. -c -c----------------------------------------------------------------------- -c -c ****** On input, X(N) contains a guess at the solution, and -c ****** R(N) contains the right-hand side, b. -c -c ****** On exit, X contains an estimate to the solution, and -c ****** R contains the residual (b-Ax). 
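-c
-c ****** For reference, this is standard preconditioned CG with
-c ****** preconditioner M (applied via PREC_INV) and z=M(inv)*r:
-c ******
-c ******   alpha = (r,z)/(p,A*p),   x := x + alpha*p,
-c ******   r := r - alpha*A*p,      z := M(inv)*r,
-c ******   beta = (r,z)/(r_old,z_old),   p := z + beta*p.
-c ******
-c ****** The code below stores z in the scratch vectors P and AP
-c ****** to avoid a separate array.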
-c -c ****** IERR=0 indicates that the solution converged to the -c ****** requested accuracy. Other values indicate that the -c ****** iteration did not converge for the given maximum number -c ****** of iterations. -c -c----------------------------------------------------------------------- -c - use number_types - use cgcom -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: N - real(r_typ), dimension(N) :: x,r - integer :: ierr,i -c -c----------------------------------------------------------------------- -c -c ****** Scratch space for the CG iteration vectors. -c - real(r_typ), dimension(N), target :: p,ap -c -c----------------------------------------------------------------------- -c - real(r_typ), external :: cgdot -c -c----------------------------------------------------------------------- -c - real(r_typ) :: bdotb,rdotr,pdotap,alphai,rdotr_old,betai -c -c----------------------------------------------------------------------- -c - ncg=0 -c -c ****** Get the norm of the RHS. -c - do concurrent (i=1:N) - p(i)=r(i) - enddo -c - call prec_inv (p) - bdotb=cgdot(r,p,N) -c -c ****** If the RHS is zero, return with a zero solution. -c - if (bdotb.eq.0.) then - do concurrent (i=1:N) - x(i)=0. - enddo - epsn=0. - ierr=0 - return - end if -c -c----------------------------------------------------------------------- -c ****** Initialization. -c----------------------------------------------------------------------- -c - call ax (x,ap,N) -c - do concurrent (i=1:N) - r(i)=r(i)-ap(i) - p(i)=r(i) - enddo -c -c ****** Find the initial error norm. -c - call prec_inv (p) - rdotr=cgdot(r,p,N) -c - call ernorm (bdotb,rdotr,ierr) - if (ierr.ge.0) return -c -c----------------------------------------------------------------------- -c ****** Main iteration loop. -c----------------------------------------------------------------------- -c - do - ncg=ncg+1 -c - call ax (p,ap,N) -c - pdotap=cgdot(p,ap,N) - alphai=rdotr/pdotap -c - do concurrent (i=1:N) - x(i)=x(i)+alphai*p(i) - r(i)=r(i)-alphai*ap(i) - ap(i)=r(i) - enddo -c - call prec_inv (ap) - rdotr_old=rdotr - rdotr=cgdot(r,ap,N) -c -c ****** Check for convergence. -c - call ernorm (bdotb,rdotr,ierr) - if (ierr.ge.0) exit -c - betai=rdotr/rdotr_old -c - do concurrent (i=1:N) - p(i)=betai*p(i)+ap(i) - enddo -c - enddo -c - end subroutine -c####################################################################### - subroutine ernorm (bdotb,rdotr,ierr) -c -c----------------------------------------------------------------------- -c -c ****** This subroutine checks if the iterative solver has -c ****** converged or if the maximum allowed number of iterations, -c ****** NCGMAX, has been exceeded. -c -c----------------------------------------------------------------------- -c -c ****** Convergence is deemed to have occurred when: -c ****** -c ****** ||R||/||B|| .lt. EPSCG -c ****** -c ****** where ||R|| is the norm of the (preconditioned) -c ****** residual, ||B|| is the norm of the (preconditioned) -c ****** RHS, and EPSCG is the specified convergence criterion. -c -c ****** Set IERR=0 if the error is below the error criterion -c ****** (i.e., the solution has converged). -c ****** Set IERR=-1 if the error does not yet meet the error -c ****** criterion and the number of iterations is less than NCGMAX. -c ****** Set IERR=1 if the maximum number of iterations has -c ****** been exceeded without convergence. 
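-c
-c ****** Concretely, the quantity tested below is
-c ******
-c ******   EPSN = sqrt(RDOTR/BDOTB) = ||r||/||b||
-c ******
-c ****** in the M(inv)-weighted norm, and convergence is declared
-c ****** when EPSN .lt. EPSCG.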
-c -c----------------------------------------------------------------------- -c -c ****** On input, BDOTB has the dot product of the RHS vector -c ****** with itself, weighted by the preconditioning matrix. -c ****** Similarly, RDOTR has the dot product of the residual vector -c ****** with itself, weighted by the preconditioning matrix. -c ****** This is used to normalize the error estimate. -c -c----------------------------------------------------------------------- -c - use number_types - use cgcom - use mpidefs - use vars -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ) :: bdotb,rdotr - integer :: ierr -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: one=1._r_typ -c -c----------------------------------------------------------------------- -c - real(r_typ) :: epssq -c -c----------------------------------------------------------------------- -c - ierr=-1 -c - epssq=rdotr/bdotb - epsn=sign(one,epssq)*sqrt(abs(epssq)) -c - if (ncghist.gt.0) then -c - if (mod(ncg,ncghist).eq.0) then - if (iamp0) then - if (ncg.eq.0) then - write (9,*) - write (9,*) '### Convergence information:' - write (*,*) - end if - write (*,100) ncg,epsn - FLUSH(OUTPUT_UNIT) - write (9,100) ncg,epsn - FLUSH(9) - 100 format (1x,'Iteration: ',i8,' Residual: ',1pe23.15) - end if - end if - end if -c -c ****** Check for convergence. -c - if (epsn.lt.epscg) then - if (ncghist.gt.0) then - if (iamp0) then - write (*,100) ncg,epsn - FLUSH(OUTPUT_UNIT) - write (9,*) - write (9,*) '### The CG solver has converged.' - write (9,100) ncg,epsn - FLUSH(9) - end if - end if - ierr=0 - else if (ncg.ge.ncgmax) then - if (iamp0) then - write (*,*) - write (*,*) '### ERROR in ERNORM:' - write (*,*) '### Exceeded maximum number of iterations.' - write (*,*) 'NCGMAX = ',ncgmax - write (*,*) 'EPSN = ',epsn - FLUSH(OUTPUT_UNIT) - write (9,*) - write (9,*) '### ERROR in ERNORM:' - write (9,*) '### Exceeded maximum number of iterations.' - write (9,*) 'NCGMAX = ',ncgmax - write (9,*) 'EPSN = ',epsn - FLUSH(9) - end if - ierr=1 - end if -c - return - end -c####################################################################### - subroutine alloc_pot3d_matrix_coefs -c -c----------------------------------------------------------------------- -c -c ****** Allocate the arrays in which the matrix coefficients -c ****** for the pot3d solve are stored. -c -c----------------------------------------------------------------------- -c - use matrix_storage_pot3d_solve - use cgcom - use local_dims_r - use local_dims_tp -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - allocate (a(2:nrm1,2:ntm1,2:npm1,7)) - a=0. - allocate (a_i(N)) - a_i=0. -c - if (ifprec.eq.2) then - allocate (a_csr(M)) - allocate (lu_csr(M)) - allocate (lu_csr_ja(M)) - allocate (a_csr_ja(M)) - allocate (a_csr_ia(1+N)) - allocate (a_N1(N)) - allocate (a_N2(N)) - allocate (a_csr_d(N)) - allocate (a_csr_dptr(N)) - endif -c - return - end -c####################################################################### - subroutine dealloc_pot3d_matrix_coefs -c -c----------------------------------------------------------------------- -c -c ****** Deallocate the arrays in which the matrix coefficients -c ****** for the pot3d solve are stored. 
-c -c----------------------------------------------------------------------- -c - use matrix_storage_pot3d_solve - use cgcom - use cusparse_interface -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - deallocate (a) - deallocate (a_i) -c - if (ifprec.eq.2) then -#ifdef CUSPARSE - call unload_lusol_cusparse -#endif - deallocate (a_csr) - deallocate (lu_csr) - deallocate (lu_csr_ja) - deallocate (a_csr_ia) - deallocate (a_csr_ja) - deallocate (a_csr_d) - deallocate (a_N1) - deallocate (a_N2) - deallocate (a_csr_dptr) - endif -c - end subroutine -c####################################################################### - subroutine load_matrix_pot3d_solve -c -c----------------------------------------------------------------------- -c -c ****** Load the matrix coefficients for the pot3d solve. -c -c----------------------------------------------------------------------- -c - use number_types - use matrix_storage_pot3d_solve - use local_dims_r - use local_mesh_r - use local_dims_tp - use local_mesh_tp -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: i,j,k -c -c----------------------------------------------------------------------- -c -c ****** Set matrix coefs -c - do k=2,npm1 - do j=2,ntm1 - do i=2,nrm1 -c a*ps(i,j,k-1): - a(i,j,k,1)=-drh(i)*dth(j)*sth_i(j)*dp_i(k-1) -c a*ps(i,j-1,k): - a(i,j,k,2)=-drh(i)*dph(k)*st(j-1)*dt_i(j-1) -c a*ps(i-1,j,k): - a(i,j,k,3)=-sth(j)*dth(j)*dph(k)*r2(i-1)*dr_i(i-1) -c a*ps(i+1,j,k): - a(i,j,k,5)=-sth(j)*dth(j)*dph(k)*r2(i )*dr_i(i ) -c a*ps(i,j+1,k): - a(i,j,k,6)=-drh(i)*dph(k)*st(j )*dt_i(j ) -c a*ps(i,j,k+1): - a(i,j,k,7)=-drh(i)*dth(j)*sth_i(j)*dp_i(k ) -c -c a*ps(i,j,k): - a(i,j,k,4)=-(a(i,j,k,1)+a(i,j,k,2)+a(i,j,k,3)+ - & a(i,j,k,5)+a(i,j,k,6)+a(i,j,k,7)) - enddo - enddo - enddo -c - return - end -c####################################################################### - subroutine load_preconditioner_pot3d_solve -c -c----------------------------------------------------------------------- -c -c ****** Load the preconditioner for the pot3d solve. 
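-c
-c ****** Two preconditioners are available (selected by IFPREC):
-c ******   IFPREC=1: point-Jacobi; store 1/diag(A) in A_I.
-c ******   IFPREC=2: ILU(0); convert A to CSR, factor it in place,
-c ******             and re-order the factors for LUSOL (or hand
-c ******             them to cuSPARSE when built with -DCUSPARSE).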
-c -c----------------------------------------------------------------------- -c - use number_types - use matrix_storage_pot3d_solve - use cgcom - use local_dims_r - use local_dims_tp - use, intrinsic :: iso_c_binding - use cusparse_interface - use timing -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: one=1._r_typ -c -c----------------------------------------------------------------------- -c - integer :: i,j,k,icode,ii -c -c----------------------------------------------------------------------- -c - call timer_on -c - if (ifprec.eq.0) return -c - if (ifprec.eq.1) then -c -c ****** Diagonal scaling: -c - ii=0 - do k=2,npm1 - do j=2,ntm1 - do i=2,nrm1 - ii=ii+1 - a_i(ii)=real(one/a(i,j,k,4),r_typ_pc) - enddo - enddo - enddo -c - elseif (ifprec.eq.2) then -c -c ****** Convert A matrix into CSR format: -c - call diacsr (N,M,a,a_offsets,a_csr,a_csr_ja, - & a_csr_ia,a_csr_dptr) -#ifdef CUSPARSE - cN=N - cM=M - call load_lusol_cusparse (C_LOC(a_csr(1)), - & C_LOC(a_csr_ia(1)), - & C_LOC(a_csr_ja(1)),cN,cM) -#else -c -c ****** Overwrite CSR A with preconditioner L and U matrices: -c -c ****** Incomplete LU (ILU) -c - icode=0 - call ilu0 (N,M,a_csr,a_csr_ja,a_csr_ia,a_csr_dptr,icode) -c - if (icode.ne.0) then - print*, '### ERROR IN ILU FORMATION' - endif -c -c ****** Convert LU stored in A to LU matrix in optimized layout. -c - call lu2luopt (N,M,lu_csr,a_csr,a_csr_ia,a_csr_ja,lu_csr_ja, - & a_csr_dptr,a_N1,a_N2) -c -c ****** Store inverse of diagonal of LU matrix. -c - do i=1,N - a_csr_d(i)=one/a_csr(a_csr_dptr(i)) - enddo -c -#endif - endif -c - call timer_off (t_pc_load) -c - end subroutine -c####################################################################### - subroutine ilu0 (N,M,A,JA,IA,A_da,icode) -c -c----------------------------------------------------------- -c -c Set-up routine for ILU(0) preconditioner. This routine -c computes the L and U factors of the ILU(0) factorization -c of a general sparse matrix A stored in CSR format with -c 1-based indexing. Since -c L is unit triangular, the L and U factors can be stored -c as a single matrix which occupies the same storage as A. -c New ja and ia arrays are not needed for the LU matrix -c since the pattern of the LU matrix is identical with -c that of A. -c -c Original Author: Yousef Saad -c Iterative Methods for Sparse Linear Systems 2nd Ed. pg. 309 -c Modified by R.M. Caplan -c -c----------------------------------------------------------- -c INPUT: -c N : Dimension of matrix -c A, JA, IA : Sparse matrix in CSR sparse storage format -c A_da : Pointers to the diagonal elements in the CSR -c data structure luval -c -c OUTPUT: -c A : L/U matrices stored together. On return A, -c JA, and IA are the combined CSR data structure for -c the L and U factors. -c icode : Integer indicating error code on return: -c (0): Normal return. -c (k): Encountered a zero pivot at step k. 
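-c
-c     Schematically (IKJ variant, restricted to the sparsity
-c     pattern of A):
-c
-c       do i=2,N
-c         for each k in row i with k < i:
-c           a(i,k) = a(i,k)/a(k,k)
-c           for each j in row k with j > k:
-c             if (i,j) is in the pattern:
-c               a(i,j) = a(i,j) - a(i,k)*a(k,j)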
-c------------------------------------------------------------
-c
-      use number_types
-      use number_types_pc
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      integer :: JA(M),IA(N+1),A_da(N),iw(N)
-      integer :: icode,N,M
-      real(r_typ_pc) :: A(M)
-c
-c-----------------------------------------------------------------------
-c
-      integer :: i,ik,kj,k,ij,IA_i,IA_ip1m1
-      real(r_typ_pc) :: Aik
-c
-c-----------------------------------------------------------------------
-c
-      icode=0
-c     Initialize scratch index array:
-      iw(:)=0
-c
-      do i=2,N
-c       Store the index of (i,j) in A in scratch array iw(j=1:N).
-c       This allows lookup, given a column index (j) in row (k),
-c       of whether that column is also present in row (i).
-        IA_i    =IA(i)
-        IA_ip1m1=IA(i+1)-1
-c
-        do ij=IA_i,IA_ip1m1
-          iw(JA(ij))=ij
-        enddo
-c
-c       Loop from first element in row i to 1 less than diagonal elem:
-        do ik=IA_i,A_da(i)-1     !ik is index of (i,k) in A[]
-          k  =JA(ik)             !Actual column index in matrix (k)
-          Aik=A(ik)/A(A_da(k))   !Save Aik for next loop as an optim.
-          A(ik)=Aik
-c
-c         Loop from 1 more than diag elem to last elem in row k:
-          do kj=A_da(k)+1,IA(k+1)-1  !kj is index of (k,j) in A[]
-c           Get ij location from scratch array (if 0, no ij present):
-            ij=iw(JA(kj))
-            if (ij .ne. 0) then
-              A(ij)=A(ij)-Aik*A(kj)
-            endif
-          enddo
-        enddo
-c
-c       After the loop, ik points at the diagonal element (i,i),
-c       so this detects a zero pivot:
-        if (A(ik).eq.0) then
-          icode=i
-          exit
-        endif
-c
-c       Reset scratch index array:
-        do ij=IA_i,IA_ip1m1
-          iw(JA(ij))=0
-        enddo
-      enddo
-c
-      end subroutine
-c#######################################################################
-      subroutine lu2luopt (N,M,LU,A,IA,JA,LUJA,A_da,N1,N2)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Re-order elements of the LU matrix in CSR format into a
-c ****** custom, optimized format for use with lusol().
-c ****** (Eventually, this could be merged with ilu0 and/or diacsr.)
-c
-c-----------------------------------------------------------------------
-c
-      use number_types
-      use number_types_pc
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      integer :: N,M
-      integer :: JA(M),LUJA(M),IA(N+1),A_da(N)
-      integer :: N1(N),N2(N)
-      real(r_typ_pc) :: A(M),LU(M)
-c
-c-----------------------------------------------------------------------
-c
-      integer :: i,k,ii
-c
-c-----------------------------------------------------------------------
-c
-      ii=0
-c
-      do i=1,N
-        do k=IA(i),A_da(i)-1
-          ii=ii+1
-          LU(ii)=A(k)
-          LUJA(ii)=JA(k)
-        enddo
-c
-c       Store k1 and k2 ranges for lusolve:
-c
-        N1(i)=A_da(i)-1-IA(i)
-        N2(i)=IA(i+1)-2-A_da(i)
-      enddo
-c
-      do i=N,1,-1
-        do k=A_da(i)+1,IA(i+1)-1
-          ii=ii+1
-          LU(ii)=A(k)
-          LUJA(ii)=JA(k)
-        enddo
-      enddo
-c
-      end subroutine
-c#######################################################################
-      subroutine diacsr (N,M,Adia,ioff,Acsr,JA,IA,Adptr)
-c
-c-----------------------------------------------------------------------
-c
-c *** DIACSR converts a solver matrix in a MAS-style
-c     diagonal format to standard compressed sparse row (CSR),
-c     including periodic coefficients when nproc_p=1.
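-c
-c     For the 7-point stencil used here, the offsets are
-c     0, +-1, +-nrm2, +-nrm2*ntm2.  Each CSR row is built in three
-c     sweeps: periodic coefficients that wrap past the end of the
-c     matrix (small resulting column indices) first, then the main
-c     band, then those that wrap below row 1 (large column indices),
-c     so that JA comes out in ascending order without a sort.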
-c
-c     Author of original diacsr: Yousef Saad
-c     Modifications for MAS: RM Caplan
-c
-c     Input:
-c          N:             Size of the matrix (NxN)
-c          M:             Number of non-zero entries in matrix
-c                         (computed with getM())
-c          Adia(N,IDIAG): The matrix in modified "DIA" format
-c          ioff(IDIAG):   Offsets of the diagonals in A.
-c
-c     Output:
-c          Acsr(M), JA(M), IA(N+1): The matrix A in CSR.
-c          Adptr(N):                Pointers to diag elements in A,
-c                                   [e.g. A(i,i) == A(Adptr(i))]
-c
-c-----------------------------------------------------------------------
-c
-      use number_types
-      use number_types_pc
-      use local_dims_r
-      use local_dims_tp
-      use mpidefs
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      integer, parameter :: IDIAG=7
-c
-c-----------------------------------------------------------------------
-c
-      real (r_typ_pc) :: Acsr(M)
-      real (r_typ) :: Adia(N,IDIAG)
-      integer :: N,M
-      integer :: Adptr(N)
-      integer :: IA(N+1)
-      integer :: JA(M)
-      integer :: ioff(IDIAG)
-c
-c-----------------------------------------------------------------------
-c
-      integer :: i,j,jj,mi,mj,mk,ko,x
-      integer :: ioffok(IDIAG)
-c
-c-----------------------------------------------------------------------
-c
-      x=0
-c
-      IA(1)=1
-      ko=1
-      i=0
-c
-      do mk=2,npm1
-        do mj=2,ntm1
-          do mi=2,nrm1
-c ********* Set index of value and column indices array:
-            i=i+1
-c
-c ********* Do not add coefs that multiply boundaries:
-c           For each boundary, there is a sub-set of coefs in the
-c           matrix row that should not be added.
-c           This makes "local" matrices have no bc info.
-c
-c ********* Reset "i-offset-ok-to-use-coef-jj" array:
-c
-            ioffok(:)=1
-c
-            if (mi.eq.2) then
-              ioffok(3)=0
-            endif
-c
-            if (mi.eq.nrm1) then
-              ioffok(5)=0
-            endif
-c
-            if (mj.eq.2) then
-              ioffok(2)=0
-            endif
-c
-            if (mj.eq.ntm1) then
-              ioffok(6)=0
-            endif
-c
-c ********* Eliminate periodic coefs in the case nproc_p>1:
-c
-            if (nproc_p.gt.1) then
-              if (mk.eq.2) then
-                ioffok(1)=0
-              endif
-              if (mk.eq.npm1) then
-                ioffok(7)=0
-              endif
-            endif
-c
-c ********* To handle the periodicity of phi in the nproc_p=1 case:
-c           We want the CSR matrix to be in order, so we
-c           sweep three times to avoid sorting.
-c
-c ********* Add periodic coefs of "right side":
-c
-            do jj=1,IDIAG
-              if (ioffok(jj).eq.1) then
-                j=i+ioff(jj)-x
-                if (j.gt.N-x) then
-                  j=j-N
-                  Acsr(ko)=real(Adia(i,jj),r_typ_pc)
-                  JA(ko)=j
-                  ko=ko+1
-                endif
-              endif
-            enddo
-c
-c ********* Now do non-periodic coefs:
-c
-            do jj=1,IDIAG
-              if (ioffok(jj).eq.1) then
-                j=i+ioff(jj)-x
-                if (j.ge.1.and.j.le.N-x) then
-c                 Store pointer to diagonal elements in A:
-                  if (jj.eq.4) Adptr(i)=ko
-                  Acsr(ko)=real(Adia(i,jj),r_typ_pc)
-                  JA(ko)=j
-                  ko=ko+1
-                endif
-              endif
-            enddo
-c
-c ********* Now do periodic coefs of "left side":
-c
-            do jj=1,IDIAG
-              if (ioffok(jj).eq.1) then
-                j=i+ioff(jj)-x
-                if (j.lt.1) then
-                  j=N+j
-                  Acsr(ko)=real(Adia(i,jj),r_typ_pc)
-                  JA(ko)=j
-                  ko=ko+1
-                endif
-              endif
-            enddo
-c
-c ********* Set row offset:
-c
-            IA(i+1)=ko-x
-          enddo
-        enddo
-      enddo
-c
-      return
-      end
-c#######################################################################
-      subroutine getM (N, ioff, M)
-c
-c-----------------------------------------------------------------------
-c
-c *** This routine computes the number of non-zeros in the
-c     solver matrix, for use in allocating the matrices.
-c     See diacsr() for a description of the inputs.
-c
-c     Output: M  # of nonzeros.
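-c
-c     Note: M .le. 7*N.  Interior rows contribute 7 nonzeros; rows
-c     next to the r and theta boundaries (and, when nproc_p > 1,
-c     the periodic phi boundaries) have the corresponding stencil
-c     coefficients masked off, exactly as in diacsr().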
-c
-c-----------------------------------------------------------------------
-c
-      use mpidefs
-      use local_dims_r
-      use local_dims_tp
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      integer, parameter :: IDIAG=7
-      integer :: N,M,i,j,jj,ko,mi,mj,mk,x
-      integer :: ioff(IDIAG)
-      integer :: ioffok(IDIAG)
-c
-      x=0
-c
-      ko=1
-      i=0
-c
-      do mk=2,npm1
-        do mj=2,ntm1
-          do mi=2,nrm1
-c
-            ioffok(:)=1
-c
-            if (mi.eq.2) then
-              ioffok(3)=0
-            endif
-c
-            if (mi.eq.nrm1) then
-              ioffok(5)=0
-            endif
-c
-            if (mj.eq.2) then
-              ioffok(2)=0
-            endif
-c
-            if (mj.eq.ntm1) then
-              ioffok(6)=0
-            endif
-c
-c ********* Eliminate periodic coefs in the case nproc_p>1:
-c
-            if (nproc_p.gt.1) then
-              if (mk.eq.2) then
-                ioffok(1)=0
-              endif
-              if (mk.eq.npm1) then
-                ioffok(7)=0
-              endif
-            endif
-c
-c           The three sweeps below mirror diacsr(); since every
-c           retained offset falls into exactly one of the three
-c           ranges, each coefficient is counted exactly once.
-c
-            do jj=1,IDIAG
-              if (ioffok(jj).eq.1) then
-                j=i+ioff(jj)-x
-                if (j.gt.N-x) then
-                  ko=ko+1
-                endif
-              endif
-            enddo
-c
-            do jj=1,IDIAG
-              if (ioffok(jj).eq.1) then
-                j=i+ioff(jj)-x
-                if (j.ge.1.and.j.le.N-x) then
-                  ko=ko+1
-                endif
-              endif
-            enddo
-c
-            do jj=1,IDIAG
-              if (ioffok(jj).eq.1) then
-                j=i+ioff(jj)-x
-                if (j.lt.1) then
-                  ko=ko+1
-                endif
-              endif
-            enddo
-          enddo
-        enddo
-      enddo
-c
-c *** Save the number of non-zeros of the matrix:
-c
-      M=ko-1
-c
-      return
-      end
-c#######################################################################
-      subroutine ax (x,y,N)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Set y = A * x.
-c
-c-----------------------------------------------------------------------
-c
-      use number_types
-      use local_dims_r
-      use local_dims_tp
-      use fields, ONLY : x_ax
-      use timing
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      real(r_typ), parameter :: zero=0._r_typ
-c
-c-----------------------------------------------------------------------
-c
-      integer :: N
-      real(r_typ), dimension(N) :: x,y
-c
-c-----------------------------------------------------------------------
-c
-c ****** Expand X array to allow for boundary and seam values.
-c
-      call unpack_scalar (x_ax,x)
-c
-c ****** Set the boundary values of X.
-c
-      call set_boundary_points (x_ax,zero)
-c
-c ****** Seam along edges between processors.
-c
-      call seam (x_ax,nr,nt,np)
-c
-c ****** Get the matrix-vector product.
-c
-      call timer_on
-c
-      call delsq (x_ax,y)
-c
-      call timer_off (t_ax)
-c
-      end subroutine
-c#######################################################################
-      subroutine prec_inv (x)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Apply preconditioner: x := M(inv) * x.
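-c
-c ****** Here M is diag(A) when IFPREC=1, and the ILU(0) factors
-c ****** L*U when IFPREC=2.  In the latter case, the triangular
-c ****** solves are done in reduced (r_typ_pc) precision, either
-c ****** by LUSOL below or by cuSPARSE when built with -DCUSPARSE.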
-c
-c-----------------------------------------------------------------------
-c
-      use number_types
-      use number_types_pc
-      use cgcom
-      use solve_params
-      use matrix_storage_pot3d_solve
-      use, intrinsic :: iso_c_binding
-      use cusparse_interface
-      use timing
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      real(r_typ), dimension(N) :: x
-      real(r_typ_pc), dimension(N) :: x_32
-      integer :: i
-c
-c-----------------------------------------------------------------------
-c
-      if (ifprec.eq.0) return
-c
-      call timer_on
-c
-      if (ifprec.eq.1) then
-c
-c ****** Point-Jacobi (diagonal scaling):
-c
-        do concurrent (i=1:N)
-          x(i)=a_i(i)*x(i)
-        enddo
-c
-      elseif (ifprec.eq.2) then
-c
-c ****** ILU0 Partial-Block-Jacobi:
-c
-#ifdef CUSPARSE
-        call lusol_cusparse(C_LOC(x(1)))
-c
-#else
-c
-c ****** Convert input array to single precision.
-c
-        do i=1,N
-          x_32(i) = real(x(i),r_typ_pc)
-        enddo
-c
-        call lusol (N,M,x_32,lu_csr,lu_csr_ja,a_N1,a_N2,a_csr_d)
-c
-c ****** Convert result back to double precision.
-c
-        do i=1,N
-          x(i) = real(x_32(i),r_typ)
-        enddo
-c
-#endif
-      endif
-c
-      call timer_off (t_pc)
-c
-      end subroutine
-c#######################################################################
-      subroutine lusol (N,M,x,LU,LU_ja,N1,N2,LUd_i)
-c
-c-----------------------------------------------------------
-c
-c     Performs a forward and a backward solve for the sparse system
-c     (LU) x=y, where LU is in an optimized custom CSR format
-c     (see lu2luopt()).
-c
-c     For use where LU is an ILU or SSOR/SGS factorization.
-c
-c     Author of original lusol: Yousef Saad
-c     Iterative Methods for Sparse Linear Systems 2nd Ed. pg. 299
-c
-c     Modified by RM Caplan to include optimized memory access
-c     as described in
-c     B. Smith, H. Zhang  Inter. J. of High Perf. Comp. Appl.
-c     Vol. 25 #4 pg. 386-391 (2011)
-c
-c-----------------------------------------------------------
-c     PARAMETERS:
-c     N     : Dimension of problem
-c     x     : At input, x is the rhs (y); at output, x is the solution.
-c     LU    : Values of the LU matrix.  L and U are stored together in
-c             order of access in this routine.
-c     LU_ja : Column indices of elements in LU.
-c     N1    : Per-row loop extents for the L part (the number of
-c             strictly lower-triangular entries in row i, minus one).
-c     N2    : Per-row loop extents for the U part (the number of
-c             strictly upper-triangular entries in row i, minus one).
-c     LUd_i : Inverse diagonal elements of U
-c------------------------------------------------------------
-c
-      use number_types
-      use number_types_pc
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      real(r_typ_pc) :: x(N),LUd_i(N),LU(M)
-      integer :: N1(N),N2(N),LU_ja(M)
-      integer :: N,M
-c
-c-----------------------------------------------------------------------
-c
-      integer :: i,k,k1,k2
-c
-c-----------------------------------------------------------------------
-c
-c ****** FORWARD SOLVE:  Solve L x'=y
-c
-      k2=0
-      do i=1,N
-c       Compute x(i) := x(i) - sum L(i,j) * x(j)
-        k1=k2+1
-        k2=k1+N1(i)
-        do k=k1,k2
-          x(i)=x(i)-LU(k)*x(LU_ja(k))
-        enddo
-c       Diagonal is always 1 for L, so no division here is necessary.
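-c       Note that k1:k2 walks the re-ordered LU array contiguously;
-c       this is the memory-access optimization set up by lu2luopt().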
- enddo -c -c ****** BACKWARD SOLVE: Solve U x=x' -c - do i=N,1,-1 -c Compute x(i) := x(i) - sum U(i,j) * x(j) - k1=k2+1 - k2=k1+N2(i) - do k=k1,k2 - x(i)=x(i)-LU(k)*x(LU_ja(k)) - enddo -c Compute x(i) := x(i) / U(i,i) - x(i)=x(i)*LUd_i(i) - enddo -c - end subroutine -c####################################################################### - subroutine unpack_scalar (s,x) -c -c----------------------------------------------------------------------- -c -c ****** Unpack the scalar x into -c ****** three-dimensional array s leaving room for boundaries. -c -c----------------------------------------------------------------------- -c - use number_types - use local_dims_r - use local_dims_tp -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), dimension(nr,nt,np) :: s - real(r_typ), dimension(2:nrm1,2:ntm1,2:npm1) :: x -c -c----------------------------------------------------------------------- -c - integer :: i,j,k -c -c----------------------------------------------------------------------- -c - do concurrent (k=2:npm1, j=2:ntm1, i=2:nrm1) - s(i,j,k)=x(i,j,k) - enddo -c - return - end -c####################################################################### - subroutine delsq (x,y) -c -c----------------------------------------------------------------------- -c -c ****** Set Y = - (dV * del-squared X) at the internal points. -c -c----------------------------------------------------------------------- -c - use number_types - use local_dims_r - use local_dims_tp - use matrix_storage_pot3d_solve -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), dimension(nr,nt,np) :: x - real(r_typ), dimension(2:nrm1,2:ntm1,2:npm1) :: y -c -c----------------------------------------------------------------------- -c - integer :: i,j,k -c -c----------------------------------------------------------------------- -c - do concurrent (k=2:npm1, j=2:ntm1, i=2:nrm1) - y(i,j,k)=a(i,j,k,1)*x(i ,j ,k-1) - & +a(i,j,k,2)*x(i ,j-1,k ) - & +a(i,j,k,3)*x(i-1,j ,k ) - & +a(i,j,k,4)*x(i ,j ,k ) - & +a(i,j,k,5)*x(i+1,j ,k ) - & +a(i,j,k,6)*x(i ,j+1,k ) - & +a(i,j,k,7)*x(i ,j ,k+1) - enddo -c - return - end -c####################################################################### - subroutine set_boundary_points (x,vmask) -c -c----------------------------------------------------------------------- -c -c ****** Set boundary points of X at the physical boundaries. -c -c----------------------------------------------------------------------- -c - use number_types - use global_mesh - use local_dims_r - use local_mesh_r - use local_dims_tp - use local_mesh_tp - use fields - use solve_params - use vars, ONLY : validation_run -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), dimension(nr,nt,np) :: x - real(r_typ) :: vmask,temp_sum0 -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: two=2._r_typ -c -c----------------------------------------------------------------------- -c - integer :: i,j,k -c -c----------------------------------------------------------------------- -c -c ****** Set X at the radial boundaries. 
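-c
-c ****** At r=R0, the Neumann condition d(phi)/dr = Br is imposed
-c ****** through the ghost value:  X(1,j,k)=X(2,j,k)-VMASK*BR0*DR1.
-c ****** At r=R1, a validation run imposes the analytic BR1 the same
-c ****** way; otherwise X(NR)=PM_R1*X(NRM1), which at the half-mesh
-c ****** boundary point effectively gives d(phi)/dr=0 for PM_R1=+1
-c ****** and phi=0 (source surface) for PM_R1=-1.  VMASK=1 applies
-c ****** the physical (inhomogeneous) data, while VMASK=0, as used
-c ****** in AX(), applies the homogeneous version of the same
-c ****** conditions.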
-c - if (rb0) then - do concurrent (k=2:npm1, j=2:ntm1) - x( 1,j,k)=x(2,j,k)-vmask*br0(j,k)*dr1 - enddo - end if -c - if (rb1) then - if (validation_run) then - do concurrent (k=2:npm1, j=2:ntm1) - x(nr,j,k)=x(nrm1,j,k)+vmask*br1(j,k)*drn - enddo - else - do concurrent (k=2:npm1, j=2:ntm1) - x(nr,j,k)=pm_r1*x(nrm1,j,k) - enddo - end if - end if -c -c ****** If this processor does not contain any points at the -c ****** pole, return. -c - if (.not.(tb0.or.tb1)) return -c -c ****** Get the m=0 component of X at the poles. -c - if (tb0) then - do concurrent (i=1:nr) - sum0(i)=0 - enddo - do concurrent (i=1:nr) - temp_sum0=0. - do concurrent (k=2:npm1) reduce(+:temp_sum0) - temp_sum0=temp_sum0+x(i,2,k)*dph(k)*pl_i - enddo - sum0(i)=temp_sum0 - enddo - end if -c - if (tb1) then - do concurrent (i=1:nr) - sum1(i)=0 - enddo - do concurrent (i=1:nr) - temp_sum0=0. - do concurrent (k=2:npm1) reduce(+:temp_sum0) - temp_sum0=temp_sum0+x(i,ntm1,k)*dph(k)*pl_i - enddo - sum1(i)=temp_sum0 - enddo - end if -c -c ****** Sum over all processors. -c - call sum_over_phi (nr,sum0,sum1) -c -c ****** Set X to have only an m=0 component at the poles. -c - if (tb0) then - do concurrent (k=2:npm1, i=1:nr) - x(i,1,k)=two*sum0(i)-x(i,2,k) - enddo - end if -c - if (tb1) then - do concurrent (k=2:npm1, i=1:nr) - x(i,nt,k)=two*sum1(i)-x(i,ntm1,k) - enddo - end if -c - return - end subroutine -c####################################################################### - subroutine sum_over_phi (n,a0,a1) -c -c----------------------------------------------------------------------- -c -c ****** Sum the contribution over all processors in the phi -c ****** dimension (only for processors with points on the poles). -c -c ****** The sum is performed for all N points in the vectors -c ****** SUM0(N) and SUM1(N), at the North and South pole, -c ****** respectively. -c -c----------------------------------------------------------------------- -c - use number_types - use local_dims_tp - use mpidefs - use timing -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: n - real(r_typ), dimension(n) :: a0,a1 -c -c----------------------------------------------------------------------- -c -c ****** MPI error return. -c - integer :: ierr -c -c----------------------------------------------------------------------- -c - call timer_on -c - if (tb0) then - call MPI_Allreduce (MPI_IN_PLACE,a0,n,ntype_real, - & MPI_SUM,comm_phi,ierr) - end if -c - if (tb1) then - call MPI_Allreduce (MPI_IN_PLACE,a1,n,ntype_real, - & MPI_SUM,comm_phi,ierr) - end if -c - call timer_off (c_sumphi) -c - return - end -c####################################################################### - subroutine zero_boundary_points (x) -c -c----------------------------------------------------------------------- -c -c ****** Set the boundary points at the physical boundaries -c ****** of X to zero. -c -c----------------------------------------------------------------------- -c - use number_types - use local_dims_r - use local_dims_tp -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ), dimension(nr,nt,np) :: x -c -c----------------------------------------------------------------------- -c - if (rb0) x( 1,:,:)=0. - if (rb1) x(nr,:,:)=0. - if (tb0) x(:, 1,:)=0. - if (tb1) x(:,nt,:)=0. 
-c
-      return
-      end
-c#######################################################################
-      function cgdot (x,y,N)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Get the dot product of the vectors X and Y.
-c
-c-----------------------------------------------------------------------
-c
-      use number_types
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      integer :: N,i
-      real(r_typ) :: cgdot
-      real(r_typ), dimension(N) :: x,y
-c
-c-----------------------------------------------------------------------
-c
-      cgdot=0.
-c
-      do concurrent (i=1:N) reduce(+:cgdot)
-        cgdot=cgdot+x(i)*y(i)
-      enddo
-c
-c ****** Sum over all the processors.
-c
-      call global_sum (cgdot)
-c
-      return
-      end function
-c#######################################################################
-      subroutine global_sum (x)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Overwrite X by its sum over all processors.
-c
-c-----------------------------------------------------------------------
-c
-      use number_types
-      use mpidefs
-      use timing
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      real(r_typ) :: x
-c
-c-----------------------------------------------------------------------
-c
-c ****** MPI error return.
-c
-      integer :: ierr
-c
-c-----------------------------------------------------------------------
-c
-      call timer_on
-c
-c ****** Take the sum over all the processors.
-c
-      call MPI_Allreduce (MPI_IN_PLACE,x,1,ntype_real,
-     &                    MPI_SUM,comm_all,ierr)
-c
-      call timer_off (c_cgdot)
-c
-      end subroutine
-c#######################################################################
-      subroutine seam (a,n1,n2,n3)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Seam the boundary points of 3D (r,t,p) array A between
-c ****** adjacent processors.
-c
-c ****** This routine assumes that there is a two-point
-c ****** overlap between processors in each dimension.
-c
-c-----------------------------------------------------------------------
-c
-c ****** This version uses non-blocking MPI sends and receives
-c ****** whenever possible in order to overlap communications.
-c
-c-----------------------------------------------------------------------
-c
-      use number_types
-      use mpidefs
-      use timing
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      real(r_typ), dimension(n1,n2,n3) :: a
-c
-c-----------------------------------------------------------------------
-c
-      real(r_typ), dimension(n2,n3) :: sbuf11,rbuf11
-      real(r_typ), dimension(n2,n3) :: sbuf12,rbuf12
-      real(r_typ), dimension(n1,n3) :: sbuf21,rbuf21
-      real(r_typ), dimension(n1,n3) :: sbuf22,rbuf22
-c
-c-----------------------------------------------------------------------
-c
-c ****** MPI error return.
-c
-      integer :: ierr
-c
-c ****** MPI tag for MPI_ISEND and MPI_IRECV (not tagged).
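-c
-c ****** Overlap sketch (illustrative): with the two-point overlap,
-c ****** points 2..n-1 of each local dimension are owned, while
-c ****** points 1 and n are ghost copies of the neighbors' owned
-c ****** edge points:
-c ******
-c ******    this rank:       [ 1 | 2 ... n-1 | n ]
-c ******    left  neighbor:  its point n-1 -> our point 1
-c ******    right neighbor:  its point 2   -> our point n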
-c - integer :: tag=0 -c -c----------------------------------------------------------------------- -c - integer :: lbuf,i,j - integer :: n1,n2,n3 - integer :: reqs(4) -c -c----------------------------------------------------------------------- -c - call timer_on -c -c ****** Seam the third (periodic) dimension. -c ****** Since halo data is stride-1, no need for buffers. -c - lbuf=n1*n2 -c - call MPI_Isend (a(:,:,n3-1),lbuf,ntype_real,iproc_pp,tag, - & comm_all,reqs(1),ierr) -c - call MPI_Isend (a(:,:, 2),lbuf,ntype_real,iproc_pm,tag, - & comm_all,reqs(2),ierr) -c - call MPI_Irecv (a(:,:, 1),lbuf,ntype_real,iproc_pm,tag, - & comm_all,reqs(3),ierr) -c - call MPI_Irecv (a(:,:,n3),lbuf,ntype_real,iproc_pp,tag, - & comm_all,reqs(4),ierr) -c - call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) -c -c ****** Seam the first dimension. -c - if (nproc_r.gt.1) then -c -c - lbuf=n2*n3 -c - do concurrent (j=1:n3, i=1:n2) - sbuf11(i,j)=a(n1-1,i,j) - sbuf12(i,j)=a( 2,i,j) - enddo -c - call MPI_Isend (sbuf11,lbuf,ntype_real,iproc_rp,tag, - & comm_all,reqs(1),ierr) -c - call MPI_Isend (sbuf12,lbuf,ntype_real,iproc_rm,tag, - & comm_all,reqs(2),ierr) -c - call MPI_Irecv (rbuf11,lbuf,ntype_real,iproc_rm,tag, - & comm_all,reqs(3),ierr) -c - call MPI_Irecv (rbuf12,lbuf,ntype_real,iproc_rp,tag, - & comm_all,reqs(4),ierr) -c - call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) -c - if (iproc_rm.ne.MPI_PROC_NULL) then - do concurrent (j=1:n3, i=1:n2) - a( 1,i,j)=rbuf11(i,j) - enddo - end if -c - if (iproc_rp.ne.MPI_PROC_NULL) then - do concurrent (j=1:n3, i=1:n2) - a(n1,i,j)=rbuf12(i,j) - enddo - end if -c - end if -c -c ****** Seam the second dimension. -c - if (nproc_t.gt.1) then -c -c - lbuf=n1*n3 -c - do concurrent (j=1:n3, i=1:n1) - sbuf21(i,j)=a(i,n2-1,j) - sbuf22(i,j)=a(i, 2,j) - enddo -c - call MPI_Isend (sbuf21,lbuf,ntype_real,iproc_tp,tag, - & comm_all,reqs(1),ierr) -c - call MPI_Isend (sbuf22,lbuf,ntype_real,iproc_tm,tag, - & comm_all,reqs(2),ierr) -c - call MPI_Irecv (rbuf21,lbuf,ntype_real,iproc_tm,tag, - & comm_all,reqs(3),ierr) -c - call MPI_Irecv (rbuf22,lbuf,ntype_real,iproc_tp,tag, - & comm_all,reqs(4),ierr) -c - call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) -c - if (iproc_tm.ne.MPI_PROC_NULL) then - do concurrent (j=1:n3, i=1:n1) - a(i, 1,j)=rbuf21(i,j) - enddo - end if -c - if (iproc_tp.ne.MPI_PROC_NULL) then - do concurrent (j=1:n3, i=1:n1) - a(i,n2,j)=rbuf22(i,j) - enddo - end if -c - end if -c - call timer_off (c_seam) -c - end subroutine -c####################################################################### - subroutine write_solution -c -c----------------------------------------------------------------------- -c -c ****** Write the global solution. -c -c----------------------------------------------------------------------- -c - use number_types - use global_dims - use global_mesh - use fields - use vars - use solve_params - use mpidefs - use decomposition - use assemble_array_interface - use timing -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c -c ****** Declaration for the global arrays. -c ****** These arrays are only allocated on processor IPROC0. 
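-c ****** (On all other ranks they get a dummy (1,1,1) allocation
-c ****** below so that they remain valid allocated actual arguments
-c ****** when passed to ASSEMBLE_ARRAY.)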
-c
-      real(r_typ), dimension(:,:,:), allocatable :: phi_g
-      real(r_typ), dimension(:,:,:), allocatable :: br_g
-      real(r_typ), dimension(:,:,:), allocatable :: bt_g
-      real(r_typ), dimension(:,:,:), allocatable :: bp_g
-c
-c-----------------------------------------------------------------------
-c
-      integer :: ierr
-      character(256) :: fname
-c
-c-----------------------------------------------------------------------
-c
-      call timer_on
-c
-c ****** Potential.
-c
-      if (phifile.ne.'') then
-c
-c ****** Allocate the global array PHI_G (on processor IPROC0).
-c
-        if (iamp0) then
-          allocate (phi_g(nr_g,nt_g,np_g))
-        else
-          allocate (phi_g(1,1,1))
-        end if
-c
-c ****** Assemble the global PHI array.
-c
-        call assemble_array (map_rh,map_th,map_ph,phi,phi_g)
-c
-        fname=phifile
-c
-c ****** Write out the potential to a file.
-c
-        if (iamp0) then
-          write (*,*)
-          write (*,*) '### COMMENT from WRITE_SOLUTION:'
-          write (*,*)
-          write (*,*) 'Writing the potential to file: ',trim(fname)
-          call wrhdf_3d (fname,.true.,nr_g,nt_g,np_g,
-     &                   phi_g,rh_g,th_g,ph_g,hdf32,ierr)
-        end if
-c
-        deallocate (phi_g)
-c
-      end if
-c
-c ****** Br.
-c
-      if (brfile.ne.'') then
-c
-        fname=brfile
-c
-        if (iamp0) then
-          allocate (br_g(nrm1_g,nt_g,np_g))
-        else
-          allocate (br_g(1,1,1))
-        end if
-c
-c ****** Assemble the global BR array.
-c
-        call assemble_array (map_rm,map_th,map_ph,br,br_g)
-c
-        if (iamp0) then
-          write (*,*)
-          write (*,*) '### COMMENT from WRITE_SOLUTION:'
-          write (*,*)
-          write (*,*) 'Writing Br to file: ',trim(fname)
-          call wrhdf_3d (fname,.true.,nrm1_g,nt_g,np_g,
-     &                   br_g,r_g,th_g,ph_g,hdf32,ierr)
-        end if
-c
-        deallocate (br_g)
-c
-      end if
-c
-c ****** Bt.
-c
-      if (btfile.ne.'') then
-c
-        fname=btfile
-c
-        if (iamp0) then
-          allocate (bt_g(nr_g,ntm1_g,np_g))
-        else
-          allocate (bt_g(1,1,1))
-        end if
-c
-c ****** Assemble the global BT array.
-c
-        call assemble_array (map_rh,map_tm,map_ph,bt,bt_g)
-c
-        if (iamp0) then
-          write (*,*)
-          write (*,*) '### COMMENT from WRITE_SOLUTION:'
-          write (*,*)
-          write (*,*) 'Writing Bt to file: ',trim(fname)
-          call wrhdf_3d (fname,.true.,nr_g,ntm1_g,np_g,
-     &                   bt_g,rh_g,t_g,ph_g,hdf32,ierr)
-c
-        end if
-c
-        deallocate (bt_g)
-c
-      end if
-c
-c ****** Bp.
-c
-      if (bpfile.ne.'') then
-c
-        fname=bpfile
-c
-        if (iamp0) then
-          allocate (bp_g(nr_g,nt_g,npm1_g))
-        else
-          allocate (bp_g(1,1,1))
-        end if
-c
-c ****** Assemble the global BP array.
-c
-        call assemble_array (map_rh,map_th,map_pm,bp,bp_g)
-c
-        if (iamp0) then
-          write (*,*)
-          write (*,*) '### COMMENT from WRITE_SOLUTION:'
-          write (*,*)
-          write (*,*) 'Writing Bp to file: ',trim(fname)
-          call wrhdf_3d (fname,.true.,nr_g,nt_g,npm1_g,
-     &                   bp_g,rh_g,th_g,p_g,hdf32,ierr)
-c
-        end if
-c
-        deallocate (bp_g)
-c
-      end if
-c
-      call timer_off (t_io)
-c
-      end subroutine
-c#######################################################################
-      subroutine getb
-c
-c-----------------------------------------------------------------------
-c
-c ****** Calculate B from grad-phi.
-c
-c-----------------------------------------------------------------------
-c
-      use number_types
-      use global_dims
-      use global_mesh
-      use vars
-      use fields
-      use local_dims_r
-      use local_dims_tp
-      use local_mesh_r
-      use local_mesh_tp
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      integer :: i,j,k
-c
-c-----------------------------------------------------------------------
-c
-c ****** Get Br.
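-c
-c ****** (Worked example, illustrative only: for a monopole
-c ****** potential phi = -1/r stored at the half-mesh radii rh,
-c ****** and with dr(i) = rh(i+1)-rh(i), the difference below gives
-c ******
-c ******    br(i,j,k) = ( 1/rh(i) - 1/rh(i+1) )/dr(i)
-c ******              = 1/(rh(i)*rh(i+1)) ~ 1/r(i)**2,
-c ******
-c ****** i.e. Br = d(phi)/dr at the main-mesh point r(i).)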
-c
-      do concurrent (k=1:np, j=1:nt, i=1:nrm1)
-        br(i,j,k)=(phi(i+1,j,k)-phi(i,j,k))/dr(i)
-      enddo
-c
-c ****** Get Bt.
-c
-      do concurrent (k=1:np, j=1:ntm1, i=1:nr)
-        bt(i,j,k)=(phi(i,j+1,k)-phi(i,j,k))/(rh(i)*dt(j))
-      enddo
-c
-c ****** Get Bp.
-c
-      do concurrent (k=1:npm1, j=1:nt, i=1:nr)
-        bp(i,j,k)=(phi(i,j,k+1)-phi(i,j,k))/(rh(i)*sth(j)*dp(k))
-      enddo
-c
-      end subroutine
-c#######################################################################
-      subroutine magnetic_energy
-c
-c-----------------------------------------------------------------------
-c
-c ****** Calculate magnetic energy from B.
-c
-c-----------------------------------------------------------------------
-c
-      use number_types
-      use global_dims
-      use global_mesh
-      use vars
-      use fields
-      use mpidefs
-      use local_dims_r
-      use local_dims_tp
-      use local_mesh_r
-      use local_mesh_tp
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      real(r_typ), parameter :: half=.5_r_typ
-c
-c-----------------------------------------------------------------------
-c
-      integer :: i,j,k,ierr
-      real(r_typ) :: brav,btav,bpav,dv
-      real(r_typ) :: wr,wt,wp
-      real(r_typ), dimension(3) :: w
-      character(32) :: fmtstr
-c
-c-----------------------------------------------------------------------
-c
-      if (hdf32) then
-        fmtstr="(A, ES14.8)"
-      else
-        fmtstr="(A,ES22.16)"
-      end if
-c
-      wr=0.
-      wt=0.
-      wp=0.
-      do concurrent (i=2:nrm1,j=2:ntm1,k=2:npm1) reduce(+:wr,wt,wp)
-        dv=rh(i)**2*drh(i)*dth(j)*sth(j)*dph(k)
-        brav=half*(br(i,j,k)+br(i-1,j,k))
-        btav=half*(bt(i,j,k)+bt(i,j-1,k))
-        bpav=half*(bp(i,j,k)+bp(i,j,k-1))
-        wr=wr+half*brav**2*dv
-        wt=wt+half*btav**2*dv
-        wp=wp+half*bpav**2*dv
-      enddo
-c
-c ****** Sum up all processors into final values and print.
-c
-      w(1)=wr
-      w(2)=wt
-      w(3)=wp
-      call MPI_Allreduce(MPI_IN_PLACE,w,3,ntype_real,
-     &                   MPI_SUM,comm_all,ierr)
-c
-      if (iamp0) then
-        write (*,*)
-        write (*,*) '### COMMENT from MAGNETIC_ENERGY:'
-        write (*,*) '### Magnetic energy diagnostic:'
-        write (*,*)
-        write (*,trim(fmtstr)) 'Energy in Br**2 = ',w(1)
-        write (*,trim(fmtstr)) 'Energy in Bt**2 = ',w(2)
-        write (*,trim(fmtstr)) 'Energy in Bp**2 = ',w(3)
-        write (*,trim(fmtstr)) 'Magnetic energy = ',SUM(w)
-        write (9,*)
-        write (9,*) '### COMMENT from MAGNETIC_ENERGY:'
-        write (9,*) '### Magnetic energy diagnostic:'
-        write (9,*)
-        write (9,trim(fmtstr)) 'Energy in Br**2 = ',w(1)
-        write (9,trim(fmtstr)) 'Energy in Bt**2 = ',w(2)
-        write (9,trim(fmtstr)) 'Energy in Bp**2 = ',w(3)
-        write (9,trim(fmtstr)) 'Magnetic energy = ',SUM(w)
-      end if
-c
-      end subroutine
-c#######################################################################
-      subroutine assemble_array (map_r,map_t,map_p,a,a_g)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Assemble a global array (into A_G) on processor IPROC0 by
-c ****** fetching the local sections (A) from all the processors.
-c
-c-----------------------------------------------------------------------
-c
-      use number_types
-      use decomposition
-      use mpidefs
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      type(map_struct), dimension(0:nproc-1) :: map_r,map_t,map_p
-      real(r_typ), dimension(:,:,:) :: a,a_g
-c
-c-----------------------------------------------------------------------
-c
-c ****** Storage for the buffers.
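-c
-c ****** Indexing sketch (illustrative, following the usage below):
-c ****** for rank IRANK, MAP_R(IRANK)%I0:%I1 bound the owned
-c ****** (non-overlapping) local indices, %N is their count, and
-c ****** %OFFSET is the global index of the first owned point, so
-c ****** the owned block lands in
-c ******
-c ******    a_g(offset:offset+n-1, ... ).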
-c
-      integer :: lbuf,lsbuf
-      real(r_typ), dimension(:), allocatable :: sbuf
-      real(r_typ), dimension(:), allocatable :: rbuf
-c
-c-----------------------------------------------------------------------
-c
-      integer :: tag=0
-      integer :: irank,l1,l2,l3,i,j,k,ii
-      integer :: i0,j0,k0,i1,j1,k1
-      integer :: i0g,j0g,k0g
-      integer :: ierr
-c
-c-----------------------------------------------------------------------
-c
-      l1=map_r(iproc)%n
-      l2=map_t(iproc)%n
-      l3=map_p(iproc)%n
-      lsbuf=l1*l2*l3
-c
-      i0=map_r(iproc)%i0
-      i1=map_r(iproc)%i1
-      j0=map_t(iproc)%i0
-      j1=map_t(iproc)%i1
-      k0=map_p(iproc)%i0
-      k1=map_p(iproc)%i1
-c
-c ****** Extract 1D array of non-overlapping values from local array.
-c
-      allocate (sbuf(lsbuf))
-c
-      sbuf=reshape(a(i0:i1,j0:j1,k0:k1),(/lsbuf/))
-c
-c ****** If proc0, receive/store local arrays into global array.
-c
-      if (iamp0) then
-        do irank=0,nproc-1
-c
-          l1=map_r(irank)%n
-          l2=map_t(irank)%n
-          l3=map_p(irank)%n
-          lbuf=l1*l2*l3
-c
-          i0g=map_r(irank)%offset
-          j0g=map_t(irank)%offset
-          k0g=map_p(irank)%offset
-c
-c ****** If proc0 is the current rank in loop, simply copy local array.
-          if (iproc==irank) then
-            do k=1,l3
-              do j=1,l2
-                do i=1,l1
-                  ii=l2*l1*(k-1)+l1*(j-1)+i
-                  a_g(i0g+i-1,j0g+j-1,k0g+k-1)=sbuf(ii)
-                enddo
-              enddo
-            enddo
-c ****** Otherwise receive data:
-          else
-            allocate (rbuf(lbuf))
-            call MPI_Recv (rbuf,lbuf,ntype_real,irank,tag,
-     &                     comm_all,MPI_STATUS_IGNORE,ierr)
-            do k=1,l3
-              do j=1,l2
-                do i=1,l1
-                  ii=l2*l1*(k-1)+l1*(j-1)+i
-                  a_g(i0g+i-1,j0g+j-1,k0g+k-1)=rbuf(ii)
-                enddo
-              enddo
-            enddo
-            deallocate(rbuf)
-          end if
-        enddo
-      else
-c
-c ****** Send local array to iproc0.
-c
-        call MPI_Ssend (sbuf,lsbuf,ntype_real,iproc0,tag,comm_all,ierr)
-c
-      end if
-      deallocate (sbuf)
-c
-      return
-      end
-c#######################################################################
-      subroutine timer_on
-c
-c-----------------------------------------------------------------------
-c
-c ****** Push an entry onto the timing stack and initialize
-c ****** a timing event.
-c
-c-----------------------------------------------------------------------
-c
-c ****** This routine can be called in a nested way to measure
-c ****** multiple timing events. Calls to TIMER_ON and TIMER_OFF
-c ****** need to be nested like do-loops in FORTRAN.
-c
-c-----------------------------------------------------------------------
-c
-      use mpidefs
-      use timer
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      if (istack.ge.nstack) then
-        write (*,*)
-        write (*,*) '### WARNING from TIMER_ON:'
-        write (*,*) '### Timing stack depth exceeded.'
-        write (*,*) 'This may signal an incorrect nesting of '//
-     &              'TIMER_ON/TIMER_OFF calls.'
-        write (*,*) 'Timing information will not be valid.'
-        return
-      else
-        istack=istack+1
-      end if
-c
-      tstart(istack)=MPI_Wtime()
-c
-      return
-      end
-c#######################################################################
-      subroutine timer_off (tused)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Increment the CPU time used since the call to TIMER_ON
-c ****** in variable TUSED, and pop an entry off the timing
-c ****** stack.
-c
-c-----------------------------------------------------------------------
-c
-c ****** This routine can be called in a nested way to measure
-c ****** multiple timing events. Calls to TIMER_ON and TIMER_OFF
-c ****** need to be nested like do-loops in FORTRAN.
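-c
-c ****** Usage sketch (illustrative; T_INNER/T_OUTER stand for
-c ****** accumulators such as those in module TIMING):
-c ******
-c ******    call timer_on                 ! push outer event
-c ******      call timer_on               ! push nested inner event
-c ******      call timer_off (t_inner)    ! pop inner, add elapsed
-c ******    call timer_off (t_outer)      ! pop outer, add elapsed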
-c -c----------------------------------------------------------------------- -c - use number_types - use mpidefs - use timer -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ) :: tused -c -c----------------------------------------------------------------------- -c - if (istack.le.0) then - write (*,*) - write (*,*) '### WARNING from TIMER_OFF:' - write (*,*) '### Timing stack cannot be popped.' - write (*,*) 'This may signal an incorrect nesting of '// - & 'TIMER_ON/TIMER_OFF calls.' - write (*,*) 'Timing information will not be valid.' - return - else - istack=istack-1 - end if -c - tused=tused+MPI_Wtime()-tstart(istack+1) -c - return - end -c####################################################################### - subroutine write_timing -c -c----------------------------------------------------------------------- -c -c ****** Write out the timing info. -c -c----------------------------------------------------------------------- -c - use number_types - use mpidefs - use timing -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c -c ****** Timing buffers. -c - integer, parameter :: lbuf=10 - real(r_typ), dimension(lbuf) :: sbuf - real(r_typ), dimension(lbuf,0:nproc-1) :: tbuf -c -c ****** Timing statistics. -c - real(r_typ), dimension(lbuf) :: tmin,tmax,tavg,tsdev -c -c----------------------------------------------------------------------- -c - integer :: ierr,irank - real(r_typ) :: t_tot_avg,c_tot_avg,c_tot -c - character(80) :: tfile='timing.out' -c -c----------------------------------------------------------------------- -c -c ****** Gather the timing information for all processors into TBUF. -c - sbuf(1)=t_solve - sbuf(2)=t_startup - sbuf(3)=t_io - sbuf(4)=t_ax - sbuf(5)=t_pc_load - sbuf(6)=t_pc - sbuf(7)=c_seam - sbuf(8)=c_cgdot - sbuf(9)=c_sumphi - sbuf(10)=t_wall -c - call MPI_Allgather (sbuf,lbuf,ntype_real, - & tbuf,lbuf,ntype_real,comm_all,ierr) -c -c ****** Calculate the timing statistics. -c - tavg=sum(tbuf,dim=2)/nproc - tmin=minval(tbuf,dim=2) - tmax=maxval(tbuf,dim=2) -c - tsdev(:)=0. - do irank=0,nproc-1 - tsdev(:)=tsdev(:)+(tbuf(:,irank)-tavg(:))**2 - enddo - tsdev(:)=sqrt(tsdev(:)/nproc) -c - t_tot_avg=tavg(10) - c_tot_avg=tavg(7)+tavg(8)+tavg(9) -c - if (iamp0) then -c - call ffopen (1,tfile,'rw',ierr) -c - if (ierr.ne.0) then - write (*,*) - write (*,*) '### WARNING from WRITE_TIMING:' - write (*,*) '### Could not create the timing file.' - write (*,*) 'File name: ',trim(tfile) - end if -c - do irank=0,nproc-1 - c_tot=tbuf(7,irank)+tbuf(8,irank)+tbuf(9,irank) - write (1,*) - write (1,100) - write (1,*) - write (1,*) 'Processor id = ',irank - write (1,*) - write (1,200) 'Comm. time in SEAM = ',tbuf(7,irank) - write (1,200) 'Comm. time in CGDOT = ',tbuf(8,irank) - write (1,200) 'Comm. time in SUMPHI = ',tbuf(9,irank) - write (1,*) '------------------------------------' - write (1,200) 'Total comm. 
time = ',c_tot - write (1,*) - write (1,200) 'Time used in start-up = ',tbuf(2,irank) - write (1,200) 'Time used in i/o = ',tbuf(3,irank) - write (1,200) 'Time used in POTFLD = ',tbuf(1,irank) - write (1,200) 'Time used in AX = ',tbuf(4,irank) - write (1,200) 'Time used in PCLOAD = ',tbuf(5,irank) - write (1,200) 'Time used in PC = ',tbuf(6,irank) - write (1,*) '------------------------------------' - write (1,200) 'Total time used = ',tbuf(10,irank) - 100 format (80('-')) - 200 format (1x,a,f12.6) - enddo - write (1,*) - write (1,100) -c - write (1,*) - write (1,*) 'Average times:' - write (1,*) '-------------' - write (1,*) - write (1,300) 'Avg Min Max S. Dev' - write (1,300) '--- --- --- ------' - write (1,400) 'Avg comm. time in SEAM = ', - & tavg(7),tmin(7),tmax(7),tsdev(7) - write (1,400) 'Avg comm. time in CGDOT = ', - & tavg(8),tmin(8),tmax(8),tsdev(8) - write (1,400) 'Avg comm. time in SUMPHI = ', - & tavg(9),tmin(9),tmax(9),tsdev(9) - write (1,400) 'Avg time used in start-up = ', - & tavg(2),tmin(2),tmax(2),tsdev(2) - write (1,400) 'Avg time used in i/o = ', - & tavg(3),tmin(3),tmax(3),tsdev(3) - write (1,400) 'Avg time used in POTFLD = ', - & tavg(1),tmin(1),tmax(1),tsdev(1) - write (1,400) 'Avg time used in AX = ', - & tavg(4),tmin(4),tmax(4),tsdev(4) - write (1,400) 'Avg time used in PCLOAD = ', - & tavg(5),tmin(5),tmax(5),tsdev(5) - write (1,400) 'Avg time used in PC = ', - & tavg(6),tmin(6),tmax(6),tsdev(6) - write (1,400) 'Avg total time = ', - & tavg(10),tmin(10),tmax(10),tsdev(10) - 300 format (1x,33x,a) - 400 format (1x,a,4f12.3) -c - write (1,*) - write (1,200) 'Average time used per proc = ',t_tot_avg - write (1,200) 'Average comm. time per proc = ',c_tot_avg - write (1,*) - write (1,100) - write (1,*) -c - close (1) -c - write(*,*) - write(*,200) 'Total time: ',tmax(10) - write(*,200) 'Max Comm/Imbalance time: ', - & tmax(7)+tmax(8)+tmax(9) - write(*,*) -c - end if -c - end subroutine -c####################################################################### - subroutine readbr (fname,br0_g,ierr) -c -c----------------------------------------------------------------------- -c -c ****** Read in the radial magnetic field at the photosphere -c ****** and interpolate it into array BR0_G. -c -c ****** FNAME is the name of the file to read. -c -c----------------------------------------------------------------------- -c - use number_types - use global_dims - use global_mesh - use vars - use rdhdf_2d_interface -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - character(*) :: fname - real(r_typ), dimension(nt_g,np_g) :: br0_g - integer :: ierr -c -c----------------------------------------------------------------------- -c - real(r_typ), parameter :: two=2._r_typ -c -c----------------------------------------------------------------------- -c -c ****** Br array read in and its scales. -c - real(r_typ), dimension(:,:), pointer :: bn - real(r_typ), dimension(:), pointer :: tn,pn -c -c----------------------------------------------------------------------- -c - integer :: ntn,npn,j,k - logical :: scale - real(r_typ) :: sum0,sum1,area,fluxp,fluxm,da,br00err -c -c----------------------------------------------------------------------- -c - ierr=0 -c -c ****** Read in the normal field. 
-c
-      write (*,*)
-      write (*,*) '### COMMENT from READBR:'
-      write (*,*) '### Reading Br file: ',trim(fname)
-c
-      call rdhdf_2d (fname,scale,ntn,npn,bn,tn,pn,ierr)
-c
-      if (ierr.ne.0) then
-        write (*,*)
-        write (*,*) '### ERROR in READBR:'
-        write (*,*) '### The flux file has the wrong format.'
-        write (*,*) 'IERR (from RDHDF_2D) = ',ierr
-        write (*,*) 'File name: ',trim(fname)
-        ierr=1
-        return
-      end if
-c
-c ****** Check that the array has scales.
-c
-      if (.not.scale) then
-        write (*,*)
-        write (*,*) '### ERROR in READBR:'
-        write (*,*) '### The flux file does not have scales.'
-        write (*,*) 'File name: ',trim(fname)
-        ierr=2
-        return
-      end if
-c
-c ****** Interpolate the field to the code mesh (into array BR0_G).
-c
-      call intrp2d (ntn,npn,tn,pn,bn,
-     &              nt_g-2,np_g-2,th_g(2:ntm1_g),ph_g(2:npm1_g),
-     &              br0_g(2:ntm1_g,2:npm1_g),ierr)
-c
-      if (ierr.ne.0) then
-        write (*,*)
-        write (*,*) '### ERROR in READBR:'
-        write (*,*) '### The scales in the Br file are invalid.'
-        write (*,*) 'File name: ',trim(fname)
-        ierr=3
-        return
-      end if
-c
-c ****** De-allocate the memory for the BN array and its scales.
-c
-      deallocate (bn)
-      deallocate (tn)
-      deallocate (pn)
-c
-c ****** Set Br to be periodic.
-c
-      br0_g(:,1)=br0_g(:,npm1_g)
-      br0_g(:,np_g)=br0_g(:,2)
-c
-c ****** Set BCs at the poles.
-c
-c ****** Br has only an m=0 component there.
-c
-      sum0=sum(br0_g(     2,2:npm1_g)*dph_g(2:npm1_g))*pl_i
-      sum1=sum(br0_g(ntm1_g,2:npm1_g)*dph_g(2:npm1_g))*pl_i
-c
-      br0_g(1   ,:)=two*sum0-br0_g(     2,:)
-      br0_g(nt_g,:)=two*sum1-br0_g(ntm1_g,:)
-c
-c ****** Calculate the total flux.
-c
-      area=0.
-      fluxp=0.
-      fluxm=0.
-      do j=2,ntm1_g
-        do k=2,npm1_g
-          da=sth_g(j)*dth_g(j)*dph_g(k)
-          if (br0_g(j,k).gt.0.) then
-            fluxp=fluxp+br0_g(j,k)*da
-          else
-            fluxm=fluxm+br0_g(j,k)*da
-          end if
-          area=area+da
-        enddo
-      enddo
-c
-      write (*,*)
-      write (*,*) '### COMMENT from READBR:'
-      write (*,*) '### Computed flux balance:'
-      write (*,*)
-      write (*,*) 'Positive flux = ',fluxp
-      write (*,*) 'Negative flux = ',fluxm
-c
-c ****** Fix the magnetic field so that the total flux is zero
-c ****** (unless flux balancing has been disabled).
-c
-      if (.not.((option.eq.'ss'.or.option.eq.'open')
-     &    .and.do_not_balance_flux)) then
-c
-        br00err=(fluxp+fluxm)/area
-c
-        do k=1,np_g
-          do j=1,nt_g
-            br0_g(j,k)=br0_g(j,k)-br00err
-          enddo
-        enddo
-c
-        write (*,*)
-        write (*,*) '### COMMENT from READBR:'
-        write (*,*) '### Flux balance correction:'
-        write (*,*)
-        write (*,*) 'BR00 (monopole Br field magnitude) = ',br00err
-c
-      end if
-c
-      return
-      end
-c#######################################################################
-      subroutine intrp2d (nxi,nyi,xi,yi,fi,nx,ny,x,y,f,ierr)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Interpolate a 2D field from array FI(NXI,NYI), defined
-c ****** on the mesh XI(NXI) x YI(NYI), into the array F(NX,NY),
-c ****** defined on the mesh X(NX) x Y(NY).
-c
-c ****** Note that if a data point is outside the bounds of
-c ****** the XI x YI mesh, IERR=2 is returned.
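-c
-c ****** Bilinear sketch (illustrative): with cell indices II,JJ and
-c ****** fractions AX,AY from INTERP, a point inside the cell
-c ****** [XI(II),XI(II+1)] x [YI(JJ),YI(JJ+1)] is assigned
-c ******
-c ******    f = (1-ax)*((1-ay)*fi(ii  ,jj) + ay*fi(ii  ,jj+1))
-c ******      +    ax *((1-ay)*fi(ii+1,jj) + ay*fi(ii+1,jj+1)),
-c ******
-c ****** which reduces to 1D linear interpolation when AX or AY
-c ****** vanishes.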
-c
-c-----------------------------------------------------------------------
-c
-      use number_types
-c
-c-----------------------------------------------------------------------
-c
-      implicit none
-c
-c-----------------------------------------------------------------------
-c
-      integer :: nxi,nyi
-      real(r_typ), dimension(nxi) :: xi
-      real(r_typ), dimension(nyi) :: yi
-      real(r_typ), dimension(nxi,nyi) :: fi
-      integer :: nx,ny
-      real(r_typ), dimension(nx) :: x
-      real(r_typ), dimension(ny) :: y
-      real(r_typ), dimension(nx,ny) :: f
-      integer :: ierr
-c
-c-----------------------------------------------------------------------
-c
-      real(r_typ), parameter :: zero=0._r_typ
-c
-c-----------------------------------------------------------------------
-c
-      real(r_typ), external :: flint
-c
-c-----------------------------------------------------------------------
-c
-      integer :: i,j,ii,jj
-      real(r_typ) :: dum(1)=0.
-      real(r_typ) :: dummy,xv,yv,ax,ay
-c
-c-----------------------------------------------------------------------
-c
-      ierr=0
-c
-c ****** Check that the scales XI and YI are monotonic.
-c
-      dummy=flint(zero,nxi,xi,dum,1,ierr)
-      if (ierr.ne.0) go to 900
-c
-      dummy=flint(zero,nyi,yi,dum,1,ierr)
-      if (ierr.ne.0) go to 900
-c
-c ****** Interpolate the data.
-c
-      do j=1,ny
-        yv=y(j)
-        if (yv.lt.yi(1).or.yv.gt.yi(nyi)) then
-          go to 910
-        end if
-        call interp (yi,nyi,yv,jj,ay)
-        do i=1,nx
-          xv=x(i)
-          if (xv.lt.xi(1).or.xv.gt.xi(nxi)) then
-            go to 910
-          end if
-          call interp (xi,nxi,xv,ii,ax)
-          f(i,j)= (1.-ax)*((1.-ay)*fi(ii  ,jj )+ay*fi(ii  ,jj+1))
-     &          +     ax *((1.-ay)*fi(ii+1,jj )+ay*fi(ii+1,jj+1))
-        enddo
-      enddo
-c
-      return
-c
-c ****** Error exit: invalid scales.
-c
-  900 continue
-c
-      write (*,*)
-      write (*,*) '### ERROR in INTRP2D:'
-      write (*,*) '### Scales are not monotonically increasing.'
-      ierr=1
-c
-      return
-c
-c ****** Error exit: data outside range of scales.
-c
-  910 continue
-c
-      write (*,*)
-      write (*,*) '### ERROR in INTRP2D:'
-      write (*,*) '### An interpolation was attempted outside the'//
-     &            ' range of the defined scales.'
-      ierr=2
-c
-      return
-      end
-c#######################################################################
-      function flint (x,n,xn,fn,icheck,ierr)
-c
-c-----------------------------------------------------------------------
-c
-c ****** Interpolate a function linearly.
-c
-c-----------------------------------------------------------------------
-c
-c ****** The function is defined at N nodes, with values given by
-c ****** FN(N) at positions XN(N). The function value returned is
-c ****** the linear interpolant at X.
-c
-c ****** Note that if X.lt.XN(1), the function value returned
-c ****** is FN(1), and if X.gt.XN(N), the function value returned
-c ****** is FN(N).
-c
-c ****** Call once with ICHECK.ne.0 to check that the values
-c ****** in XN(N) are monotonically increasing. In this mode
-c ****** the array XN(N) is checked, and X and FN(N) are not
-c ****** accessed. If the check is passed, IERR=0 is returned.
-c ****** Otherwise, IERR=1 is returned.
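-c
-c ****** Usage sketch (illustrative; DUM and FV are hypothetical
-c ****** locals):
-c ******
-c ******    dum=flint(0._r_typ,n,xn,fn,1,ierr)     ! check XN once
-c ******    if (ierr.eq.0) fv=flint(xv,n,xn,fn,0,ierr)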
-c -c----------------------------------------------------------------------- -c - use number_types -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - real(r_typ) :: flint - real(r_typ) :: x - integer :: n - real(r_typ), dimension(n) :: xn,fn - integer :: icheck - integer :: ierr -c -c----------------------------------------------------------------------- -c - integer :: i,j - real(r_typ) :: x1,x2,alpha -c -c----------------------------------------------------------------------- -c - ierr=0 - flint=0. -c -c ****** If ICHECK.ne.0, check the abscissa table. -c - if (icheck.ne.0) then - if (n.le.0) then - write (*,*) - write (*,*) '### ERROR in FLINT:' - write (*,*) '### Bad dimension of abscissa table.' - write (*,*) 'N = ',n - ierr=1 - return - end if - do 100 i=1,n-1 - if (xn(i+1).le.xn(i)) then - write (*,*) - write (*,*) '### ERROR in FLINT:' - write (*,*) '### Bad data in abscissa table.' - write (*,*) 'N = ',n - write (*,*) 'XN = ' - write (*,*) (xn(j),j=1,n) - ierr=1 - return - end if - 100 continue - return - end if -c -c ****** Get the interpolated value. -c - if (x.le.xn(1)) then - flint=fn(1) - else if (x.gt.xn(n)) then - flint=fn(n) - else - do 200 i=1,n-1 - if (x.ge.xn(i).and.x.lt.xn(i+1)) go to 300 - 200 continue - 300 continue - x1=xn(i) - x2=xn(i+1) - alpha=(x-x1)/(x2-x1) - flint=fn(i)*(1.-alpha)+fn(i+1)*alpha - end if -c - return - end -c####################################################################### - subroutine interp (x,n,xv,i,alpha) -c -c----------------------------------------------------------------------- -c -c ****** Get interpolation factor ALPHA and table index I. -c -c ****** This routine does not do the actual interpolation. Use the -c ****** returned values of I and ALPHA to get the interpolated value. -c -c----------------------------------------------------------------------- -c - use number_types -c -c----------------------------------------------------------------------- -c - implicit none -c -c----------------------------------------------------------------------- -c - integer :: n - real(r_typ), dimension(n) :: x - real(r_typ) :: xv - integer :: i - real(r_typ) :: alpha -c -c----------------------------------------------------------------------- -c - do 100 i=1,n-1 - if (xv.ge.x(i).and.xv.le.x(i+1)) then - alpha=(xv-x(i))/(x(i+1)-x(i)) - go to 200 - end if - 100 continue -c -c ****** Value not found --- signal error and stop. -c - write (*,*) - write (*,*) '### ERROR in INTERP:' - write (*,*) '### Value not found in table.' - write (*,*) 'Value requested = ',xv - write (*,*) 'Min table value = ',x(1) - write (*,*) 'Max table value = ',x(n) - call endrun (.true.) -c - 200 continue -c - return - end -c####################################################################### -c -c ****** Revision history: -c -c ### Version 1.00, 03/02/2006, file pot3d.f, modified by ZM: -c -c - Cleaned up the previous version of POT3D. -c -c ### Version 1.01, 03/06/2006, file pot3d.f, modified by ZM: -c -c - Added the ability to do a "source-surface plus -c current-sheet" solution. Select this by setting -c OPTION='ss+cs'. -c -c ### Version 1.02, 06/18/2007, file pot3d.f, modified by ZM: -c -c - Fixed a bug that caused the code to hang when an error -c was encountered (when running a parallel job). 
-c
-c ### Version 1.03, 03/17/2009, file pot3d.f, modified by ZM:
-c
-c       - Added the ability to write the boundary flux before the
-c         sign flip for current-sheet solutions (i.e., OPTION='open').
-c         Set the variable BR_PHOTO_ORIGINAL_FILE to the desired
-c         file name to request this.
-c
-c ### Version 1.50, 01/25/2016, file pot3d.f, modified by RC:
-c
-c       - Added new (much faster) BILU0 preconditioner to CG solver.
-c         To activate, set ifprec=2 in pot3d.dat file.
-c       - Modified CG solve to use 1D arrays
-c         for SAXPY and DOT operations.
-c
-c ### Version 2.00, 06/06/2017, file pot3d.f, modified by RC:
-c
-c       - Added OpenACC support to the code.
-c       - OpenACC support is currently ONLY on 'standard'
-c         pot3d runs (not inner-outer iterative mode)
-c         and is only efficient on GPUs when using ifprec=1.
-c       - OpenACC adds support for running the code on
-c         Nvidia GPUs/Intel KNL/x86-multicore/OpenPower.
-c       - To use OpenACC, simply compile the code with a compiler
-c         that supports OpenACC with the correct flags activated.
-c       - Multi-gpu support is included by using the new
-c         ngpus_per_node input parameter. This should be set
-c         to the number of GPUs available per node.
-c         The number of MPI ranks per node should match the
-c         number of gpus per node. This can be launched with
-c         "mpiexec -np -ntasks-per-node ".
-c       - The GPU features of the code are fully portable, i.e.
-c         the code can be compiled/used as before on CPUs with no
-c         changes in compilation or run-time.
-c       - Modified some routines to be "nicer" for OpenACC
-c         and optimized some MPI communications.
-c       - Added wall-clock timer and corrected placement of
-c         MPI_Finalize(). The wall clock timer now reflects the
-c         true runtime.
-c
-c ### Version 2.01, 10/02/2017, file pot3d.f, modified by RC:
-c
-c       - Optimized OpenACC.
-c       - Renamed cgsolv() to cgsolve().
-c
-c ### Version 2.10, 01/15/2018, file pot3d.f, modified by ZM+RC:
-c
-c       - Added the ability to skip the balancing of the flux
-c         when doing a PFSS or OPEN field. To invoke this, set
-c         DO_NOT_BALANCE_FLUX=.true..
-c       - Changed some pointers to allocatables for better
-c         vectorization.
-c
-c ### Version 2.11, 03/19/2018, file pot3d.f, modified by RC:
-c
-c       - Added 'fmt' input parameter to set output file type.
-c         Set fmt='h5' to write out HDF5 (default is 'hdf').
-c
-c ### Version 2.12, 10/08/2018, file pot3d.f, modified by RC:
-c
-c       - COMPATIBILITY CHANGE! Renamed gpus_per_node to gpn.
-c         gpn is default 0 which will set gpn to the number of
-c         MPI ranks in the local node.
-c         Setting gpn manually is not recommended and only used for
-c         oversubscribing the GPUs.
-c       - Added MPI shared communicator for automatically setting gpn.
-c         This requires an MPI-3 capable MPI library.
-c       - Changed layout of matrix coefficient arrays to be more
-c         vector-friendly instead of cache-friendly.
-c
-c ### Version 2.13, 11/19/2018, file pot3d.f, modified by RC:
-c
-c       - Small modifications to polar boundary condition calculations.
-c       - Updated array ops and ACC clauses to be F2003 optimized.
-c
-c ### Version 2.20, 01/09/2019, file pot3d.f, modified by RC:
-c
-c       - Added double precision output option.
-c         Set hdf32=.false. to activate 64-bit output.
-c       - Updated magnetic field computation. B is now computed
-c         in parallel. 3D output fields now gathered to rank 0
-c         using Sends and Receives instead of Gatherv in order
-c         to allow very large resolutions.
-c       - Added automatic topology. Now, nprocs is optional.
-c         One can specify one or more topology dimensions and
-c         use the flag value "-1" to indicate dimensions to auto-set.
-c         It is recommended to simply delete nprocs from input files.
-c       - Added output file flushing so CG iterations can be monitored.
-c       - Added new MPI rank diagnostics including
-c         estimated load imbalance.
-c       - Processor topology and magnetic energy output now written to
-c         pot3d.out as well as the terminal.
-c
-c ### Version 2.21, 01/11/2019, file pot3d.f, modified by RC:
-c
-c       - Small updates to magnetic_energy routine.
-c
-c ### Version 2.22, 11/27/2019, file pot3d.f, modified by RC:
-c
-c       - Optimized some OpenACC directives. Expanded some
-c         array-syntax lines to full loops.
-c
-c ### Version 2.23, 08/11/2020, file pot3d.f, modified by RC:
-c
-c       - Small bug fix for default output file names and
-c         format option fmt.
-c
-c ### Version 3.0.0, 02/10/2021, file pot3d.f, modified by RC:
-c
-c       - Changed version number scheme to semantic versioning.
-c
-c ### Version 3.1.0, 08/11/2020, file pot3d.f, modified by RC:
-c
-c       - Removed ss+cs inner-outer iterative mode.
-c         It had not been used in many years and never worked
-c         quite right. Algorithm is saved in previous releases
-c         on GitHub and SVN.
-c       - Added a validation run mode. To activate, set:
-c         VALIDATION_RUN=.TRUE.
-c         This will set up a tilted dipole exact solution, write it
-c         out, and then solve for it. It will ignore input br files.
-c         To change the angle of the dipole, set DIPOLE_ANGLE
-c         (default is pi/4).
-c       - Added NCGFLUSH parameter to flush output file during solver
-c         iterations if NCGHIST>0 every NCGFLUSH iterations
-c         (default 25). This is useful to track progress of the solver.
-c
-c ### Version 3.1.1, 11/02/2021, modified by RC:
-c
-c       - Fixed default output name mechanics. The default names
-c         were ignoring "fmt", making "fmt" almost useless.
-c
-c ### Version 3.2.0, 12/21/2021, modified by RC:
-c
-c       - GPN is now deprecated. Now, # of GPUs per node is assumed
-c         to be equal to # of MPI ranks per node.
-c         This removes the ability to oversubscribe GPUs, but this
-c         was never being used, so no great loss.
-c         This change was done to allow setting the gpu device
-c         number before any allocations so that it works correctly
-c         when using NVIDIA unified managed memory.
-c
-c ### Version 3.3.0, 02/14/2022, modified by RC:
-c
-c       - Added new timers for AX, PC, PCLOAD, and modified some
-c         timers to be more useful.
-c       - Updated ILU0 PC2 preconditioner to use single precision.
-c         This speeds up the solve both on CPUs and GPUs.
-c
-c ### Version 4.0.0, 03/02/2022, modified by RC/MS:
-c
-c       - Added cuSparse library calls for ifprec=2 on NVIDIA GPUs.
-c       - Added #ifdef preprocessor directives which will either
-c         compile a cuSparse GPU version of the code or a
-c         CPU-compatible version without cuSparse calls and with no
-c         C code references.
-c
-c ### Version 4.0.1, 03/31/2022, modified by RC:
-c
-c       - Added 'target' attribute to arrays used with C_LOC.
-c         This fixes a compiler error with GFORTRAN when
-c         using cuSparse.
-c
-c ### Version 4.1.0, 10/24/2022, modified by RC:
-c
-c       - Removed NCGFLUSH input parameter. The output is
-c         flushed at every write now.
-c       - Changed behavior of solver residual reporting
-c         to also output to terminal.
-c       - Removed flush_output_file() routine in favor of the
-c         Fortran 2003 intrinsic FLUSH.
-c
-c ### Version 4.2.0, 08/21/2024, modified by MS/RC:
-c
-c       - Replaced OpenACC with OpenMP Target directives.
-c There remains a single OpenACC directive for device selection -c in order to work with nvfortran. -c -c####################################################################### diff --git a/src/stdpar/pot3d.F90 b/src/stdpar/pot3d.F90 new file mode 100644 index 0000000..4ab64e8 --- /dev/null +++ b/src/stdpar/pot3d.F90 @@ -0,0 +1,7126 @@ +!####################################################################### +! _____ ____ _______ ____ _____ +! | __ \ / __ \__ __|___ \| __ \ +! | |__) | | | | | | __) | | | | +! | ___/| | | | | | |__ <| | | | +! | | | |__| | | | ___) | |__| | +! |_| \____/ |_| |____/|_____/ +! +! ****** POT3D: 3D potential magnetic field outside a sphere. +! +! ****** This program can find the classical potential field, the +! ****** fully open field, the source-surface field, and the +! ****** source-surface plus current-sheet field. +! +! Authors: Zoran Mikic +! Ronald M. Caplan +! Jon A. Linker +! Roberto Lionello +! Miko Stulajter +! +! Predictive Science Inc. +! www.predsci.com +! San Diego, California, USA 92121 +! +!####################################################################### +! Copyright 2021 Predictive Science Inc. +! +! Licensed under the Apache License, Version 2.0 (the "License"); +! you may not use this file except in compliance with the License. +! You may obtain a copy of the License at +! +! http://www.apache.org/licenses/LICENSE-2.0 +! +! Unless required by applicable law or agreed to in writing, software +! distributed under the License is distributed on an "AS IS" BASIS, +! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +! implied. +! See the License for the specific language governing permissions and +! limitations under the License. +!####################################################################### +! +!####################################################################### +! +module ident +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! ****** Code name. +!----------------------------------------------------------------------- +! + character(*), parameter :: idcode='POT3D' + character(*), parameter :: vers ='4.3.1_stdpar' + character(*), parameter :: update='12/05/2024' +! +end module +!####################################################################### +module number_types +! +!----------------------------------------------------------------------- +! ****** Basic number types. +! ****** This module is used to set the default precision for REALs. +!----------------------------------------------------------------------- +! + use iso_fortran_env +! +!----------------------------------------------------------------------- +! + implicit none +! + integer, parameter :: KIND_REAL_4=REAL32 + integer, parameter :: KIND_REAL_8=REAL64 + integer, parameter :: KIND_REAL_16=max(REAL128,REAL64) +! + integer, parameter :: r_typ=KIND_REAL_8 +! +end module +!####################################################################### +module number_types_pc +! +!----------------------------------------------------------------------- +! + use number_types + use iso_fortran_env +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer, parameter :: r_typ_pc=REAL32 +! +end module +!####################################################################### +module constants +! 
+!----------------------------------------------------------------------- +! + use number_types +! +!----------------------------------------------------------------------- +! + implicit none +! + real(r_typ), parameter :: pi=3.1415926535897932_r_typ +! +end module +!####################################################################### +module global_dims +! +!----------------------------------------------------------------------- +! ****** Global number of mesh points. +!----------------------------------------------------------------------- +! + implicit none +! +! ****** Global mesh size. +! + integer :: nr_g,nrm1_g + integer :: nt_g,ntm1_g + integer :: np_g,npm1_g +! +! ****** Flag to indicate an axisymmetric run. +! + logical :: axisymmetric=.false. +! +end module +!####################################################################### +module local_dims +! +!----------------------------------------------------------------------- +! ****** Local number of mesh points. +!----------------------------------------------------------------------- +! + implicit none +! +! ****** Flags to indicate whether this processor has points +! ****** on the physical boundaries. +! + logical :: rb0,rb1,tb0,tb1 +! +! ****** Local mesh size. +! + integer :: nr,nrm1 + integer :: nt,ntm1 + integer :: np,npm1 +! +! ****** Dimensions of arrays on the "main" mesh. +! + integer :: nrm + integer :: ntm + integer :: npm +! +! ****** Indices of start and end points in the global mesh +! ****** belonging to this processor. +! + integer :: i0_g,i1_g + integer :: j0_g,j1_g + integer :: k0_g,k1_g +! +end module +!####################################################################### +module global_mesh +! +!----------------------------------------------------------------------- +! ****** Global mesh. +!----------------------------------------------------------------------- +! + use number_types + use constants +! + implicit none +! + real(r_typ), dimension(:), allocatable :: r_g,rh_g,dr_g,drh_g + real(r_typ), dimension(:), allocatable :: t_g,th_g,dt_g,dth_g + real(r_typ), dimension(:), allocatable :: p_g,ph_g,dp_g,dph_g +! + real(r_typ), dimension(:), allocatable :: st_g,ct_g,sth_g,cth_g + real(r_typ), dimension(:), allocatable :: sp_g,cp_g,sph_g,cph_g +! +! ****** Physical mesh size. +! + real(r_typ), parameter, private :: one=1._r_typ + real(r_typ), parameter, private :: two=2._r_typ +! + real(r_typ) :: r0=1._r_typ + real(r_typ) :: r1=30._r_typ + real(r_typ), parameter :: t0=0. + real(r_typ), parameter :: t1=pi + real(r_typ), parameter :: p0=0. + real(r_typ), parameter :: p1=two*pi +! + real(r_typ), parameter :: pl=p1-p0 + real(r_typ), parameter :: pl_i=one/pl +! +end module +!####################################################################### +module local_mesh +! +!----------------------------------------------------------------------- +! ****** Local mesh. +!----------------------------------------------------------------------- +! + use number_types +! + implicit none +! + real(r_typ), dimension(:), allocatable :: r,r2,rh,dr,drh + real(r_typ) :: dr1,drn +! + real(r_typ), dimension(:), allocatable :: t,th,dt,dth + real(r_typ), dimension(:), allocatable :: p,ph,dp,dph +! + real(r_typ), dimension(:), allocatable :: st,ct,sth,cth + real(r_typ), dimension(:), allocatable :: sp,cp,sph,cph +! +! ****** Inverse quantities (for efficiency). +! 
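+! ****** (Presumably dr_i(i) = 1/dr(i), etc., precomputed once so
+! ****** that the stencil loops can multiply instead of divide;
+! ****** inferred from the naming convention.)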
+ real(r_typ), dimension(:), allocatable :: r_i,rh_i + real(r_typ), dimension(:), allocatable :: dr_i,drh_i + real(r_typ), dimension(:), allocatable :: dt_i,dth_i + real(r_typ), dimension(:), allocatable :: st_i,sth_i + real(r_typ), dimension(:), allocatable :: dp_i,dph_i +! +end module +!####################################################################### +module mpidefs +! +!----------------------------------------------------------------------- +! ****** MPI variables, processor topology, and processor information. +!----------------------------------------------------------------------- +! + use mpi +! + implicit none +! +! ****** Total number of processors. +! + integer :: nproc +! +! ****** Total number of processors per node. +! + integer :: nprocsh +! +! ****** Processor rank of this process in communicator +! ****** MPI_COMM_WORLD. +! + integer :: iprocw +! +! ****** Processor rank of this process in communicator +! ****** comm_shared. +! + integer :: iprocsh +! +! ****** Flag to designate that this is the processor with +! ****** rank 0 in communicator MPI_COMM_WORLD. +! + logical :: iamp0 +! +! ****** Communicator over all processors in the Cartesian topology. +! + integer :: comm_all +! +! ****** Processor rank of this process in communicator +! ****** COMM_ALL. +! + integer :: iproc +! +! ****** Processor rank in communicator COMM_ALL for the +! ****** processor that has rank 0 in MPI_COMM_WORLD. +! + integer :: iproc0 +! +! ****** Communicators over all processors in the phi dimension. +! + integer :: comm_phi +! +! ****** Communicator over all shared processors on the node. +! + integer :: comm_shared +! +! ****** Communicators over all processors in the r dimension. +! + integer :: comm_r +! +! ****** Processor coordinate indices of this process +! ****** in the Cartesian topology. +! + integer :: iproc_r,iproc_t,iproc_p +! +! ****** Processor coordinate indices of the neighboring +! ****** processors in the Cartesian topology. +! + integer :: iproc_rm,iproc_rp + integer :: iproc_tm,iproc_tp + integer :: iproc_pm,iproc_pp +! +! ****** Number of processors along r, theta, and phi. +! + integer :: nproc_r,nproc_t,nproc_p +! +! ****** Number type for REALs to be used in MPI calls. +! + integer :: ntype_real +! +end module +!####################################################################### +module decomposition_params +! +!----------------------------------------------------------------------- +! ****** Input parameters that define the domain decomposition +! ****** among processors. +!----------------------------------------------------------------------- +! + implicit none +! +! ****** Number of processors per dimension. +! + integer, dimension(3) :: nprocs=(/-1,-1,-1/) +! +end module +!####################################################################### +module decomposition +! +!----------------------------------------------------------------------- +! ****** Information defining the domain decomposition. +!----------------------------------------------------------------------- +! + implicit none +! +! ****** Define the structure type for mapping local arrays +! ****** into global arrays. +! + type :: map_struct + integer :: n + integer :: i0 + integer :: i1 + integer :: offset + end type +! +! ****** Mapping structures for the different mesh types. +! 
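+! ****** ("h" maps describe the half meshes and "m" maps the main
+! ****** meshes in r, theta, and phi, matching the RH_G/R_G etc.
+! ****** scales used when the global fields are written out.)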
+ type(map_struct), dimension(:), pointer :: map_rh + type(map_struct), dimension(:), pointer :: map_rm + type(map_struct), dimension(:), pointer :: map_th + type(map_struct), dimension(:), pointer :: map_tm + type(map_struct), dimension(:), pointer :: map_ph + type(map_struct), dimension(:), pointer :: map_pm +! +end module +!####################################################################### +module meshdef +! +!----------------------------------------------------------------------- +! ****** Variables that define the mesh-point distributions. +!----------------------------------------------------------------------- +! + use number_types +! + implicit none +! + integer, parameter :: nmseg=30 +! + real(r_typ), dimension(nmseg) :: drratio=0. + real(r_typ), dimension(nmseg) :: dtratio=0. + real(r_typ), dimension(nmseg) :: dpratio=0. + real(r_typ), dimension(nmseg) :: rfrac=0. + real(r_typ), dimension(nmseg) :: tfrac=0. + real(r_typ), dimension(nmseg) :: pfrac=0. +! + integer :: nfrmesh=0 + integer :: nftmesh=0 + integer :: nfpmesh=0 +! + real(r_typ) :: phishift=0. +! +end module +!####################################################################### +module fields +! +!----------------------------------------------------------------------- +! ****** Local field arrays. +!----------------------------------------------------------------------- +! + use number_types +! + implicit none +! +! ****** Potential solution array and cg temp array. +! + real(r_typ), dimension(:,:,:), allocatable :: phi + real(r_typ), dimension(:,:,:), allocatable :: x_ax +! +! ****** Boundary radial magnetic field arrays. +! + real(r_typ), dimension(:,:), allocatable :: br0 + real(r_typ), dimension(:,:), allocatable :: br1 +! +! ****** Arrays used in polar boundary conditions. +! + real(r_typ), dimension(:), allocatable :: sum0,sum1 +! +! ****** Arrays used for final magnetic field. +! + real(r_typ), dimension(:,:,:), allocatable :: br,bt,bp +! +end module +!####################################################################### +module cgcom +! + use number_types +! + implicit none +! +!----------------------------------------------------------------------- +! ****** Number of equations to solve in the CG solver. +!----------------------------------------------------------------------- +! + integer :: ncgeq +! +!----------------------------------------------------------------------- +! ****** CG field solver parameters. +!----------------------------------------------------------------------- +! + integer :: ifprec=1 + integer :: ncgmax=1000000 + integer :: ncghist=100 + real(r_typ) :: epscg=1.e-9 +! +!----------------------------------------------------------------------- +! ****** CG field solver variables. +!----------------------------------------------------------------------- +! + integer :: ncg + real(r_typ) :: epsn +! +! ****** Seam buffers. +! + real(r_typ), allocatable, dimension(:,:) :: sbuf_rt1,sbuf_rt2 + real(r_typ), allocatable, dimension(:,:) :: sbuf_tp1,sbuf_tp2 + real(r_typ), allocatable, dimension(:,:) :: sbuf_rp1,sbuf_rp2 +! + real(r_typ), allocatable, dimension(:,:) :: rbuf_rt1,rbuf_rt2 + real(r_typ), allocatable, dimension(:,:) :: rbuf_tp1,rbuf_tp2 + real(r_typ), allocatable, dimension(:,:) :: rbuf_rp1,rbuf_rp2 +! +end module +!####################################################################### +module vars +! +!----------------------------------------------------------------------- +! ****** Miscellaneous input variables. 
+!-----------------------------------------------------------------------
+!
+      use number_types
+!
+      implicit none
+!
+      character(256) :: outfile='pot3d.out'
+      character(256) :: phifile='default'
+      character(256) :: br0file='default'
+      character(256) :: brfile='default'
+      character(256) :: btfile='default'
+      character(256) :: bpfile='default'
+      character(256) :: br_photo_file='default'
+      character(256) :: br_photo_original_file='default'
+!
+! ****** Type of field solution.
+! ****** Select between 'potential', 'open', and 'source-surface'.
+!
+      character(16) :: option='potential'
+!
+! ****** Interval at which to dump diagnostics during the
+! ****** iteration for the source-surface plus current-sheet
+! ****** solution.
+!
+      integer :: ndump=0
+!
+! ****** Flag to skip the balancing of the flux (for PFSS and
+! ****** OPEN field options only).
+
+      logical :: do_not_balance_flux=.false.
+!
+! ****** Set format for output files.
+!
+      character(3) :: fmt='h5'
+!
+      logical :: hdf32=.true.
+!
+! ****** Validation run (tilted dipole).
+!
+      logical :: validation_run=.false.
+!
+      real(r_typ) :: dipole_angle=0.7853981633974483_r_typ
+!
+end module
+!#######################################################################
+module solve_params
+!
+!-----------------------------------------------------------------------
+! ****** Parameters used in the solver.
+!-----------------------------------------------------------------------
+!
+      use number_types
+!
+      implicit none
+!
+! ****** Boundary condition switch at r=R1.
+!
+      real(r_typ) :: pm_r1
+!
+end module
+!#######################################################################
+module timer
+!
+!-----------------------------------------------------------------------
+! ****** Timer stack.
+!-----------------------------------------------------------------------
+!
+      use number_types
+!
+      implicit none
+!
+      integer, parameter :: nstack=10
+      integer :: istack=0
+      real(r_typ), dimension(nstack) :: tstart=0.
+!
+end module
+!#######################################################################
+module timing
+!
+!-----------------------------------------------------------------------
+! ****** Timing variables.
+!-----------------------------------------------------------------------
+!
+      use number_types
+!
+      implicit none
+!
+      real(r_typ) :: t_startup=0.
+      real(r_typ) :: t_solve=0.
+      real(r_typ) :: t_pc_load=0.
+      real(r_typ) :: t_pc=0.
+      real(r_typ) :: t_ax=0.
+      real(r_typ) :: t_io=0.
+      real(r_typ) :: c_seam=0.
+      real(r_typ) :: c_cgdot=0.
+      real(r_typ) :: c_sumphi=0.
+      real(r_typ) :: t_wall=0.
+!
+end module
+!#######################################################################
+module debug
+!
+!-----------------------------------------------------------------------
+! ****** Debugging level.
+!-----------------------------------------------------------------------
+!
+      implicit none
+!
+      integer :: idebug=0
+!
+end module
+!#######################################################################
+module assemble_array_interface
+      interface
+      subroutine assemble_array (map_r,map_t,map_p,a,a_g)
+      use number_types
+      use decomposition
+      use mpidefs
+      implicit none
+      type(map_struct), dimension(0:nproc-1) :: map_r,map_t,map_p
+      real(r_typ), dimension(:,:,:) :: a,a_g
+      end subroutine
+      end interface
+end module
+!#######################################################################
+module cusparse_interface
+!
+      use, intrinsic :: iso_c_binding
+!
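+! ****** Usage sketch (illustrative only; assumes the CSR arrays
+! ****** and the solve vector X carry the TARGET attribute):
+! ******
+! ******    call load_lusol_cusparse (C_LOC(a_csr),C_LOC(a_csr_ia), &
+! ******                              C_LOC(a_csr_ja),cN,cM)
+! ******    call lusol_cusparse (C_LOC(x))   ! apply the LU solve
+! ******    call unload_lusol_cusparse ()
+!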
+#ifdef CUSPARSE + interface + subroutine load_lusol_cusparse(CSR_A,CSR_AI,CSR_AJ,N,M) & + BIND(C, name="load_lusol_cusparse") + use, intrinsic :: iso_c_binding + implicit none + integer(C_INT), value :: N,M + type(C_PTR), value :: CSR_A,CSR_AI,CSR_AJ + end subroutine load_lusol_cusparse +! + subroutine lusol_cusparse(x) & + BIND(C, name="lusol_cusparse") + use, intrinsic :: iso_c_binding + implicit none + type(C_PTR), value :: x + end subroutine lusol_cusparse +! + subroutine unload_lusol_cusparse() & + BIND(C, name="unload_lusol_cusparse") + end subroutine unload_lusol_cusparse + end interface +#endif +! + integer(c_int) :: cN,cM +! +end module +!####################################################################### +module matrix_storage_pot3d_solve +! +!----------------------------------------------------------------------- +! ****** Storage for the matrix/preconditioners of the solve. +!----------------------------------------------------------------------- +! + use number_types + use number_types_pc +! + implicit none +! + real(r_typ), dimension(:,:,:,:), allocatable :: a + real(r_typ_pc), dimension(:), allocatable :: a_i +! + integer, dimension(7) :: a_offsets + + integer :: N,M + real(r_typ_pc), dimension(:), allocatable, target :: a_csr + real(r_typ_pc), dimension(:), allocatable :: lu_csr + real(r_typ_pc), dimension(:), allocatable :: a_csr_d + integer, dimension(:), allocatable :: lu_csr_ja + integer, dimension(:), allocatable, target :: a_csr_ia + integer, dimension(:), allocatable, target :: a_csr_ja + integer, dimension(:), allocatable :: a_N1 + integer, dimension(:), allocatable :: a_N2 + integer, dimension(:), allocatable :: a_csr_dptr +! +end module +!####################################################################### +program POT3D +! +!----------------------------------------------------------------------- +! + use ident + use mpidefs + use vars + use solve_params + use timing +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: ierr +! +!----------------------------------------------------------------------- +! +! ****** Initialize MPI. +! + call init_mpi +! +! ****** Start the wall-clock timer. +! + call timer_on +! +! ****** Write the code name and version. +! + if (iamp0) then + write (*,*) + write (*,*) idcode,' Version ',vers,', updated on ',update + end if +! + call timer_on +! +! ****** Read the input file. +! + call read_input_file +! +! ****** Create the output file. +! + if (iamp0) then + call ffopen (9,outfile,'rw',ierr) + if (ierr.ne.0) then + write (*,*) + write (*,*) '### ERROR in POT3D:' + write (*,*) '### Could not create the output file.' + write (*,*) 'File name: ',trim(outfile) + end if + end if + call check_error_on_p0 (ierr) +! +! ****** Check the input parameters. +! + call check_input +! +! ****** Check the processor topology. +! + call check_proc_topology +! +! ****** Decompose the domain. +! + call decompose_domain +! +! ****** Allocate global arrays. +! + call allocate_global_arrays +! +! ****** Set the global meshes. +! + call set_global_mesh +! +! ****** Decompose the mesh. +! + call decompose_mesh_r + call decompose_mesh_tp +! +! ****** Allocate local arrays. +! + call allocate_local_arrays_r + call allocate_local_arrays_tp +! +! ****** Set the local meshes. +! + call set_local_mesh_r + call set_local_mesh_tp +! +! ****** Print decomposition diagnostics. +! + call decomp_diags +! +! 
****** Set up seam for solver.
+!
+ call seam_setup
+!
+! ****** Initialize the flux and, if validating, write the analytic solution.
+!
+ if (validation_run) then
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### COMMENT from POT3D:'
+ write (*,*) '### Validation run requested.'
+ write (*,*) '### Ignoring br input file,'
+ write (*,*) '### setting HDF32=.FALSE.,'
+ write (*,*) '### and overriding output filenames.'
+ write (9,*)
+ write (9,*) '### COMMENT from POT3D:'
+ write (9,*) '### Validation run requested.'
+ write (9,*) '### Ignoring br input file,'
+ write (9,*) '### setting HDF32=.FALSE.,'
+ write (9,*) '### and overriding output filenames.'
+ end if
+ hdf32=.false.
+ brfile='br_solved.'//trim(fmt)
+ btfile='bt_solved.'//trim(fmt)
+ bpfile='bp_solved.'//trim(fmt)
+ phifile='phi_solved.'//trim(fmt)
+ call set_validation_flux
+ call write_validation_solution
+ else
+ call set_flux
+ end if
+!
+ call timer_off (t_startup)
+!
+! ****** Find the solution.
+!
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### COMMENT from POT3D:'
+ write (*,*) '### Starting PCG solve.'
+ call FLUSH(OUTPUT_UNIT)
+ write (9,*)
+ write (9,*) '### COMMENT from POT3D:'
+ write (9,*) '### Starting PCG solve.'
+ call FLUSH(9)
+ end if
+!
+ call timer_on
+!
+ call potfld
+!
+ call timer_off (t_solve)
+!
+! ****** Compute B.
+!
+ call getb
+!
+! ****** Write solution to file.
+!
+ call write_solution
+!
+! ****** Magnetic energy diagnostics.
+!
+ call magnetic_energy
+!
+ call MPI_Barrier(MPI_COMM_WORLD,ierr)
+ call timer_off (t_wall)
+!
+ call write_timing
+!
+ call endrun (.false.)
+!
+end program pot3d
+!#######################################################################
+subroutine read_input_file
+!
+!-----------------------------------------------------------------------
+!
+! ****** Read the input file.
+!
+!-----------------------------------------------------------------------
+!
+ use global_dims
+ use global_mesh
+ use mpidefs
+ use meshdef
+ use cgcom
+ use debug
+ use vars
+ use decomposition_params
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+! ****** Values for the global mesh size.
+! ****** Since these names conflict with those in LOCAL_DIMS*, it is
+! ****** important not to use these modules here.
+!
+ integer :: nr=0
+ integer :: nt=0
+ integer :: np=0
+!
+!-----------------------------------------------------------------------
+!
+ namelist /topology/ &
+ nprocs, &! MPI topology triplet. Default -1,-1,-1
+ ! automatically sets "best" topology.
+ nr, &! Grid resolution in the `r` direction.
+ nt, &! Grid resolution in the `theta` direction.
+ np ! Grid resolution in the `phi` direction.
+!
+!-----------------------------------------------------------------------
+!
+ namelist /inputvars/ &
+ r0, &! Lower radial boundary.
+ r1, &! Upper radial boundary.
+ drratio, &! Ratio of grid spacing at the end
+ ! of each segment to that at the
+ ! beginning for the radial grid
+ ! [ length(drratio) = length(rfrac)-1 ].
+ dtratio, &! Ratio of grid spacing (theta)
+ dpratio, &! Ratio of grid spacing (phi)
+ rfrac, &! Normalized positions of grid segment
+ ! boundaries (frac of domain size)
+ ! for radial grid.
+ tfrac, &! Normalized positions of grid (theta)
+ pfrac, &! Normalized positions of grid (phi)
+ nfrmesh, &! Number of times to filter/smooth
+ ! the radial grid spacing.
+ nftmesh, &! Number of times to filter (theta)
+ nfpmesh, &! Number of times to filter (phi)
+ phishift, &! Apply an optional phi shift
+ ! (radians) to the input Br at r0.
+ ifprec, &! Preconditioner method:
+ ! 1: Diagonal (use for GPU runs)
+ ! 2: ILU0 (use for CPU runs or
+ ! GPU runs when built with cusparse)
+ ncgmax, &! Maximum allowed solver iterations.
+ ncghist, &! Iteration information.
+ ! 0: Only write # total iterations.
+ ! >0: Write every NCGHIST iteration.
+ epscg, &! Solver convergence tolerance,
+ !|residual|/|right-hand-side|.
+ idebug, &! Output debugging info during run.
+ br0file, &! Filename of input 2D (tp) Br
+ phifile, &! Filename to write 3D PHI potential.
+ brfile, &! Filename to write 3D Br field.
+ btfile, &! Filename to write 3D Bt field.
+ bpfile, &! Filename to write 3D Bp field.
+ br_photo_file, &! Filename to write 2D Br@r=r0
+ !(after interp/flux-balance).
+ br_photo_original_file, &! Filename to write 2D Br@r=r0
+ ! (option='open' only, writes Br
+ ! before sign change).
+ option, &! 'ss' PFSS
+ ! 'potential' PF with closed-wall
+ ! (requires flux balance).
+ ! 'open' Open field. Used for
+ ! current sheet and fully
+ ! open field runs.
+ ! B field will be unsigned.
+ do_not_balance_flux, &! Do not balance flux of input Br@r0.
+ hdf32, &! Output precision:
+ ! .true. Single (32-bit) output.
+ ! .false. Double (64-bit) output.
+ validation_run, &! Set this to run a validation test
+ ! with an analytic tilted dipole
+ ! solution. Overrides other inputs.
+ dipole_angle ! Tilt angle for validation run dipole.
+!
+!-----------------------------------------------------------------------
+!
+ integer :: ierr
+ character(80) :: infile='pot3d.dat'
+!
+!-----------------------------------------------------------------------
+!
+! ****** Read the input file.
+!
+ call ffopen (8,infile,'r',ierr)
+!
+ if (ierr.ne.0) then
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### ERROR in READ_INPUT_FILE:'
+ write (*,*) '### Could not open the input file.'
+ write (*,*) 'File name: ',trim(infile)
+ end if
+ call endrun (.true.)
+ end if
+!
+ read (8,topology)
+!
+ read (8,inputvars)
+!
+ close (8)
+!
+ if (trim(fmt).ne.'h5') then
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### ERROR in READ_INPUT_FILE:'
+ write (*,*) '### "fmt" must be "h5".'
+ write (*,*) 'fmt: ',trim(fmt)
+ end if
+ call endrun (.true.)
+ end if
+!
+! ****** Check if output file names were overridden.
+! ****** If not, set default names with format fmt.
+!
+ if (trim(phifile).eq.'default') then
+ phifile='phi.'//trim(fmt)
+ end if
+ if (trim(br0file).eq.'default') then
+ br0file='br0.'//trim(fmt)
+ end if
+ if (trim(brfile).eq.'default') then
+ brfile='br.'//trim(fmt)
+ end if
+ if (trim(btfile).eq.'default') then
+ btfile='bt.'//trim(fmt)
+ end if
+ if (trim(bpfile).eq.'default') then
+ bpfile='bp.'//trim(fmt)
+ end if
+ if (trim(br_photo_file).eq.'default') then
+ br_photo_file='br_photo.'//trim(fmt)
+ end if
+ if (trim(br_photo_original_file).eq.'default') then
+ br_photo_original_file='br_photo_original.'//trim(fmt)
+ end if
+!
+ nr_g=nr
+ nt_g=nt
+ np_g=np
+!
+! ****** Check if the specified mesh dimensions are valid.
+!
+ call check_mesh_dimensions (nr_g,nt_g,np_g)
+!
+ nrm1_g=nr_g-1
+ ntm1_g=nt_g-1
+ npm1_g=np_g-1
+!
+end subroutine
+!#######################################################################
+subroutine check_error_on_p0 (ierr0)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Check if the error flag IERR0 on processor 0 in
+! ****** MPI_COMM_WORLD (i.e., processor IPROC0 in COMM_ALL)
+! 
****** indicates that the code should exit. +! +! ****** If IERR0 is non-zero, all the processors are directed +! ****** to call ENDRUN to terminate the code. +! +!----------------------------------------------------------------------- +! + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: ierr0 +! +!----------------------------------------------------------------------- +! +! ****** MPI error return. +! + integer :: ierr +! +!----------------------------------------------------------------------- +! +! ****** Broadcast IERR0 to all processors. +! + call MPI_Bcast (ierr0,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr) +! +! ****** Call ENDRUN if IERR0 is non-zero. +! + if (ierr0.ne.0) then + call endrun (.true.) + end if +! +end subroutine +!####################################################################### +subroutine endrun (ifstop) +! +!----------------------------------------------------------------------- +! +! ****** End the run and exit the code. +! +!----------------------------------------------------------------------- +! + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + logical :: ifstop +! +!----------------------------------------------------------------------- +! +! ****** MPI error return. +! + integer :: ierr +! +!----------------------------------------------------------------------- +! +! ****** Close the output file. +! + if (iamp0) then + close (9) + end if +! +! ****** Exit MPI gracefully. +! + call MPI_Finalize (ierr) +! +! ****** Call the STOP statement if requested. +! + if (ifstop) then + stop + end if +! +end subroutine +!####################################################################### +subroutine init_mpi +! +!----------------------------------------------------------------------- +! +! ****** Initialize MPI. +! +!----------------------------------------------------------------------- +! + use number_types + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! +! ****** MPI error return. +! + integer :: ierr,tcheck +! +!----------------------------------------------------------------------- +! +! ****** Real number to determine the KIND of REALs. +! + real(r_typ) :: def_real +! +!----------------------------------------------------------------------- +! + call MPI_Init_thread (MPI_THREAD_FUNNELED,tcheck,ierr) +! +! ****** Get the total number of processors. +! + call MPI_Comm_size (MPI_COMM_WORLD,nproc,ierr) +! +! ****** Get the index (rank) of the local processor in +! ****** communicator MPI_COMM_WORLD in variable IPROCW. +! + call MPI_Comm_rank (MPI_COMM_WORLD,iprocw,ierr) +! +! ****** Create a shared communicator for all ranks in the node. +! + call MPI_Comm_split_type (MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,0, & + MPI_INFO_NULL,comm_shared,ierr) +! +! ****** Get the total number of processors in node. +! + call MPI_Comm_size (comm_shared,nprocsh,ierr) +! +! ****** Get the index (rank) of the local processor in the local node. +! + call MPI_Comm_rank (comm_shared,iprocsh,ierr) +! +! ****** Set the flag to designate whether this processor +! ****** has rank 0 in communicator MPI_COMM_WORLD. +! + if (iprocw.eq.0) then + iamp0=.true. 
+ else + iamp0=.false. + end if +! +! ****** Set the number type for communicating REAL +! ****** numbers in MPI calls. +! + if (kind(def_real).eq.KIND_REAL_4) then + ntype_real=MPI_REAL4 + else if (kind(def_real).eq.KIND_REAL_8) then + ntype_real=MPI_REAL8 + else + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in INIT_MPI:' + write (*,*) '### Unrecognized default REAL number KIND:' + write (*,*) 'KIND(default_real) = ',kind(def_real) + write (*,*) 'This is a fatal error.' + end if + call endrun (.true.) + end if +! +end subroutine +!####################################################################### +subroutine check_input +! +!----------------------------------------------------------------------- +! +! ****** Check the validity of the input parameters. +! +!----------------------------------------------------------------------- +! + use number_types + use vars + use solve_params + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: one=1._r_typ +! +!----------------------------------------------------------------------- +! +! ****** Check that OPTION is valid and set the boundary condition +! ****** switches accordingly. +! + if (option.eq.'potential') then +! +! ****** For a potential field, set d(phi)/dr to zero at r=R1 +! ****** (i.e., the field is tangential to the boundary). +! + pm_r1=one +! + else if (option.eq.'open') then +! +! ****** For an open field, set phi to zero at r=R1 +! ****** (i.e., the field is radial there). +! + pm_r1=-one +! + else if (option.eq.'ss') then +! +! ****** For a source surface field, set phi to zero at r=R1 +! ****** (i.e., the field is radial there). +! + pm_r1=-one +! + else + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in CHECK_INPUT:' + write (*,*) '### Invalid OPTION:' + write (*,*) + write (*,*) 'OPTION = ',trim(option) + write (*,*) + write (*,*) 'The options allowed are:' + write (*,*) '''potential''' + write (*,*) '''open''' + write (*,*) '''ss''' + end if + call endrun (.true.) + end if +! + if (iamp0) then + write (*,*) + write (*,*) '### COMMENT from CHECK_INPUT:' + write (*,*) '### Field solve type:' + write (*,*) + write (*,*) 'OPTION = ',option + write (9,*) + write (9,*) '### COMMENT from CHECK_INPUT:' + write (9,*) '### Field solve type:' + write (9,*) + write (9,*) 'OPTION = ',option + end if +! +end subroutine +!####################################################################### +subroutine set_proc_topology +! +!----------------------------------------------------------------------- +! +! ****** Set the optimal values of the MPI rank topology +! ****** in dimensions not set by user. +! +!----------------------------------------------------------------------- +! + use mpidefs + use decomposition_params + use number_types + use global_dims +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: one=1.0_r_typ + real(r_typ), parameter :: zero=0.0_r_typ + real(r_typ), parameter :: bigval=HUGE(1.0_r_typ) +! +!----------------------------------------------------------------------- +! + integer, dimension(:), allocatable :: factors + integer, dimension(:,:), allocatable :: rank_factors + real(r_typ), dimension(:,:), allocatable :: nperrank + real(r_typ), dimension(:), allocatable :: penalty +! 
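+! ****** The topology search enumerates the divisors of NPROC
+! ****** (FACTORS), forms the candidate (r,t,p) rank triplets whose
+! ****** product equals NPROC (RANK_FACTORS), and selects the
+! ****** triplet that minimizes the load-balance PENALTY computed
+! ****** from the mesh points per rank (NPERRANK).
+!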
+ integer :: i,j,k,num_fac,num_rank_fac,best_idx + integer :: fr=1,ft=1,fp=1 + real(r_typ) :: a12,a13,a23 +! +!----------------------------------------------------------------------- +! +! ****** Extract nproc values. A value of -1 indicates the dimension +! ****** should be autoset. +! + nproc_r=nprocs(1) + nproc_t=nprocs(2) + nproc_p=nprocs(3) +! +! ****** If no dimensions are to be autoset, return. +! + if(nproc_r.ne.-1.and.nproc_t.ne.-1.and.nproc_p.ne.-1) return +! +! ****** Get all factors of nproc and store them in factors array. +! + i=1 + num_fac=0 + do while(i.le.nproc) + if (MOD(nproc,i).eq.0) then + num_fac=num_fac+1 + endif + i=i+1 + enddo + allocate (factors(num_fac)) + i=1 + num_fac=0 + do while(i.le.nproc) + if (MOD(nproc,i).eq.0) then + num_fac=num_fac+1 + factors(num_fac)=i + endif + i=i+1 + enddo +! +! ****** Set penalty function parameters and any fixed dimensions +! ****** based on which dimensions are to be autoset. +! + a12=one + a13=one + a23=one +! + if (nproc_r.ne.-1) then + fr=nproc_r + a12=zero + a13=zero + end if + if (nproc_t.ne.-1) then + ft=nproc_t + a12=zero + a23=zero + end if + if (nproc_p.ne.-1) then + fp=nproc_p + a13=zero + a23=zero + end if +! +! ****** Loop over all combinations of factors and save those that +! ****** yield the correct number of MPI ranks into rank_factors array. +! + num_rank_fac=0 + do k=1,num_fac + do j=1,num_fac + do i=1,num_fac + if(nproc_r.eq.-1) fr=factors(i) + if(nproc_t.eq.-1) ft=factors(j) + if(nproc_p.eq.-1) fp=factors(k) + if (fr*ft*fp.eq.nproc) then + num_rank_fac=num_rank_fac+1 + end if + enddo + enddo + enddo +! + if (num_rank_fac.eq.0) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in SET_PROC_TOPOLOGY:' + write (*,*) '### Processor topology specification error.' + write (*,*) 'No valid topologies found for selected options.' + write (*,*) 'Number of MPI ranks = ',nproc + write (*,*) 'NPROC_R = ',nproc_r + write (*,*) 'NPROC_T = ',nproc_t + write (*,*) 'NPROC_P = ',nproc_p + end if + call endrun (.true.) + end if +! + allocate(rank_factors(num_rank_fac,3)) + allocate(nperrank(num_rank_fac,3)) + allocate(penalty(num_rank_fac)) +! + rank_factors(:,:)=-1 + penalty(:)=bigval +! + num_rank_fac=0 + do k=1,num_fac + do j=1,num_fac + do i=1,num_fac + if(nproc_r.eq.-1) fr=factors(i) + if(nproc_t.eq.-1) ft=factors(j) + if(nproc_p.eq.-1) fp=factors(k) + if (fr*ft*fp.eq.nproc) then + num_rank_fac=num_rank_fac+1 + rank_factors(num_rank_fac,1)=fr + rank_factors(num_rank_fac,2)=ft + rank_factors(num_rank_fac,3)=fp + end if + enddo + enddo + enddo +! +! ****** Get number of grid points per rank for each dimension. +! + nperrank(:,1)=real(nr_g)/rank_factors(:,1) + nperrank(:,2)=real(nt_g)/rank_factors(:,2) + nperrank(:,3)=real(np_g)/rank_factors(:,3) +! +! ****** Compute penalty function. +! + penalty(:)=a12*(nperrank(:,1)-nperrank(:,2))**2 & + +a23*(nperrank(:,2)-nperrank(:,3))**2 & + +a13*(nperrank(:,3)-nperrank(:,1))**2 +! +! ****** Eliminate any choices that yield less than a minimum number +! ****** of grid points per rank. +! + do i=1,num_rank_fac + if (nperrank(i,1).lt.4) penalty(i)=bigval + if (nperrank(i,2).lt.4) penalty(i)=bigval + if (nperrank(i,3).lt.3) penalty(i)=bigval + enddo +! +! ****** Find optimal topology. +! + best_idx=MINLOC(penalty,1) +! + if (penalty(best_idx).eq.bigval) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in SET_PROC_TOPOLOGY:' + write (*,*) '### Processor topology specification error.' 
+ write (*,*) 'No valid topologies found for selected options' + write (*,*) 'with selected grid. ' + write (*,*) 'It is likely you are using too many MPI ranks.' + write (*,*) 'Number of MPI ranks = ',nproc + write (*,*) 'NPROC_R = ',nproc_r + write (*,*) 'NPROC_T = ',nproc_t + write (*,*) 'NPROC_P = ',nproc_p + write (*,*) 'NR = ',nr_g + write (*,*) 'NT = ',nt_g + write (*,*) 'NP = ',np_g + end if + call endrun (.true.) + end if +! +! ****** Set optimal topology. +! + nprocs(1)=rank_factors(best_idx,1) + nprocs(2)=rank_factors(best_idx,2) + nprocs(3)=rank_factors(best_idx,3) +! + deallocate(factors) + deallocate(rank_factors) + deallocate(nperrank) + deallocate(penalty) +! +end subroutine +!####################################################################### +subroutine check_proc_topology +! +!----------------------------------------------------------------------- +! +! ****** Check the validity of the requested processor topology. +! +!----------------------------------------------------------------------- +! + use mpidefs + use decomposition_params +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: i,nreq +! +!----------------------------------------------------------------------- +! +! ****** Check the processor topology. +! + do i=1,3 + if (nprocs(i).lt.1.and.nprocs(i).ne.-1) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in CHECK_PROC_TOPOLOGY:' + write (*,*) '### Processor topology specification error.' + write (*,*) 'Invalid number of processors requested.' + write (*,*) 'Dimension = ',i + write (*,*) 'Number of processors requested = ', & + nprocs(i) + end if + call endrun (.true.) + end if + enddo +! +! ****** Set the optimal values of the topology if requested. +! + call set_proc_topology +! +! ****** Check that the number of processors available +! ****** matches the number requested. +! + nreq=nprocs(1)*nprocs(2)*nprocs(3) +! + if (nreq.ne.nproc) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in CHECK_PROC_TOPOLOGY:' + write (*,*) '### Processor topology specification error.' + write (*,*) 'The number of processors requested does not'// & + ' equal the number available.' + write (*,*) 'Number of processors requested = ',nreq + write (*,*) 'Number of processors available = ',nproc + end if + call endrun (.true.) + end if +! +end subroutine +!####################################################################### +subroutine decompose_domain +! +!----------------------------------------------------------------------- +! +! ****** Decompose the domain into a Cartesian MPI topology. +! +!----------------------------------------------------------------------- +! + use mpidefs + use decomposition_params +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: ierr +! +!----------------------------------------------------------------------- +! + integer, parameter :: ndim=3 + integer, dimension(ndim) :: coords + logical, dimension(ndim) :: periodic + logical :: reorder + logical, dimension(ndim) :: keep_dim +! +!----------------------------------------------------------------------- +! +! ****** Create a communicator over all processors, COMM_ALL, +! ****** that has a Cartesian topology. +! +! ****** Specify the periodicity of the coordinate system. +! + periodic(1)=.false. 
+ periodic(2)=.false.
+ periodic(3)=.true.
+!
+! ****** Allow re-ordering in the Cartesian topology.
+!
+ reorder=.true.
+!
+ call MPI_Cart_create (MPI_COMM_WORLD,ndim,nprocs, &
+ periodic,reorder,comm_all,ierr)
+!
+! ****** Get the index (rank) of the local processor in
+! ****** communicator COMM_ALL in variable IPROC.
+!
+! ****** IMPORTANT NOTE:
+! ****** If re-ordering was allowed in the Cartesian topology
+! ****** creation (above), then the rank of the local processor
+! ****** in communicator COMM_ALL may be different from its rank
+! ****** in communicator MPI_COMM_WORLD.
+!
+ call MPI_Comm_rank (comm_all,iproc,ierr)
+!
+! ****** Set the processor rank IPROC0 in communicator COMM_ALL
+! ****** for the processor that has rank 0 in MPI_COMM_WORLD.
+! ****** This value is broadcast to all the processors.
+!
+ if (iamp0) then
+ iproc0=iproc
+ end if
+ call MPI_Bcast (iproc0,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)
+!
+! ****** Get the coordinate indices of this processor in the
+! ****** Cartesian MPI topology.
+!
+ call MPI_Cart_coords (comm_all,iproc,ndim,coords,ierr)
+!
+ iproc_r=coords(1)
+ iproc_t=coords(2)
+ iproc_p=coords(3)
+!
+ nproc_r=nprocs(1)
+ nproc_t=nprocs(2)
+ nproc_p=nprocs(3)
+!
+! ****** Get the rank of the neighboring processors in the
+! ****** Cartesian MPI topology.
+!
+ call MPI_Cart_shift (comm_all,0,1,iproc_rm,iproc_rp,ierr)
+ call MPI_Cart_shift (comm_all,1,1,iproc_tm,iproc_tp,ierr)
+ call MPI_Cart_shift (comm_all,2,1,iproc_pm,iproc_pp,ierr)
+!
+! ****** Create communicators for operations involving all
+! ****** processors in the phi dimension. These communicators
+! ****** are stored in COMM_PHI (and generally represent different
+! ****** communicators on different processors).
+!
+ keep_dim(1)=.false.
+ keep_dim(2)=.false.
+ keep_dim(3)=.true.
+!
+ call MPI_Cart_sub (comm_all,keep_dim,comm_phi,ierr)
+!
+! ****** Create communicators for operations involving
+! ****** all processors in the r dimension.
+! ****** These communicators are stored in COMM_R
+! ****** (and generally represent different communicators on
+! ****** different processors).
+!
+ keep_dim(1)=.true.
+ keep_dim(2)=.false.
+ keep_dim(3)=.false.
+!
+ call MPI_Cart_sub (comm_all,keep_dim,comm_r,ierr)
+!
+end subroutine
+!#######################################################################
+subroutine decompose_mesh_r
+!
+!-----------------------------------------------------------------------
+!
+! ****** Decompose the r mesh between processors.
+!
+!-----------------------------------------------------------------------
+!
+ use global_dims
+ use local_dims
+ use decomposition
+ use solve_params
+ use mpidefs
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ integer :: ierr,i,npts
+ integer :: i0_h,i1_h,i0_m,i1_m
+ integer, dimension(nproc_r) :: mp_r
+!
+!-----------------------------------------------------------------------
+!
+! ****** Decompose the r dimension.
+!
+ npts=nr_g
+!
+ call decompose_dimension (npts,nproc_r,mp_r,ierr)
+ if (ierr.ne.0) then
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### ERROR in DECOMPOSE_MESH_R:'
+ write (*,*) '### Anomaly in decomposing the mesh'// &
+ ' between processors.'
+ write (*,*) '### Could not decompose the r mesh.'
+ write (*,*) 'Number of mesh points in r = ',npts
+ write (*,*) 'Number of processors along r = ',nproc_r
+ end if
+ call endrun (.true.)
+ end if
+!
+! 
****** Check that the resulting mesh topology is valid. +! + call check_mesh_topology (nproc_r,mp_r,1,'r') +! +! ****** Compute the mapping between the processor decomposition +! ****** and the global mesh. +! +! ****** Note that there is a two-point overlap in the mesh +! ****** between adjacent processors in r. +! + i0_g=1 + do i=1,iproc_r + i0_g=i0_g+mp_r(i) + enddo + nr=mp_r(iproc_r+1)+2 + i1_g=i0_g+nr-1 +! + nrm1=nr-1 +! +! ****** Set the flags to indicate whether this processor has +! ****** points on the physical boundaries. +! + if (iproc_r.eq.0) then + rb0=.true. + else + rb0=.false. + end if +! + if (iproc_r.eq.nproc_r-1) then + rb1=.true. + else + rb1=.false. + end if +! +! ****** Set the dimensions of arrays for the "main" meshes +! ****** (i.e., the "m" mesh) for which normal derivatives are +! ****** needed (e.g., v). These vary on different processors, +! ****** depending if they are left-boundary, internal, or +! ****** right-boundary processors. +! + if (rb1) then + nrm=nrm1 + else + nrm=nr + end if +! +! ****** Store the mapping structure (for this processor). +! + allocate (map_rh(0:nproc-1)) + allocate (map_rm(0:nproc-1)) +! + if (rb0) then + i0_h=1 + else + i0_h=2 + end if + if (rb1) then + i1_h=nr + else + i1_h=nrm1 + end if +! + if (rb0) then + i0_m=1 + else + i0_m=2 + end if + i1_m=nrm1 +! + map_rh(iproc)%i0=i0_h + map_rh(iproc)%i1=i1_h +! + map_rm(iproc)%i0=i0_m + map_rm(iproc)%i1=i1_m +! + map_rh(iproc)%offset=i0_g+map_rh(iproc)%i0-1 + map_rh(iproc)%n=map_rh(iproc)%i1-map_rh(iproc)%i0+1 +! + map_rm(iproc)%offset=i0_g+map_rm(iproc)%i0-1 + map_rm(iproc)%n=map_rm(iproc)%i1-map_rm(iproc)%i0+1 +! +! ****** Gather the mapping information by communicating among +! ****** all processors. +! + call gather_mapping_info (map_rh) + call gather_mapping_info (map_rm) +! +end subroutine +!####################################################################### +subroutine decompose_mesh_tp +! +!----------------------------------------------------------------------- +! +! ****** Decompose the theta and phi mesh between processors. +! +!----------------------------------------------------------------------- +! + use global_dims + use local_dims + use decomposition + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: ierr,j,k + integer :: j0_h,j1_h,j0_m,j1_m + integer :: k0_h,k1_h,k0_m,k1_m + integer, dimension(nproc_t) :: mp_t + integer, dimension(nproc_p) :: mp_p +! +!----------------------------------------------------------------------- +! +! ****** Decompose the t dimension. +! + call decompose_dimension (nt_g,nproc_t,mp_t,ierr) + if (ierr.ne.0) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in DECOMPOSE_MESH_TP:' + write (*,*) '### Anomaly in decomposing the mesh'// & + ' between processors.' + write (*,*) '### Could not decompose the theta mesh.' + write (*,*) 'Number of mesh points in theta = ',nt_g + write (*,*) 'Number of processors along theta = ',nproc_t + end if + call endrun (.true.) + end if +! +! ****** Decompose the p dimension. +! + call decompose_dimension (np_g,nproc_p,mp_p,ierr) + if (ierr.ne.0) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in DECOMPOSE_MESH_TP:' + write (*,*) '### Anomaly in decomposing the mesh'// & + ' between processors.' + write (*,*) '### Could not decompose the phi mesh.' 
+ write (*,*) 'Number of mesh points in phi = ',np_g + write (*,*) 'Number of processors along phi = ',nproc_p + end if + call endrun (.true.) + end if +! +! ****** Check that the resulting mesh topology is valid. +! + call check_mesh_topology (nproc_t,mp_t,1,'theta') + call check_mesh_topology (nproc_p,mp_p,1,'phi') +! +! ****** Set the flag for an axisymmetric run (requested by +! ****** setting NP_G = 3). +! + if (np_g.eq.3) then + axisymmetric=.true. + else + axisymmetric=.false. + end if +! +! ****** Compute the mapping between the processor decomposition +! ****** and the global mesh. +! +! ****** Note that there is a two-point overlap in the mesh +! ****** between adjacent processors in theta and phi. +! + j0_g=1 + do j=1,iproc_t + j0_g=j0_g+mp_t(j) + enddo + nt=mp_t(iproc_t+1)+2 + j1_g=j0_g+nt-1 +! + k0_g=1 + do k=1,iproc_p + k0_g=k0_g+mp_p(k) + enddo + np=mp_p(iproc_p+1)+2 + k1_g=k0_g+np-1 +! + ntm1=nt-1 + npm1=np-1 +! +! ****** Set the flags to indicate whether this processor has +! ****** points on the physical boundaries. +! + if (iproc_t.eq.0) then + tb0=.true. + else + tb0=.false. + end if +! + if (iproc_t.eq.nproc_t-1) then + tb1=.true. + else + tb1=.false. + end if +! +! ****** Set the dimensions of arrays for the "main" meshes +! ****** (i.e., the "m" mesh) for which normal derivatives are +! ****** needed (e.g., v). These vary on different processors, +! ****** depending if they are left-boundary, internal, or +! ****** right-boundary processors. +! + if (tb1) then + ntm=ntm1 + else + ntm=nt + end if +! +! ****** Since the phi dimension is periodic, all processors +! ****** have the same mesh limits. +! + npm=np +! +! ****** Store the mapping structure (for this processor). +! + allocate (map_th(0:nproc-1)) + allocate (map_tm(0:nproc-1)) + allocate (map_ph(0:nproc-1)) + allocate (map_pm(0:nproc-1)) +! + if (tb0) then + j0_h=1 + else + j0_h=2 + end if + if (tb1) then + j1_h=nt + else + j1_h=ntm1 + end if +! + if (tb0) then + j0_m=1 + else + j0_m=2 + end if + j1_m=ntm1 +! + if (iproc_p.eq.0) then + k0_m=1 + else + k0_m=2 + end if + k1_m=npm1 +! + if (iproc_p.eq.0) then + k0_h=1 + else + k0_h=2 + end if + if (iproc_p.eq.nproc_p-1) then + k1_h=np + else + k1_h=npm1 + end if +! + map_th(iproc)%i0=j0_h + map_th(iproc)%i1=j1_h +! + map_tm(iproc)%i0=j0_m + map_tm(iproc)%i1=j1_m +! + map_ph(iproc)%i0=k0_h + map_ph(iproc)%i1=k1_h +! + map_pm(iproc)%i0=k0_m + map_pm(iproc)%i1=k1_m +! + map_th(iproc)%offset=j0_g+map_th(iproc)%i0-1 + map_th(iproc)%n=map_th(iproc)%i1-map_th(iproc)%i0+1 +! + map_tm(iproc)%offset=j0_g+map_tm(iproc)%i0-1 + map_tm(iproc)%n=map_tm(iproc)%i1-map_tm(iproc)%i0+1 +! + map_ph(iproc)%offset=k0_g+map_ph(iproc)%i0-1 + map_ph(iproc)%n=map_ph(iproc)%i1-map_ph(iproc)%i0+1 +! + map_pm(iproc)%offset=k0_g+map_pm(iproc)%i0-1 + map_pm(iproc)%n=map_pm(iproc)%i1-map_pm(iproc)%i0+1 +! +! ****** Gather the mapping information by communicating among +! ****** all processors. +! + call gather_mapping_info (map_th) + call gather_mapping_info (map_tm) + call gather_mapping_info (map_ph) + call gather_mapping_info (map_pm) +! +end subroutine +!####################################################################### +subroutine check_mesh_dimensions (nr_g,nt_g,np_g) +! +!----------------------------------------------------------------------- +! +! ****** Check that the requested (global) mesh dimensions are +! ****** valid. +! +!----------------------------------------------------------------------- +! + use mpidefs +! 
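+! ****** Note: NP = 3 is the smallest allowed phi dimension; it is
+! ****** also the special value that requests an axisymmetric run
+! ****** (see DECOMPOSE_MESH_TP).
+!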
+!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: nr_g,nt_g,np_g +! +!----------------------------------------------------------------------- +! + if (nr_g.lt.4) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in CHECK_MESH_DIMENSIONS:' + write (*,*) '### Invalid number of r mesh points'// & + ' requested.' + write (*,*) '### The minimum number of mesh points is 4.' + write (*,*) + write (*,*) 'Number of mesh points requested = ',nr_g + end if + call endrun (.true.) + end if +! + if (nt_g.lt.4) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in CHECK_MESH_DIMENSIONS:' + write (*,*) '### Invalid number of theta mesh points'// & + ' requested.' + write (*,*) '### The minimum number of mesh points is 4.' + write (*,*) + write (*,*) 'Number of mesh points requested = ',nt_g + end if + call endrun (.true.) + end if +! + if (np_g.lt.3) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in CHECK_MESH_DIMENSIONS:' + write (*,*) '### Invalid number of phi mesh points'// & + ' requested.' + write (*,*) '### The minimum number of mesh points is 3.' + write (*,*) + write (*,*) 'Number of mesh points requested = ',np_g + end if + call endrun (.true.) + end if +! +end subroutine +!####################################################################### +subroutine decompose_dimension (nx,np,mp,ierr) +! +!----------------------------------------------------------------------- +! +! ****** Decompose the mesh points NX along NP processors. +! +! ****** The decomposed mesh points are returned in array MP. +! +!----------------------------------------------------------------------- +! +! ****** This routine attempts to assign the mesh points as equally +! ****** as possible between the processors. +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: nx + integer :: np + integer, dimension(np) :: mp + integer :: ierr +! +!----------------------------------------------------------------------- +! + integer :: nxm2,mpav,nrem +! +!----------------------------------------------------------------------- +! + ierr=0 +! + nxm2=nx-2 +! + if (nxm2.le.0) then + ierr=1 + return + end if +! + if (np.le.0) then + ierr=2 + return + end if +! + mpav=nxm2/np +! + mp(:)=mpav +! + nrem=nxm2-mpav*np +! + mp(1:nrem)=mp(1:nrem)+1 +! +end subroutine +!####################################################################### +subroutine check_mesh_topology (np,mp,min_pts,coord) +! +!----------------------------------------------------------------------- +! +! ****** Check the validity of the requested mesh topology. +! +!----------------------------------------------------------------------- +! + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: np + integer, dimension(np) :: mp + integer :: min_pts + character(*) :: coord +! +!----------------------------------------------------------------------- +! + integer :: i +! +!----------------------------------------------------------------------- +! +! ****** Check that the number of mesh points on each processor +! ****** is valid. +! 
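+! ****** (For reference, DECOMPOSE_DIMENSION above spreads the
+! ****** NX-2 interior points as evenly as possible; e.g., NX=12
+! ****** on NP=4 ranks gives MPAV=2 with remainder 2, so
+! ****** MP = 3,3,2,2.)
+!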
+ do i=1,np + if (mp(i).lt.min_pts) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in CHECK_MESH_TOPOLOGY:' + write (*,*) '### Mesh topology specification error.' + write (*,*) 'Invalid number of ',coord, & + ' mesh points requested.' + write (*,*) 'Processor index = ',i + write (*,*) 'Number of mesh points requested = ',mp(i) + write (*,*) 'Minimum number of mesh points allowed = ', & + min_pts + end if + call endrun (.true.) + end if + enddo +! +end subroutine +!####################################################################### +subroutine gather_mapping_info (map) +! +!----------------------------------------------------------------------- +! +! ****** Gather a mapping information array by communicating +! ****** among all processors. +! +!----------------------------------------------------------------------- +! + use mpidefs + use decomposition +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + type(map_struct), dimension(0:nproc-1) :: map +! +!----------------------------------------------------------------------- +! +! ****** Buffers for packing the data. +! + integer, parameter :: lbuf=4 + integer, dimension(lbuf) :: sbuf + integer, dimension(lbuf,0:nproc-1) :: rbuf +! +!----------------------------------------------------------------------- +! + integer :: ierr,irank +! +!----------------------------------------------------------------------- +! +! ****** Put the local section of the mapping information +! ****** array into a buffer. +! + sbuf(1)=map(iproc)%n + sbuf(2)=map(iproc)%i0 + sbuf(3)=map(iproc)%i1 + sbuf(4)=map(iproc)%offset +! +! ****** Communicate among all processors. After this call, all +! ****** processors have the complete mapping information. +! + call MPI_Allgather (sbuf,lbuf,MPI_INTEGER, & + rbuf,lbuf,MPI_INTEGER,comm_all,ierr) +! +! ****** Extract the mapping information from the buffer. +! + do irank=0,nproc-1 + map(irank)%n =rbuf(1,irank) + map(irank)%i0 =rbuf(2,irank) + map(irank)%i1 =rbuf(3,irank) + map(irank)%offset=rbuf(4,irank) + enddo +! +end subroutine +!####################################################################### +subroutine decomp_diags +! +!----------------------------------------------------------------------- +! +! ****** Print diagnostics about the mesh decomposition. +! +!----------------------------------------------------------------------- +! + use global_dims + use global_mesh + use local_dims + use local_mesh + use mpidefs + use solve_params + use debug + use decomposition +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: ierr + integer :: irank + real(r_typ) :: n_per_grid_min,n_per_grid_max +! +!----------------------------------------------------------------------- +! + if (iamp0) then +! + n_per_grid_min=floor(real(nr_g)/nproc_r) & + *floor(real(nt_g)/nproc_t) & + *floor(real(np_g)/nproc_p) +! + n_per_grid_max=ceiling(real(nr_g)/nproc_r) & + *ceiling(real(nt_g)/nproc_t) & + *ceiling(real(np_g)/nproc_p) +! 
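+! ****** The load-imbalance estimate below compares the smallest
+! ****** and largest possible per-rank point counts; e.g., if the
+! ****** largest rank holds 64x64x64 points and the smallest
+! ****** 63x63x63, the estimate is 100*(1-63**3/64**3), or about
+! ****** 4.6 %.
+!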
+ write (*,*) + write (*,*) 'Total number of MPI ranks = ',nproc + write (*,*) + write (*,*) 'Number of MPI ranks in r = ',nproc_r + write (*,*) 'Number of MPI ranks in t = ',nproc_t + write (*,*) 'Number of MPI ranks in p = ',nproc_p + write (*,*) + write (*,*) 'Global mesh dimension in r = ',nr_g + write (*,*) 'Global mesh dimension in t = ',nt_g + write (*,*) 'Global mesh dimension in p = ',np_g + write (*,*) + write (*,'(A,F6.1)') ' Average # of r mesh pts per rank = ', & + real(nr_g)/nproc_r + write (*,'(A,F6.1)') ' Average # of t mesh pts per rank = ', & + real(nt_g)/nproc_t + write (*,'(A,F6.1)') ' Average # of p mesh pts per rank = ', & + real(np_g)/nproc_p + write (*,*) + write (*,'(A,F6.2,A)') ' Estimated load imbalance = ', & + 100.0*(1.0-real(n_per_grid_min)/real(n_per_grid_max)),' %' +! + write (9,*) + write (9,*) 'Total number of MPI ranks = ',nproc + write (9,*) + write (9,*) 'Number of MPI ranks in r = ',nproc_r + write (9,*) 'Number of MPI ranks in t = ',nproc_t + write (9,*) 'Number of MPI ranks in p = ',nproc_p + write (9,*) + write (9,*) 'Global mesh dimension in r = ',nr_g + write (9,*) 'Global mesh dimension in t = ',nt_g + write (9,*) 'Global mesh dimension in p = ',np_g + write (9,*) + write (9,'(A,F6.1)') ' Average # of r mesh pts per rank = ', & + real(nr_g)/nproc_r + write (9,'(A,F6.1)') ' Average # of t mesh pts per rank = ', & + real(nt_g)/nproc_t + write (9,'(A,F6.1)') ' Average # of p mesh pts per rank = ', & + real(np_g)/nproc_p + write (9,*) + write (9,'(A,F6.2,A)') ' Estimated load imbalance = ', & + 100.0*(1.0-real(n_per_grid_min)/real(n_per_grid_max)),' %' +! + end if +! + if (idebug.le.1) return +! + do irank=0,nproc-1 + call MPI_Barrier (comm_all,ierr) + if (irank.eq.iproc) then + write (*,*) + write (*,*) + write (*,*) 'Rank id = ',iproc + write (*,*) 'nr = ',nr + write (*,*) 'nt = ',nt + write (*,*) 'np = ',np + write (*,*) 'i0_g = ',i0_g + write (*,*) 'i1_g = ',i1_g + write (*,*) 'j0_g = ',j0_g + write (*,*) 'j1_g = ',j1_g + write (*,*) 'k0_g = ',k0_g + write (*,*) 'k1_g = ',k1_g + write (*,*) 'Rank index in r = ',iproc_r + write (*,*) 'Rank index in t = ',iproc_t + write (*,*) 'Rank index in p = ',iproc_p + write (*,*) 'Rank to left in r = ',iproc_rm + write (*,*) 'Rank to right in r = ',iproc_rp + write (*,*) 'Rank to left in t = ',iproc_tm + write (*,*) 'Rank to right in t = ',iproc_tp + write (*,*) 'Rank to left in p = ',iproc_pm + write (*,*) 'Rank to right in p = ',iproc_pp + write (*,*) + write (*,*) 'Rank in MPI_COMM_WORLD = ',iprocw + write (*,*) 'Rank in COMM_ALL = ',iproc + if (idebug.gt.2) then + write (*,*) + write (*,*) 'r mesh:' + write (*,*) r + write (*,*) + write (*,*) 'theta mesh:' + write (*,*) t + write (*,*) + write (*,*) 'phi mesh:' + write (*,*) p + end if + end if + enddo +! +end subroutine +!####################################################################### +subroutine allocate_global_arrays +! +!----------------------------------------------------------------------- +! +! ****** Allocate global arrays. +! +!----------------------------------------------------------------------- +! + use global_dims + use global_mesh +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! +! ****** Allocate global mesh arrays. +! + allocate (r_g (nrm1_g)) + allocate (dr_g(nrm1_g)) +! + allocate (rh_g (nr_g)) + allocate (drh_g(nr_g)) +! + allocate (t_g (ntm1_g)) + allocate (dt_g(ntm1_g)) +! 
+ allocate (th_g (nt_g)) + allocate (dth_g(nt_g)) +! + allocate (p_g (np_g)) + allocate (dp_g (np_g)) + allocate (ph_g (np_g)) + allocate (dph_g(np_g)) +! + allocate (st_g(ntm1_g)) + allocate (ct_g(ntm1_g)) +! + allocate (sth_g(nt_g)) + allocate (cth_g(nt_g)) +! + allocate (sp_g (np_g)) + allocate (cp_g (np_g)) + allocate (sph_g(np_g)) + allocate (cph_g(np_g)) +! +end subroutine +!####################################################################### +subroutine allocate_local_arrays_r +! +!----------------------------------------------------------------------- +! +! ****** Allocate local arrays for the r dimension and 3D arrays. +! +!----------------------------------------------------------------------- +! + use local_dims + use local_mesh + use fields +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + allocate (r (nrm)) + allocate (r2 (nrm)) + allocate (dr(nrm)) +! + allocate (rh (nr)) + allocate (drh(nr)) +! +! ****** Allocate inverse quantities. +! + allocate (r_i (nrm)) + allocate (dr_i(nrm)) +! + allocate (rh_i (nr)) + allocate (drh_i(nr)) +! +! ****** Allocate the potential array and cg scratch array. +! + allocate (phi(nr,nt,np)) + allocate (x_ax(nr,nt,np)) + phi(:,:,:)=0. + x_ax(:,:,:)=0. +! +! ****** Allocate polar boundary arrays. +! + allocate (sum0(nr)) + allocate (sum1(nr)) + sum0(:)=0. + sum1(:)=0. +! +! ****** Allocate the local magnetic field arrays. +! + allocate (br(nrm,nt,np)) + allocate (bt(nr,ntm,np)) + allocate (bp(nr,nt,npm)) + br(:,:,:)=0. + bt(:,:,:)=0. + bp(:,:,:)=0. +! +end subroutine +!####################################################################### +subroutine allocate_local_arrays_tp +! +!----------------------------------------------------------------------- +! +! ****** Allocate local arrays for the theta and phi dimensions. +! +!----------------------------------------------------------------------- +! + use local_dims + use local_mesh + use fields + use vars + use solve_params +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + allocate (t (ntm)) + allocate (dt(ntm)) +! + allocate (th (nt)) + allocate (dth(nt)) +! + allocate (p (np)) + allocate (dp(np)) +! + allocate (ph (np)) + allocate (dph(np)) +! + allocate (st(ntm)) + allocate (ct(ntm)) +! + allocate (sth(nt)) + allocate (cth(nt)) +! + allocate (sp (np)) + allocate (cp (np)) + allocate (sph(np)) + allocate (cph(np)) +! +! ****** Allocate inverse quantities. +! + allocate (dt_i(ntm)) + allocate (st_i(ntm)) +! + allocate (dth_i(nt)) + allocate (sth_i(nt)) +! + allocate (dp_i (np)) + allocate (dph_i(np)) +! +! ****** Allocate the boundary radial magnetic field array(s). +! + allocate (br0(nt,np)) + br0(:,:)=0. +! + if (validation_run) then + allocate (br1(nt,np)) + br1(:,:)=0. + end if +! +end subroutine +!####################################################################### +subroutine set_global_mesh +! +!----------------------------------------------------------------------- +! +! ****** Define the global mesh arrays. +! +!----------------------------------------------------------------------- +! + use number_types + use global_dims + use global_mesh + use meshdef + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! 
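+! ****** Note: the r and theta main meshes (R_G, T_G) carry N-1
+! ****** points, while the half meshes (RH_G, TH_G) carry N points
+! ****** at the midpoints of the main-mesh intervals, plus one
+! ****** extrapolated point beyond each boundary. The periodic phi
+! ****** meshes both carry N points, with the wrap values set
+! ****** explicitly below.
+!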
+!----------------------------------------------------------------------- +! + real(r_typ), parameter :: zero=0._r_typ + real(r_typ), parameter :: one=1._r_typ + real(r_typ), parameter :: half=.5_r_typ +! +!----------------------------------------------------------------------- +! + integer :: i,j,k +! +!----------------------------------------------------------------------- +! +! ****** Define the radial mesh. +! + call genmesh (9,'r',nrm1_g,r0,r1,nmseg,rfrac,drratio,nfrmesh, & + .false.,zero,r_g) +! + do i=2,nrm1_g + rh_g(i)=.5*(r_g(i)+r_g(i-1)) + drh_g(i)=r_g(i)-r_g(i-1) + enddo + rh_g(1)=rh_g(2)-drh_g(2) + rh_g(nr_g)=rh_g(nrm1_g)+drh_g(nrm1_g) + drh_g(1)=drh_g(2) + drh_g(nr_g)=drh_g(nrm1_g) +! + do i=1,nrm1_g + dr_g(i)=rh_g(i+1)-rh_g(i) + enddo +! +! ****** Define the theta mesh. +! + call genmesh (9,'t',ntm1_g,t0,t1,nmseg,tfrac,dtratio, & + nftmesh,.false.,zero,t_g) +! + do j=2,ntm1_g + th_g(j)=.5*(t_g(j)+t_g(j-1)) + dth_g(j)=t_g(j)-t_g(j-1) + enddo + th_g(1)=th_g(2)-dth_g(2) + th_g(nt_g)=th_g(ntm1_g)+dth_g(ntm1_g) + dth_g(1)=dth_g(2) + dth_g(nt_g)=dth_g(ntm1_g) +! + do j=1,ntm1_g + dt_g(j)=th_g(j+1)-th_g(j) + enddo +! +! ****** Define the periodic phi mesh. +! + call genmesh (9,'p',npm1_g,p0,p1,nmseg,pfrac,dpratio, & + nfpmesh,.true.,phishift,p_g) + p_g(np_g)=p_g(2)+pl +! + do k=2,np_g + ph_g(k)=half*(p_g(k)+p_g(k-1)) + dph_g(k)=p_g(k)-p_g(k-1) + enddo + ph_g(1)=ph_g(npm1_g)-pl + dph_g(1)=dph_g(npm1_g) +! + do k=1,npm1_g + dp_g(k)=ph_g(k+1)-ph_g(k) + enddo + dp_g(np_g)=dp_g(2) +! +! ****** Enforce exact periodicity to protect symmetry properties +! ****** from round-off errors (especially for axisymmetric cases). +! + dph_g(np_g)=dph_g(2) + dp_g(1)=dp_g(npm1_g) +! +! ****** Define global auxiliary mesh-related arrays. +! + st_g(:)=sin(t_g(:)) + ct_g(:)=cos(t_g(:)) + sth_g(:)=sin(th_g(:)) + cth_g(:)=cos(th_g(:)) +! + sp_g(:)=sin(p_g(:)) + cp_g(:)=cos(p_g(:)) + sph_g(:)=sin(ph_g(:)) + cph_g(:)=cos(ph_g(:)) +! +! ****** For an axisymmetric case, set the exact values of +! ****** sin(phi) and cos(phi) to preserve symmetry properties +! ****** in the presence of round-off errors. +! + if (axisymmetric) then + sp_g(2)=0. + cp_g(2)=one + sph_g(2)=0. + cph_g(2)=-one + end if +! +! ****** Enforce exact periodicity to protect symmetry properties +! ****** from round-off errors (especially for axisymmetric cases). +! + sph_g(1)=sph_g(npm1_g) + sph_g(np_g)=sph_g(2) + cph_g(1)=cph_g(npm1_g) + cph_g(np_g)=cph_g(2) + sp_g(1)=sp_g(npm1_g) + sp_g(np_g)=sp_g(2) + cp_g(1)=cp_g(npm1_g) + cp_g(np_g)=cp_g(2) +! +end subroutine +!####################################################################### +subroutine set_local_mesh_r +! +!----------------------------------------------------------------------- +! +! ****** Define the local r mesh arrays. +! +!----------------------------------------------------------------------- +! + use number_types + use global_dims + use global_mesh + use local_dims + use local_mesh +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: one=1._r_typ +! +!----------------------------------------------------------------------- +! + integer :: i +! +!----------------------------------------------------------------------- +! +! ****** Define the local meshes. +! + do i=1,nrm + r(i)=r_g(i0_g+i-1) + dr(i)=dr_g(i0_g+i-1) + enddo +! + dr1=dr(1) + drn=dr(nrm) +! + do i=1,nr + rh(i)=rh_g(i0_g+i-1) + drh(i)=drh_g(i0_g+i-1) + enddo +! +! 
****** Define local auxiliary mesh-related arrays. +! + r2(:)=r(:)**2 + r_i(:)=one/r(:) + dr_i(:)=one/dr(:) + rh_i(:)=one/rh(:) + drh_i(:)=one/drh(:) +! +end subroutine +!####################################################################### +subroutine set_local_mesh_tp +! +!----------------------------------------------------------------------- +! +! ****** Define the local theta and phi mesh arrays. +! +!----------------------------------------------------------------------- +! + use number_types + use global_dims + use global_mesh + use local_dims + use local_mesh +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: one=1._r_typ +! +!----------------------------------------------------------------------- +! + integer :: j,k,j0,j1 +! +!----------------------------------------------------------------------- +! +! ****** Define the local meshes. +! + do j=1,ntm + t(j)=t_g(j0_g+j-1) + dt(j)=dt_g(j0_g+j-1) + enddo +! + do j=1,nt + th(j)=th_g(j0_g+j-1) + dth(j)=dth_g(j0_g+j-1) + enddo +! + do k=1,npm + p(k)=p_g(k0_g+k-1) + dp(k)=dp_g(k0_g+k-1) + enddo +! + do k=1,np + ph(k)=ph_g(k0_g+k-1) + dph(k)=dph_g(k0_g+k-1) + enddo +! +! ****** Define local auxiliary mesh-related arrays. +! + do j=1,ntm + st(j)=st_g(j0_g+j-1) + ct(j)=ct_g(j0_g+j-1) + enddo +! + do j=1,nt + sth(j)=sth_g(j0_g+j-1) + cth(j)=cth_g(j0_g+j-1) + enddo +! + do k=1,npm + sp(k)=sp_g(k0_g+k-1) + cp(k)=cp_g(k0_g+k-1) + enddo +! + do k=1,np + sph(k)=sph_g(k0_g+k-1) + cph(k)=cph_g(k0_g+k-1) + enddo +! + dt_i(:)=one/dt(:) + dth_i(:)=one/dth(:) + sth_i(:)=one/sth(:) + dp_i(:)=one/dp(:) + dph_i(:)=one/dph(:) +! +! ****** Prevent division by zero at the poles for sin(theta). +! + if (tb0) then + j0=2 + else + j0=1 + end if + if (tb1) then + j1=ntm1-1 + else + j1=ntm1 + end if +! + st_i(:)=0. + do j=j0,j1 + st_i(j)=one/st(j) + enddo +! +end subroutine +!####################################################################### +subroutine genmesh (io,label,nc,c0,c1,nseg,frac,dratio,nfilt,periodic, & + c_shift,c) +! +!----------------------------------------------------------------------- +! +! ****** Generate a one-dimensional mesh. +! +!----------------------------------------------------------------------- +! +! ****** Input arguments: +! +! IO : [integer] +! Fortran file unit number to which to write +! mesh diagnostics. Set IO=0 if diagnostics +! are not of interest. It is assumed that +! unit IO has been connected to a file prior +! to calling this routine. +! +! LABEL : [character(*)] +! Name for the mesh coordinate (example: 'x'). +! +! NC : [integer] +! Number of mesh points to load. +! +! C0 : [real] +! The starting location for the coordinate. +! +! C1 : [real] +! The ending location for the coordinate. +! It is required that C1.gt.C0. +! +! NSEG : [integer] +! Maximum number of mesh segments. +! The mesh spacing in each segment varies +! exponentially with a uniform amplification +! factor. The actual number of mesh segments +! used is NSEG or less. It is obtained from the +! information in array FRAC. +! +! FRAC : [real array, dimension NSEG] +! The normalized positions of the mesh segment +! boundaries (as a fraction of the size of the +! domain). For a non-periodic mesh, the first +! value of FRAC specified must equal 0. and the +! last value must equal 1. For a periodic mesh, +! FRAC must not contain both 0. and 1., since +! these represent the same point. +! +! 
DRATIO : [real array, dimension NSEG] +! The ratio of the mesh spacing at the end of a +! segment to that at the beginning. +! +! NFILT : [integer] +! The number of times to filter the mesh-point +! distribution array. Set NFILT=0 if filtering +! is not desired. Filtering can reduce +! discontinuities in the derivative of the mesh +! spacing. +! +! PERIODIC: [logical] +! A flag to indicate whether the mesh to be +! generated represents a periodic coordinate. +! If the coordinate is specified as periodic, +! the range [C0,C1] should be the whole periodic +! interval; the first mesh point is set at C0 +! and the last mesh point, C(NC), is set at C1. +! +! C_SHIFT : [real] +! Amount by which to shift the periodic coordinate. +! C_SHIFT is only used when PERIODIC=.true., +! and is ignored otherwise. A positive C_SHIFT +! moves the mesh points to the right. +! +! ****** Output arguments: +! +! C : [real array, dimension NC] +! The locations of the mesh points. +! +!----------------------------------------------------------------------- +! +! ****** The arrays DRATIO and FRAC define the mesh as follows. +! +! ****** For example, suppose that a (non-periodic) mesh with three +! ****** segments is desired. Suppose the domain size is c=[0:2]. +! ****** In the first segment (with c between 0 and .5) the mesh +! ****** spacing is decreasing with c, such that DC at c=.5 is half +! ****** DC at c=0. From c=.5 to c=1, the mesh is uniform. From c=1 +! ****** to c=2, the mesh spacing is increasing with c such that DC at +! ****** c=2 is 10 times DC at c=1. This mesh would be specified by: +! ****** +! ****** FRAC=0.,.25,.5,1. +! ****** DRATIO=.5,1.,10. +! ****** +! ****** The variable C_SHIFT can be used to shift the mesh point +! ****** distribution for a periodic coordinate. For example, +! ****** suppose C represents mesh points in the interval [0,2*pi]. +! ****** C_SHIFT=.5*pi would move the distribution of mesh points +! ****** so that the original mesh point with C(1)=0. would be +! ****** close to .5*pi in the new mesh. +! +!----------------------------------------------------------------------- +! + use number_types + use mpidefs + use debug +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer, intent(in) :: io + character(*), intent(in) :: label + integer, intent(in) :: nc + real(r_typ), intent(in) :: c0,c1 + integer, intent(in) :: nseg + real(r_typ), dimension(nseg), intent(in) :: frac,dratio + integer, intent(in) :: nfilt + logical, intent(in) :: periodic + real(r_typ), intent(in) :: c_shift + real(r_typ), dimension(nc), intent(out) :: c +! +!----------------------------------------------------------------------- +! +! ****** Storage for the coordinate transformation. +! + integer :: ns + real(r_typ), dimension(:), allocatable :: xi,cs,a,r,f +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: zero=0._r_typ + real(r_typ), parameter :: one=1._r_typ + real(r_typ), parameter :: half=.5_r_typ + real(r_typ), parameter :: eps=1.e-5_r_typ +! +!----------------------------------------------------------------------- +! + integer :: i,j,nf,nr,ll,j0 + real(r_typ) :: alpha,dr,fac,d,dxi,xiv,cshft,xi_shift + real(r_typ), dimension(:), allocatable :: dc,rdc +! +!----------------------------------------------------------------------- +! +! ****** Check that the number of mesh points is valid. +! 
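+! ****** (For reference, SET_GLOBAL_MESH generates the radial mesh
+! ****** via
+! ******   call genmesh (9,'r',nrm1_g,r0,r1,nmseg,rfrac,drratio, &
+! ******                 nfrmesh,.false.,zero,r_g)
+! ****** with analogous calls for theta and the periodic phi mesh.)
+!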
+ if (nc.lt.2) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in GENMESH:' + write (*,*) '### Invalid number of mesh points specified.' + write (*,*) '### There must be at least two mesh points.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'Number of mesh points specified = ',nc + end if + call endrun (.true.) + end if +! +! ****** Check that a positive mesh interval has been specified. +! + if (c0.ge.c1) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in GENMESH:' + write (*,*) '### Invalid mesh interval specified.' + write (*,*) '### C1 must be greater than C0.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'C0 = ',c0 + write (*,*) 'C1 = ',c1 + end if + call endrun (.true.) + end if +! +! ****** Find the number of values of FRAC specified. +! + do nf=1,nseg-1 + if (frac(nf+1).eq.zero) exit + enddo +! +! ****** When no values have been specified (NF=1, the default), +! ****** a uniform mesh is produced. +! + if (nf.eq.1.and.frac(1).eq.zero) then + ns=1 + allocate (cs(ns+1)) + allocate (r(ns)) + cs(1)=c0 + cs(2)=c1 + r(1)=one + go to 100 + end if +! +! ****** Check that the specified values of FRAC are monotonically +! ****** increasing. +! + do i=2,nf + if (frac(i).lt.frac(i-1)) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in GENMESH:' + write (*,*) '### Invalid mesh specification.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'The values in FRAC must increase'// & + ' monotonically.' + write (*,*) 'FRAC = ',frac(1:nf) + end if + call endrun (.true.) + end if + enddo +! +! ****** Check the specified values of FRAC. +! + if (periodic) then +! +! ****** A periodic mesh requires the specified values of FRAC +! ****** to be in the range 0. to 1. +! + if (frac(1).lt.zero.or.frac(nf).gt.one) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in GENMESH:' + write (*,*) '### Invalid mesh specification.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'For a periodic coordinate, the values in'// & + ' FRAC must be between 0. and 1.' + write (*,*) 'FRAC = ',frac(1:nf) + end if + call endrun (.true.) + end if +! +! ****** A periodic mesh cannot contain both 0. and 1. in FRAC, +! ****** since these represent the same point. +! + if (frac(1).eq.zero.and.frac(nf).eq.one) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in GENMESH:' + write (*,*) '### Invalid mesh specification.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'For a periodic coordinate, FRAC must not'// & + ' contain both 0. and 1.' + write (*,*) 'FRAC = ',frac(1:nf) + end if + call endrun (.true.) + end if +! + else +! +! ****** A non-periodic mesh requires the first specified value +! ****** of FRAC to be 0., and the last value to equal 1. +! + if (frac(1).ne.zero) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in GENMESH:' + write (*,*) '### Invalid mesh specification.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'For a non-periodic coordinate, the first'// & + ' value of FRAC must equal 0.' + write (*,*) 'FRAC = ',frac(1:nf) + end if + call endrun (.true.) + end if +! + if (frac(nf).ne.one) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in GENMESH:' + write (*,*) '### Invalid mesh specification.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'For a non-periodic coordinate, the last'// & + ' value of FRAC must equal 1.' + write (*,*) 'FRAC = ',frac(1:nf) + end if + call endrun (.true.) + end if +! + end if +! +! ****** Check that the required values of DRATIO have been set, +! 
****** and are positive. +! + if (periodic) then + nr=nf + else + nr=nf-1 + end if +! + do i=1,nr + if (dratio(i).le.zero) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in GENMESH:' + write (*,*) '### Invalid mesh specification.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'A required value in DRATIO has not been'// & + ' set or is not positive.' + write (*,*) 'DRATIO = ',dratio(1:nr) + end if + call endrun (.true.) + end if + enddo +! +! ****** Check that an inherently discontinuous mesh has not been +! ****** specified inadvertently. +! + if (periodic.and.nr.eq.1.and.dratio(1).ne.one) then + if (iamp0) then + write (*,*) + write (*,*) '### WARNING from GENMESH:' + write (*,*) '### Discontinuous mesh specification.' + write (*,*) 'Mesh coordinate: ',label + write (*,*) 'An inherently discontinuous mesh has been'// & + ' specified.' + write (*,*) 'FRAC = ',frac(1:nf) + write (*,*) 'DRATIO = ',dratio(1:nr) + end if + end if +! +! ****** Set the number of segments. +! + ns=nf-1 +! +! ****** For a periodic coordinate, add points at XI=0. and XI=1. +! ****** if they are not already present. +! + if (periodic) then + if (frac(1).ne.zero) ns=ns+1 + if (frac(nf).ne.one) ns=ns+1 + end if +! + allocate (cs(ns+1)) + allocate (r(ns)) +! +! ****** Set up the coordinate limits of the segments. +! + if (periodic) then + if (frac(1).ne.zero) then + cs(1)=c0 + cs(2:nf+1)=c0+(c1-c0)*frac(1:nf) + if (frac(nf).ne.one) then + alpha=(one-frac(nf))/(frac(1)+one-frac(nf)) + r(1)=dratio(nr)/(one+alpha*(dratio(nr)-one)) + r(2:nr+1)=dratio(1:nr) + cs(ns+1)=c1 + r(ns)=one+alpha*(dratio(nr)-one) + else + r(1)=dratio(nr) + r(2:nr)=dratio(1:nr-1) + end if + else + cs(1:nf)=c0+(c1-c0)*frac(1:nf) + r(1:nr)=dratio(1:nr) + cs(ns+1)=c1 + end if + else + cs(1:nf)=c0+(c1-c0)*frac(1:nf) + r(1:nr)=dratio(1:nr) + end if +! + 100 continue +! + allocate (xi(ns+1)) + allocate (a(ns)) + allocate (f(ns)) +! +! ****** Compute the XI values at the segment limits. +! + do i=1,ns + dr=r(i)-one + if (abs(dr).lt.eps) then + f(i)=(cs(i+1)-cs(i))*(one+half*dr) + else + f(i)=(cs(i+1)-cs(i))*log(r(i))/dr + end if + enddo +! + fac=zero + do i=ns,1,-1 + fac=fac/r(i)+f(i) + enddo +! + d=f(1)/fac + xi(1)=zero + do i=2,ns + xi(i)=xi(i-1)+d + if (i.lt.ns) d=d*f(i)/(f(i-1)*r(i-1)) + enddo + xi(ns+1)=one +! +! ****** Set the amplification factor for each segment. +! + do i=1,ns + a(i)=log(r(i))/(xi(i+1)-xi(i)) + enddo +! +! ****** For a periodic coordinate, find the XI shift corresponding +! ****** to a shift of C_SHIFT in the coordinate. +! ****** Note that a positive value of C_SHIFT moves the mesh +! ****** points to the right. +! + if (periodic) then + cshft=-c_shift + call map_c_to_xi (periodic,ns,xi,cs,a,r,cshft,xi_shift) + else + xi_shift=0. + end if +! +! ****** Compute the location of the mesh points in array C +! ****** by mapping from the XI values. +! + dxi=one/(nc-one) +! + c(1)=c0 + do j=2,nc-1 + xiv=(j-1)*dxi + call map_xi_to_c (periodic,ns,xi,cs,a,r,cshft,xi_shift,xiv,c(j)) + enddo + c(nc)=c1 +! +! ****** Filter the mesh if requested. +! + if (nfilt.gt.0) then + do i=1,nfilt + if (periodic) then + call filter_coord_periodic (c1-c0,nc,c) + else + call filter_coord (nc,c) + end if + enddo + end if +! +! ****** Write out the mesh information. +! + if (io.gt.0.and.iamp0) then +! + write (io,*) + write (io,*) '### COMMENT from GENMESH:' + write (io,*) '### Mesh information for coordinate ',label,':' +! 
+ if (idebug.gt.0) then + write (io,*) + write (io,*) 'Flag to indicate a periodic mesh: ',periodic + write (io,*) 'Number of mesh points = ',nc + write (io,*) 'Lower mesh limit = ',c0 + write (io,*) 'Upper mesh limit = ',c1 + write (io,*) 'Number of times to filter the mesh = ',nfilt + if (periodic) then + write (io,*) 'Amount to shift the mesh = ',c_shift + end if + end if +! + write (io,*) + write (io,*) 'Number of mesh segments = ',ns +! + ll=len_trim(label) +! + write (io,900) 'Segment xi0 xi1'// & + repeat (' ',16-ll)//label//'0'// & + repeat (' ',16-ll)//label//'1'// & + ' ratio' + do i=1,ns + write (io,910) i,xi(i),xi(i+1),cs(i),cs(i+1),r(i) + enddo +! + allocate (dc(nc)) + allocate (rdc(nc)) +! + dc=c-cshift(c,-1) + if (periodic) dc(1)=dc(nc) + rdc=dc/cshift(dc,-1) + if (periodic) rdc(1)=rdc(nc) +! + write (io,*) + write (io,920) 'Mesh-point locations:' + write (io,920) ' i'// & + repeat (' ',18-ll)//label// & + repeat (' ',17-ll)//'d'//label// & + ' ratio' +! + if (periodic) then + j0=1 + else + j0=3 + write (io,930) 1,c(1) + write (io,930) 2,c(2),dc(2) + end if + do j=j0,nc + write (io,930) j,c(j),dc(j),rdc(j) + enddo +! + deallocate (dc) + deallocate (rdc) +! + end if +! + 900 format (/,tr1,a) + 910 format (tr1,i4,2x,2f10.6,4f17.8) + 920 format (tr1,a) + 930 format (tr1,i6,3f18.8) +! + deallocate (cs) + deallocate (r) + deallocate (xi) + deallocate (a) + deallocate (f) +! +end subroutine +!####################################################################### +subroutine map_xi_to_c (periodic,ns,xi,cs,a,r,cshft,xi_shift,xiv,cv) +! +!----------------------------------------------------------------------- +! +! ****** Get the mesh coordinate value CV for the specified +! ****** xi value XIV. +! +! ****** Set PERIODIC=.true. for a periodic coordinate. +! ****** NS is the number of segments in the mesh definition. +! ****** The arrays XI, CS, A, and R define the mesh mapping. +! +! ****** CSHFT represents the amount to shift a periodic coordinate. +! ****** XI_SHIFT represents the corresponding amount to shift xi. +! +! ****** This is a utility routine for GENMESH. +! +!----------------------------------------------------------------------- +! + use number_types + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + logical, intent(in) :: periodic + integer, intent(in) :: ns + real(r_typ), dimension(ns+1), intent(in) :: xi,cs + real(r_typ), dimension(ns), intent(in) :: a,r + real(r_typ), intent(in) :: cshft,xi_shift + real(r_typ), intent(in) :: xiv + real(r_typ), intent(out) :: cv +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: zero=0._r_typ + real(r_typ), parameter :: one=1._r_typ + real(r_typ), parameter :: half=.5_r_typ + real(r_typ), parameter :: eps=1.e-5_r_typ +! +!----------------------------------------------------------------------- +! + integer :: i + real(r_typ) :: xiv_p,d,d1,da,da1,fac +! +!----------------------------------------------------------------------- +! + real(r_typ), external :: fold +! +!----------------------------------------------------------------------- +! +! ****** Find the index of the segment to which XIV belongs. +! + if (periodic) then +! +! ****** Shift XIV by XI_SHIFT. +! + xiv_p=xiv+xi_shift +! +! ****** Fold XIV_P into the main interval. +! + xiv_p=fold(zero,one,xiv_p) +! + else +! + xiv_p=xiv +! + end if +! 
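+! ****** Within a single segment I, the mapping evaluated below
+! ****** reduces to an exponential stretch of the form (a sketch of
+! ****** the identity used, not additional code):
+! ******
+! ******    CV = CS(I) + (CS(I+1)-CS(I))
+! ******               *(exp(A(I)*(XIV_P-XI(I)))-1)/(R(I)-1),
+! ******
+! ****** with the nearly-uniform case |A(I)*D1|.lt.EPS expanded to
+! ****** second order to avoid roundoff.
+!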
+      do i=1,ns
+        if (xiv_p.ge.xi(i).and.xiv_p.le.xi(i+1)) exit
+      enddo
+!
+      if (i.gt.ns) then
+        if (iamp0) then
+          write (*,*)
+          write (*,*) '### ERROR in MAP_XI_TO_C:'
+          write (*,*) '### Error in finding the XI segment.'
+          write (*,*) '### Could not find XIV in the XI table.'
+          write (*,*) '[Utility routine for GENMESH.]'
+          write (*,*) '[This is an internal error.]'
+          write (*,*) 'XI = ',xi
+          write (*,*) 'XIV = ',xiv
+          write (*,*) 'XIV_P = ',xiv_p
+        end if
+        call endrun (.true.)
+      end if
+!
+      d =xiv_p   -xi(i)
+      d1=xi(i+1)-xi(i)
+!
+      da =a(i)*d
+      da1=a(i)*d1
+!
+! ****** Interpolate the mapping function at XIV_P.
+!
+      if (abs(da1).lt.eps) then
+        fac=(d*(one+half*da))/(d1*(one+half*da1))
+      else
+        fac=(exp(da)-one)/(r(i)-one)
+      end if
+!
+      cv=cs(i)+(cs(i+1)-cs(i))*fac
+!
+      if (periodic) then
+!
+! ****** Shift CV by the amount CSHFT.
+!
+        cv=cv-cshft
+!
+! ****** Fold CV into the main interval.
+!
+        cv=fold(cs(1),cs(ns+1),cv)
+!
+      end if
+!
+end subroutine
+!#######################################################################
+subroutine map_c_to_xi (periodic,ns,xi,cs,a,r,cv,xiv)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Get the xi value XIV for the specified coordinate value CV.
+!
+! ****** Set PERIODIC=.true. for a periodic coordinate.
+! ****** NS is the number of segments in the mesh definition.
+! ****** The arrays XI, CS, A, and R define the mesh mapping.
+!
+! ****** This is a utility routine for GENMESH.
+!
+!-----------------------------------------------------------------------
+!
+      use number_types
+      use mpidefs
+!
+!-----------------------------------------------------------------------
+!
+      implicit none
+!
+!-----------------------------------------------------------------------
+!
+      logical, intent(in) :: periodic
+      integer, intent(in) :: ns
+      real(r_typ), dimension(ns+1), intent(in) :: xi,cs
+      real(r_typ), dimension(ns), intent(in) :: a,r
+      real(r_typ), intent(in) :: cv
+      real(r_typ), intent(out) :: xiv
+!
+!-----------------------------------------------------------------------
+!
+      real(r_typ), parameter :: one=1._r_typ
+      real(r_typ), parameter :: eps=1.e-5_r_typ
+!
+!-----------------------------------------------------------------------
+!
+      integer :: i
+      real(r_typ) :: cv_p,d,da,fac
+!
+!-----------------------------------------------------------------------
+!
+      real(r_typ), external :: fold
+!
+!-----------------------------------------------------------------------
+!
+! ****** Find the index of the segment to which CV belongs.
+!
+      if (periodic) then
+!
+! ****** Fold CV_P into the main interval.
+!
+        cv_p=fold(cs(1),cs(ns+1),cv)
+!
+      else
+!
+        cv_p=cv
+!
+      end if
+!
+      do i=1,ns
+        if (cv_p.ge.cs(i).and.cv_p.le.cs(i+1)) exit
+      enddo
+!
+      if (i.gt.ns) then
+        if (iamp0) then
+          write (*,*)
+          write (*,*) '### ERROR in MAP_C_TO_XI:'
+          write (*,*) '### Error in finding the CS segment.'
+          write (*,*) '### Could not find CV in the CS table.'
+          write (*,*) '[Utility routine for GENMESH.]'
+          write (*,*) '[This is an internal error.]'
+          write (*,*) 'CS = ',cs
+          write (*,*) 'CV = ',cv
+          write (*,*) 'CV_P = ',cv_p
+        end if
+        call endrun (.true.)
+      end if
+!
+      d=(cv_p-cs(i))/(cs(i+1)-cs(i))
+      da=(r(i)-one)*d
+!
+! ****** Interpolate the mapping function at CV_P.
+!
+      if (abs(da).lt.eps) then
+        fac=d*(xi(i+1)-xi(i))
+      else
+        fac=log(da+one)/a(i)
+      end if
+!
+      xiv=xi(i)+fac
+!
+end subroutine
+!#######################################################################
+subroutine filter_coord (n,f)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Apply a "(1,2,1)/4" low-pass digital filter to a
+! ****** 1D coordinate.
+!
+! ****** The end-points F(1) and F(N) are not changed.
+!
+!-----------------------------------------------------------------------
+!
+      use number_types
+!
+!-----------------------------------------------------------------------
+!
+      implicit none
+!
+!-----------------------------------------------------------------------
+!
+      integer :: n
+      real(r_typ), dimension(n) :: f
+!
+!-----------------------------------------------------------------------
+!
+      real(r_typ), parameter :: two=2._r_typ
+      real(r_typ), parameter :: quarter=.25_r_typ
+!
+!-----------------------------------------------------------------------
+!
+      real(r_typ), dimension(n) :: ff
+!
+!-----------------------------------------------------------------------
+!
+      integer :: i
+!
+!-----------------------------------------------------------------------
+!
+! ****** Make a copy of the function.
+!
+      ff=f
+!
+! ****** Apply the filter.
+!
+      do i=2,n-1
+        f(i)=quarter*(ff(i-1)+two*ff(i)+ff(i+1))
+      enddo
+!
+end subroutine
+!#######################################################################
+subroutine filter_coord_periodic (xl,n,f)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Apply a "(1,2,1)/4" low-pass digital filter to a
+! ****** periodic 1D coordinate.
+!
+!-----------------------------------------------------------------------
+!
+! ****** XL is the periodic interval for the coordinate.
+!
+! ****** The filtered coordinate is translated so that F(1)
+! ****** is preserved.
+!
+!-----------------------------------------------------------------------
+!
+      use number_types
+!
+!-----------------------------------------------------------------------
+!
+      implicit none
+!
+!-----------------------------------------------------------------------
+!
+      real(r_typ) :: xl
+      integer :: n
+      real(r_typ), dimension(n) :: f
+!
+!-----------------------------------------------------------------------
+!
+      real(r_typ), parameter :: two=2._r_typ
+      real(r_typ), parameter :: quarter=.25_r_typ
+!
+!-----------------------------------------------------------------------
+!
+      real(r_typ), dimension(0:n+1) :: ff
+!
+!-----------------------------------------------------------------------
+!
+      integer :: i
+      real(r_typ) :: f1old,f1new
+!
+!-----------------------------------------------------------------------
+!
+! ****** Save the value of F(1).
+!
+      f1old=f(1)
+!
+! ****** Make a periodic copy of the function.
+!
+      ff(1:n)=f(:)
+!
+      ff(0)=f(n-1)-xl
+      ff(n+1)=f(2)+xl
+!
+! ****** Apply the filter.
+!
+      do i=1,n
+        f(i)=quarter*(ff(i-1)+two*ff(i)+ff(i+1))
+      enddo
+!
+! ****** Translate F so that F(1) is preserved.
+!
+      f1new=f(1)
+      do i=1,n
+        f(i)=f(i)-f1new+f1old
+      enddo
+!
+end subroutine
+!#######################################################################
+function fold (x0,x1,x)
+!
+!-----------------------------------------------------------------------
+!
+! ****** "Fold" X into the periodic interval [X0,X1].
+!
+! ****** The function value FOLD satisfies X0.le.FOLD.lt.X1.
+!
+!-----------------------------------------------------------------------
+!
+! ****** It is assumed that X0 does not equal X1, as is physically
+! ****** necessary.  If X0 and X1 are equal, the routine just
+! ****** returns with FOLD=X.
+!
+!-----------------------------------------------------------------------
+!
+      use number_types
+!
+!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ) :: fold + real(r_typ) :: x0,x1,x +! +!----------------------------------------------------------------------- +! + real(r_typ) :: xl +! +!----------------------------------------------------------------------- +! + fold=x +! + if (x0.eq.x1) return +! + xl=x1-x0 +! + fold=mod(x-x0,xl)+x0 +! + if (fold.lt.x0) fold=fold+xl + if (fold.ge.x1) fold=fold-xl +! + return +end function +!####################################################################### +subroutine set_flux +! +!----------------------------------------------------------------------- +! +! ****** Set the radial magnetic field at the photosphere. +! +!----------------------------------------------------------------------- +! + use number_types + use global_dims + use global_mesh + use local_dims + use local_mesh + use fields + use vars + use mpidefs + use timing +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! +! ****** Global flux array. +! + real(r_typ), dimension(:,:), allocatable :: br0_g +! +!----------------------------------------------------------------------- +! + integer :: j,k,ierr +! +!----------------------------------------------------------------------- +! + call timer_on +! + allocate (br0_g(nt_g,np_g)) +! +! ****** Define the global flux array. +! +! ****** Read the flux from file BR0FILE (only on processor IPROC0). +! + if (iamp0) then + call readbr (br0file,br0_g,ierr) + end if + call check_error_on_p0 (ierr) +! +! ****** Broadcast BR0_G to all the processors. +! + call MPI_Bcast (br0_g,nt_g*np_g,ntype_real,0,comm_all,ierr) +! +! ****** For a fully open field, reverse negative Br +! ****** (i.e., use the monopole trick). +! + if (option.eq.'open') then +! +! ****** Write the boundary flux (before the sign flip) to a file +! ****** if requested. +! + if (iamp0) then + if (br_photo_original_file.ne.'') then + write (*,*) + write (*,*) '### COMMENT from SET_FLUX:' + write (*,*) + write (*,*) 'Writing BR0 (before sign flip) to file: ', & + trim(br_photo_original_file) + write (9,*) + write (9,*) '### COMMENT from SET_FLUX:' + write (9,*) + write (9,*) 'Writing BR0 (before sign flip) to file: ', & + trim(br_photo_original_file) + call wrhdf_2d (br_photo_original_file,.true.,nt_g,np_g, & + br0_g,th_g,ph_g,hdf32,ierr) + end if + end if +! +! ****** Reverse Br. +! + br0_g(:,:)=abs(br0_g(:,:)) +! + end if +! +! ****** Write the boundary flux to a file if requested. +! + if (iamp0) then + if (br_photo_file.ne.' ') then + write (*,*) + write (*,*) '### COMMENT from SET_FLUX:' + write (*,*) + write (*,*) 'Writing BR0 to file: ',trim(br_photo_file) + write (9,*) + write (9,*) '### COMMENT from SET_FLUX:' + write (9,*) + write (9,*) 'Writing BR0 to file: ',trim(br_photo_file) + call wrhdf_2d (br_photo_file,.true.,nt_g,np_g, & + br0_g,th_g,ph_g,hdf32,ierr) + end if + end if +! + do j=1,nt + do k=1,np + br0(j,k)=br0_g(j0_g+j-1,k0_g+k-1) + enddo + enddo +! + deallocate(br0_g) +! + call timer_off (t_io) +! +end subroutine +!####################################################################### +subroutine potfld +! +!----------------------------------------------------------------------- +! +! ****** Find the potential field solution. +! +!----------------------------------------------------------------------- +! 
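+! ****** A sketch of what follows: the potential PHI satisfies
+! ****** del-squared PHI = 0 in the interior, with d(PHI)/dr fixed
+! ****** by the boundary flux (see SET_BOUNDARY_POINTS).  The
+! ****** homogeneous interior system is solved with preconditioned
+! ****** CG, and the boundary contribution enters through the
+! ****** right-hand side via the "boundary trick" noted below.
+!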
+      use number_types
+      use local_dims
+      use local_mesh
+      use fields
+      use cgcom
+      use solve_params
+      use mpidefs
+      use debug
+      use timing
+      use matrix_storage_pot3d_solve
+!
+!-----------------------------------------------------------------------
+!
+      implicit none
+!
+!-----------------------------------------------------------------------
+!
+      real(r_typ), parameter :: one=1._r_typ
+!
+!-----------------------------------------------------------------------
+!
+      integer :: ierr,nrm2,ntm2,npm2,i
+!
+      real(r_typ), dimension(:), allocatable :: rhs_cg,x_cg
+!
+!-----------------------------------------------------------------------
+!
+! ****** Load matrix and preconditioner.
+!
+      nrm2=nrm1-1
+      ntm2=ntm1-1
+      npm2=npm1-1
+!
+      a_offsets(1)=-nrm2*ntm2
+      a_offsets(2)=-nrm2
+      a_offsets(3)=-1
+      a_offsets(4)= 0
+      a_offsets(5)= 1
+      a_offsets(6)= nrm2
+      a_offsets(7)= nrm2*ntm2
+!
+! ****** Allocate cg 1D vectors.
+!
+      N=nrm2*ntm2*npm2
+!
+! ****** Prepare the guess and RHS for the solve.
+!
+      allocate(rhs_cg(N))
+      allocate(x_cg(N))
+      rhs_cg(:)=0.
+      x_cg(:)=0.
+!
+      call getM (N,a_offsets,M)
+      call alloc_pot3d_matrix_coefs
+      call load_matrix_pot3d_solve
+      call load_preconditioner_pot3d_solve
+!
+! ****** Use a trick to accumulate the contribution of the
+! ****** boundary conditions (i.e., the inhomogeneous part).
+!
+      call set_boundary_points (x_ax,one)
+      call seam_hhh (x_ax)
+      call delsq (x_ax,rhs_cg)
+!
+! ****** The original RHS is zero, so the RHS of the solve is just
+! ****** the negative of the boundary-trick contributions:
+!
+      do concurrent (i=1:N)
+        rhs_cg(i)=-rhs_cg(i)
+      enddo
+!
+! ****** Solve for the potential.
+!
+      if (idebug.gt.0.and.iamp0) then
+        write (*,*)
+        write (*,*) '### COMMENT from POTFLD:'
+        write (*,*) '### Doing a solution:'
+      end if
+!
+      call solve (x_cg,rhs_cg,N,ierr)
+!
+      if (ierr.ne.0) then
+        call endrun (.true.)
+      end if
+!
+      call unpack_scalar (phi,x_cg)
+!
+      call set_boundary_points (phi,one)
+      call seam_hhh (phi)
+!
+      call dealloc_pot3d_matrix_coefs
+      deallocate(rhs_cg)
+      deallocate(x_cg)
+!
+end subroutine
+!#######################################################################
+subroutine set_validation_flux
+!
+!-----------------------------------------------------------------------
+!
+! ****** Set the radial magnetic field at the photosphere.
+! ****** This uses the tilted dipole analytic solution for validation.
+!
+!-----------------------------------------------------------------------
+!
+      use number_types
+      use global_dims
+      use global_mesh
+      use local_dims
+      use local_mesh
+      use fields
+      use vars
+      use mpidefs
+!
+!-----------------------------------------------------------------------
+!
+      implicit none
+!
+!-----------------------------------------------------------------------
+!
+      real(r_typ), parameter :: two=2.0_r_typ
+!
+!-----------------------------------------------------------------------
+!
+! ****** Global Br boundary arrays.
+!
+      real(r_typ), dimension(:,:), allocatable :: br0_g
+      real(r_typ), dimension(:,:), allocatable :: br1_g
+!
+!-----------------------------------------------------------------------
+!
+      integer :: j,k
+      real(r_typ) :: sum00,sum11
+!
+!-----------------------------------------------------------------------
+!
+! ****** Define the global boundary condition arrays.
+!
+      allocate (br0_g(nt_g,np_g))
+      allocate (br1_g(nt_g,np_g))
+!
+! ****** Set the tilted dipole analytic Br.
+!
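+! ****** For a unit dipole tilted by DIPOLE_ANGLE toward phi=0, the
+! ****** analytic radial field at radius RV is (a sketch of the
+! ****** identity evaluated below at the boundary radii R0 and R1):
+! ******
+! ******    Br = (2/RV**3)*( cos(theta)*cos(dipole_angle)
+! ******                    +sin(theta)*cos(phi)*sin(dipole_angle))
+!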
+      do j=1,nt_g
+        do k=1,np_g
+          br0_g(j,k)=(two/r0**3)*(cth_g(j)*cos(dipole_angle) + &
+                     sth_g(j)*cph_g(k)*sin(dipole_angle))
+          br1_g(j,k)=(two/r1**3)*(cth_g(j)*cos(dipole_angle) + &
+                     sth_g(j)*cph_g(k)*sin(dipole_angle))
+        enddo
+      enddo
+!
+! ****** Set Br to be periodic (should not be necessary?).
+!
+      br0_g(:,1)=br0_g(:,npm1_g)
+      br0_g(:,np_g)=br0_g(:,2)
+      br1_g(:,1)=br1_g(:,npm1_g)
+      br1_g(:,np_g)=br1_g(:,2)
+!
+! ****** Set BCs at the poles.
+!
+      sum00=sum(br0_g(     2,2:npm1_g)*dph_g(2:npm1_g))*pl_i
+      sum11=sum(br0_g(ntm1_g,2:npm1_g)*dph_g(2:npm1_g))*pl_i
+!
+      br0_g(1   ,:)=two*sum00-br0_g(     2,:)
+      br0_g(nt_g,:)=two*sum11-br0_g(ntm1_g,:)
+!
+      sum00=sum(br1_g(     2,2:npm1_g)*dph_g(2:npm1_g))*pl_i
+      sum11=sum(br1_g(ntm1_g,2:npm1_g)*dph_g(2:npm1_g))*pl_i
+!
+      br1_g(1   ,:)=two*sum00-br1_g(     2,:)
+      br1_g(nt_g,:)=two*sum11-br1_g(ntm1_g,:)
+!
+      do j=1,nt
+        do k=1,np
+          br0(j,k)=br0_g(j0_g+j-1,k0_g+k-1)
+          br1(j,k)=br1_g(j0_g+j-1,k0_g+k-1)
+        enddo
+      enddo
+!
+      deallocate(br0_g)
+      deallocate(br1_g)
+!
+end subroutine
+!#######################################################################
+subroutine write_validation_solution
+!
+!-----------------------------------------------------------------------
+!
+! ****** Write out the tilted dipole analytic solution for validation.
+! ****** This writes out phi, B, and the B computed from phi.
+!
+!-----------------------------------------------------------------------
+!
+      use number_types
+      use global_dims
+      use global_mesh
+      use local_dims
+      use local_mesh
+      use fields
+      use vars
+      use mpidefs
+!
+!-----------------------------------------------------------------------
+!
+      implicit none
+!
+!-----------------------------------------------------------------------
+!
+      real(r_typ), parameter :: one=1.0_r_typ
+      real(r_typ), parameter :: two=2.0_r_typ
+!
+!-----------------------------------------------------------------------
+!
+      integer :: i,j,k
+      character(256) :: fnamephi,fnamebr,fnamebt,fnamebp
+!
+!-----------------------------------------------------------------------
+!
+! ****** Set phi.
+!
+      do concurrent (k=1:np, j=1:nt, i=1:nr)
+        phi(i,j,k)=(-one/rh(i)**2)*(cth(j)*cos(dipole_angle)+ &
+                   sth(j)*cph(k)*sin(dipole_angle))
+      enddo
+!
+      call set_boundary_points (phi,one)
+      call seam_hhh (phi)
+!
+! ****** Set Br.
+!
+      do concurrent (k=1:np, j=1:nt, i=1:nrm1)
+        br(i,j,k)=(two/r(i)**3)*(cth(j)*cos(dipole_angle) + &
+                  sth(j)*cph(k)*sin(dipole_angle))
+      enddo
+!
+! ****** Set Bt.
+!
+      do concurrent (k=1:np, j=1:ntm1, i=1:nr)
+        bt(i,j,k)=(one/rh(i)**3)*(st(j)*cos(dipole_angle) - &
+                  ct(j)*cph(k)*sin(dipole_angle))
+      enddo
+!
+! ****** Set Bp.
+!
+      do concurrent (k=1:npm1, j=1:nt, i=1:nr)
+        bp(i,j,k)=(one/rh(i)**3)*(sph(k)*sin(dipole_angle))
+      enddo
+!
+! ****** Due to half-mesh trig issues on the periodic domain, we need
+! ****** to seam these even though seaming is not needed in getb().
+!
+      call seam_gen (br,nrm,nt,np)
+      call seam_gen (bt,nr,ntm,np)
+      call seam_gen (bp,nr,nt,npm)
+!
+! ****** Back up the output file names and set new ones so that the
+! ****** standard write routine can be reused.
+!
+      fnamephi=phifile
+      phifile='phi_exact.'//trim(fmt)
+      fnamebr=brfile
+      brfile='br_exact.'//trim(fmt)
+      fnamebt=btfile
+      btfile='bt_exact.'//trim(fmt)
+      fnamebp=bpfile
+      bpfile='bp_exact.'//trim(fmt)
+!
+      call write_solution
+!
+! ****** Restore original output names.
+!
+      phifile=fnamephi
+      brfile=fnamebr
+      btfile=fnamebt
+      bpfile=fnamebp
+!
+! ****** Reset phi.
+!
+      phi(:,:,:)=0.
+!
+end subroutine +!####################################################################### +subroutine solve (x,rhs,N,ierr) +! +!----------------------------------------------------------------------- +! +! ****** Solve the implicit equations iteratively. +! +!----------------------------------------------------------------------- +! +! ****** Return IERR=0 if the iteration converges; otherwise, +! ****** IERR is set to a nonzero value. +! +! ****** X is the initial guess at the solution. +! ****** RHS is the right-hand side. +! +!----------------------------------------------------------------------- +! + use number_types + use cgcom + use mpidefs +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: N + real(r_typ), dimension(N) :: x,rhs + integer :: ierr +! +!----------------------------------------------------------------------- +! +! ****** Solve the equations using the CG method. +! + call cgsolve (x,rhs,N,ierr) +! +! ****** Check for convergence. +! + if (ierr.ne.0) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in SOLVE:' + write (*,*) '### The field solver did not converge.' + write (*,*) 'IERR = ',ierr + write (*,100) ncg,epsn + 100 format (1x,'N = ',i6,' EPSN = ',1pe13.6) + end if + else + if (iamp0) then + write (*,*) + write (*,*) '### COMMENT from SOLVE:' + write (*,*) '### The field solver converged.' + write (*,*) 'Number of iterations = ',ncg + write (9,*) + write (9,*) '### COMMENT from SOLVE:' + write (9,*) '### The field solver converged.' + write (9,*) 'Number of iterations = ',ncg + end if + end if +! +end subroutine +!####################################################################### +subroutine cgsolve (x,r,N,ierr) +! +!----------------------------------------------------------------------- +! +! ****** Solve the linear system: +! +! A * x = b +! +! ****** using the classical Conjugate Gradient method for symmetric +! ****** and positive-definite matrices. +! +!----------------------------------------------------------------------- +! +! ****** On input, X(N) contains a guess at the solution, and +! ****** R(N) contains the right-hand side, b. +! +! ****** On exit, X contains an estimate to the solution, and +! ****** R contains the residual (b-Ax). +! +! ****** IERR=0 indicates that the solution converged to the +! ****** requested accuracy. Other values indicate that the +! ****** iteration did not converge for the given maximum number +! ****** of iterations. +! +!----------------------------------------------------------------------- +! + use number_types + use cgcom +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + integer :: N + real(r_typ), dimension(N) :: x,r + integer :: ierr,i +! +!----------------------------------------------------------------------- +! +! ****** Scratch space for the CG iteration vectors. +! + real(r_typ), dimension(N), target :: p,ap +! +!----------------------------------------------------------------------- +! + real(r_typ), external :: cgdot +! +!----------------------------------------------------------------------- +! + real(r_typ) :: bdotb,rdotr,pdotap,alphai,rdotr_old,betai +! +!----------------------------------------------------------------------- +! + ncg=0 +! +! ****** Get the norm of the RHS. +! + do concurrent (i=1:N) + p(i)=r(i) + enddo +! 
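+! ****** Here P holds a copy of the RHS; applying the preconditioner
+! ****** to P and dotting with R yields the weighted norm
+! ****** BDOTB = b'*inv(M)*b.  The residual norm uses the same
+! ****** weighting, so the convergence test in ERNORM compares like
+! ****** quantities.
+!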
+ call prec_inv (p) + bdotb=cgdot(r,p,N) +! +! ****** If the RHS is zero, return with a zero solution. +! + if (bdotb.eq.0.) then + do concurrent (i=1:N) + x(i)=0. + enddo + epsn=0. + ierr=0 + return + end if +! +!----------------------------------------------------------------------- +! ****** Initialization. +!----------------------------------------------------------------------- +! + call ax (x,ap,N) +! + do concurrent (i=1:N) + r(i)=r(i)-ap(i) + p(i)=r(i) + enddo +! +! ****** Find the initial error norm. +! + call prec_inv (p) + rdotr=cgdot(r,p,N) +! + call ernorm (bdotb,rdotr,ierr) + if (ierr.ge.0) return +! +!----------------------------------------------------------------------- +! ****** Main iteration loop. +!----------------------------------------------------------------------- +! + do + ncg=ncg+1 +! + call ax (p,ap,N) +! + pdotap=cgdot(p,ap,N) + alphai=rdotr/pdotap +! + do concurrent (i=1:N) + x(i)=x(i)+alphai*p(i) + r(i)=r(i)-alphai*ap(i) + ap(i)=r(i) + enddo +! + call prec_inv (ap) + rdotr_old=rdotr + rdotr=cgdot(r,ap,N) +! +! ****** Check for convergence. +! + call ernorm (bdotb,rdotr,ierr) + if (ierr.ge.0) exit +! + betai=rdotr/rdotr_old +! + do concurrent (i=1:N) + p(i)=betai*p(i)+ap(i) + enddo +! + enddo +! +end subroutine +!####################################################################### +subroutine ernorm (bdotb,rdotr,ierr) +! +!----------------------------------------------------------------------- +! +! ****** This subroutine checks if the iterative solver has +! ****** converged or if the maximum allowed number of iterations, +! ****** NCGMAX, has been exceeded. +! +!----------------------------------------------------------------------- +! +! ****** Convergence is deemed to have occurred when: +! ****** +! ****** ||R||/||B|| .lt. EPSCG +! ****** +! ****** where ||R|| is the norm of the (preconditioned) +! ****** residual, ||B|| is the norm of the (preconditioned) +! ****** RHS, and EPSCG is the specified convergence criterion. +! +! ****** Set IERR=0 if the error is below the error criterion +! ****** (i.e., the solution has converged). +! ****** Set IERR=-1 if the error does not yet meet the error +! ****** criterion and the number of iterations is less than NCGMAX. +! ****** Set IERR=1 if the maximum number of iterations has +! ****** been exceeded without convergence. +! +!----------------------------------------------------------------------- +! +! ****** On input, BDOTB has the dot product of the RHS vector +! ****** with itself, weighted by the preconditioning matrix. +! ****** Similarly, RDOTR has the dot product of the residual vector +! ****** with itself, weighted by the preconditioning matrix. +! ****** This is used to normalize the error estimate. +! +!----------------------------------------------------------------------- +! + use number_types + use cgcom + use mpidefs + use vars +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ) :: bdotb,rdotr + integer :: ierr +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: one=1._r_typ +! +!----------------------------------------------------------------------- +! + real(r_typ) :: epssq +! +!----------------------------------------------------------------------- +! + ierr=-1 +! + epssq=rdotr/bdotb + epsn=sign(one,epssq)*sqrt(abs(epssq)) +! + if (ncghist.gt.0) then +! 
+ if (mod(ncg,ncghist).eq.0) then + if (iamp0) then + if (ncg.eq.0) then + write (9,*) + write (9,*) '### Convergence information:' + write (*,*) + end if + write (*,100) ncg,epsn + FLUSH(OUTPUT_UNIT) + write (9,100) ncg,epsn + FLUSH(9) + 100 format (1x,'Iteration: ',i8,' Residual: ',1pe23.15) + end if + end if + end if +! +! ****** Check for convergence. +! + if (epsn.lt.epscg) then + if (ncghist.gt.0) then + if (iamp0) then + write (*,100) ncg,epsn + FLUSH(OUTPUT_UNIT) + write (9,*) + write (9,*) '### The CG solver has converged.' + write (9,100) ncg,epsn + FLUSH(9) + end if + end if + ierr=0 + else if (ncg.ge.ncgmax) then + if (iamp0) then + write (*,*) + write (*,*) '### ERROR in ERNORM:' + write (*,*) '### Exceeded maximum number of iterations.' + write (*,*) 'NCGMAX = ',ncgmax + write (*,*) 'EPSN = ',epsn + FLUSH(OUTPUT_UNIT) + write (9,*) + write (9,*) '### ERROR in ERNORM:' + write (9,*) '### Exceeded maximum number of iterations.' + write (9,*) 'NCGMAX = ',ncgmax + write (9,*) 'EPSN = ',epsn + FLUSH(9) + end if + ierr=1 + end if +! +end subroutine +!####################################################################### +subroutine alloc_pot3d_matrix_coefs +! +!----------------------------------------------------------------------- +! +! ****** Allocate the arrays in which the matrix coefficients +! ****** for the pot3d solve are stored. +! +!----------------------------------------------------------------------- +! + use matrix_storage_pot3d_solve + use cgcom + use local_dims +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + allocate (a(2:nrm1,2:ntm1,2:npm1,7)) + a=0. + allocate (a_i(N)) + a_i=0. +! + if (ifprec.eq.2) then + allocate (a_csr(M)) + allocate (lu_csr(M)) + allocate (lu_csr_ja(M)) + allocate (a_csr_ja(M)) + allocate (a_csr_ia(1+N)) + allocate (a_N1(N)) + allocate (a_N2(N)) + allocate (a_csr_d(N)) + allocate (a_csr_dptr(N)) + endif +! +end subroutine +!####################################################################### +subroutine dealloc_pot3d_matrix_coefs +! +!----------------------------------------------------------------------- +! +! ****** Deallocate the arrays in which the matrix coefficients +! ****** for the pot3d solve are stored. +! +!----------------------------------------------------------------------- +! + use matrix_storage_pot3d_solve + use cgcom + use cusparse_interface +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + deallocate (a) + deallocate (a_i) +! + if (ifprec.eq.2) then +#ifdef CUSPARSE + call unload_lusol_cusparse +#endif + deallocate (a_csr) + deallocate (lu_csr) + deallocate (lu_csr_ja) + deallocate (a_csr_ia) + deallocate (a_csr_ja) + deallocate (a_csr_d) + deallocate (a_N1) + deallocate (a_N2) + deallocate (a_csr_dptr) + endif +! +end subroutine +!####################################################################### +subroutine load_matrix_pot3d_solve +! +!----------------------------------------------------------------------- +! +! ****** Load the matrix coefficients for the pot3d solve. +! +!----------------------------------------------------------------------- +! + use number_types + use matrix_storage_pot3d_solve + use local_dims + use local_mesh +! +!----------------------------------------------------------------------- +! + implicit none +! 
+!----------------------------------------------------------------------- +! + integer :: i,j,k +! +!----------------------------------------------------------------------- +! +! ****** Set matrix coefs +! + do k=2,npm1 + do j=2,ntm1 + do i=2,nrm1 +! a*ps(i,j,k-1): + a(i,j,k,1)=-drh(i)*dth(j)*sth_i(j)*dp_i(k-1) +! a*ps(i,j-1,k): + a(i,j,k,2)=-drh(i)*dph(k)*st(j-1)*dt_i(j-1) +! a*ps(i-1,j,k): + a(i,j,k,3)=-sth(j)*dth(j)*dph(k)*r2(i-1)*dr_i(i-1) +! a*ps(i+1,j,k): + a(i,j,k,5)=-sth(j)*dth(j)*dph(k)*r2(i )*dr_i(i ) +! a*ps(i,j+1,k): + a(i,j,k,6)=-drh(i)*dph(k)*st(j )*dt_i(j ) +! a*ps(i,j,k+1): + a(i,j,k,7)=-drh(i)*dth(j)*sth_i(j)*dp_i(k ) +! +! a*ps(i,j,k): + a(i,j,k,4)=-(a(i,j,k,1)+a(i,j,k,2)+a(i,j,k,3)+ & + a(i,j,k,5)+a(i,j,k,6)+a(i,j,k,7)) + enddo + enddo + enddo +! +end subroutine +!####################################################################### +subroutine load_preconditioner_pot3d_solve +! +!----------------------------------------------------------------------- +! +! ****** Load the preconditioner for the pot3d solve. +! +!----------------------------------------------------------------------- +! + use number_types + use matrix_storage_pot3d_solve + use cgcom + use local_dims + use, intrinsic :: iso_c_binding + use cusparse_interface + use timing +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: one=1._r_typ +! +!----------------------------------------------------------------------- +! + integer :: i,j,k,icode,ii +! +!----------------------------------------------------------------------- +! + call timer_on +! + if (ifprec.eq.0) return +! + if (ifprec.eq.1) then +! +! ****** Diagonal scaling: +! + ii=0 + do k=2,npm1 + do j=2,ntm1 + do i=2,nrm1 + ii=ii+1 + a_i(ii)=real(one/a(i,j,k,4),r_typ_pc) + enddo + enddo + enddo +! + elseif (ifprec.eq.2) then +! +! ****** Convert A matrix into CSR format: +! + call diacsr (N,M,a,a_offsets,a_csr,a_csr_ja,a_csr_ia,a_csr_dptr) +#ifdef CUSPARSE + cN=N + cM=M + call load_lusol_cusparse (C_LOC(a_csr(1)), & + C_LOC(a_csr_ia(1)), & + C_LOC(a_csr_ja(1)),cN,cM) +#else +! +! ****** Overwrite CSR A with preconditioner L and U matrices: +! +! ****** Incomplete LU (ILU) +! + icode=0 + call ilu0 (N,M,a_csr,a_csr_ja,a_csr_ia,a_csr_dptr,icode) +! + if (icode.ne.0) then + print*, '### ERROR IN ILU FORMATION' + endif +! +! ****** Convert LU stored in A to LU matrix in optimized layout. +! + call lu2luopt (N,M,lu_csr,a_csr,a_csr_ia,a_csr_ja,lu_csr_ja, & + a_csr_dptr,a_N1,a_N2) +! +! ****** Store inverse of diagonal of LU matrix. +! + do i=1,N + a_csr_d(i)=one/a_csr(a_csr_dptr(i)) + enddo +! +#endif + endif +! + call timer_off (t_pc_load) +! +end subroutine +!####################################################################### +subroutine ilu0 (N,M,A,JA,IA,A_da,icode) +! +!----------------------------------------------------------- +! +! Set-up routine for ILU(0) preconditioner. This routine +! computes the L and U factors of the ILU(0) factorization +! of a general sparse matrix A stored in CSR format with +! 1-based indexing. Since +! L is unit triangular, the L and U factors can be stored +! as a single matrix which occupies the same storage as A. +! New ja and ia arrays are not needed for the LU matrix +! since the pattern of the LU matrix is identical with +! that of A. +! +! Original Author: Yousef Saad +! Iterative Methods for Sparse Linear Systems 2nd Ed. pg. 309 +! Modified by R.M. Caplan +! 
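+! A rough sketch of the factorization implemented below (only
+! entries already present in the sparsity pattern of A are kept,
+! which is what makes this ILU(0) rather than a full LU):
+!
+!   do i=2,N
+!     do k=1,i-1, restricted to nonzeros (i,k) of row i:
+!       A(i,k)=A(i,k)/A(k,k)
+!       do j=k+1,N, restricted to nonzeros (k,j) of row k:
+!         if ((i,j) is in the pattern) A(i,j)=A(i,j)-A(i,k)*A(k,j)
+!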
+!-----------------------------------------------------------
+! INPUT:
+!   N        : Dimension of matrix
+!   A, JA, IA: Sparse matrix in CSR sparse storage format
+!   A_da     : Pointers to the diagonal elements in the CSR
+!              data structure A
+!
+! OUTPUT:
+!   A        : L/U matrices stored together.  On return A,
+!              JA, and IA are the combined CSR data structure for
+!              the L and U factors.
+!   icode    : Integer indicating error code on return:
+!              (0): Normal return.
+!              (k): Encountered a zero pivot at step k.
+!------------------------------------------------------------
+!
+      use number_types
+      use number_types_pc
+!
+!-----------------------------------------------------------------------
+!
+      implicit none
+!
+!-----------------------------------------------------------------------
+!
+      integer :: JA(M),IA(N+1),A_da(N),iw(N)
+      integer :: icode,N,M
+      real(r_typ_pc) :: A(M)
+!
+!-----------------------------------------------------------------------
+!
+      integer :: i,ik,kj,k,ij,IA_i,IA_ip1m1
+      real(r_typ_pc) :: Aik
+!
+!-----------------------------------------------------------------------
+!
+      icode=0
+! Initialize scratch index array:
+      iw(:)=0
+!
+      do i=2,N
+! Store the index of (i,j) in A in the scratch array iw(j=1:N).
+! This allows lookup, given a column index (j) in row (k),
+! to see if the column is in row (i).
+        IA_i    =IA(i)
+        IA_ip1m1=IA(i+1)-1
+!
+        do ij=IA_i,IA_ip1m1
+          iw(JA(ij))=ij
+        enddo
+!
+! Loop from first element in row i to 1 less than diagonal elem:
+        do ik=IA_i,A_da(i)-1      !ik is index of (i,k) in A[]
+          k  =JA(ik)              !Actual column index in matrix (k)
+          Aik=A(ik)/A(A_da(k))    !Save Aik for next loop as an optim.
+          A(ik)=Aik
+!
+! Loop from 1 more than diag elem to last elem in row k:
+          do kj=A_da(k)+1,IA(k+1)-1   !kj is index of (k,j) in A[]
+! Get ij location from scratch array (if 0, no ij present)
+            ij=iw(JA(kj))
+            if (ij .ne. 0) then
+              A(ij)=A(ij)-Aik*A(kj)
+            endif
+          enddo
+        enddo
+!
+! Check the pivot for zero (after the loop above, ik points to the
+! diagonal element of row i):
+        if (A(ik).eq.0) then
+          icode=i
+          exit
+        endif
+!
+! Reset scratch index array:
+        do ij=IA_i,IA_ip1m1
+          iw(JA(ij))=0
+        enddo
+      enddo
+!
+end subroutine
+!#######################################################################
+subroutine lu2luopt (N,M,LU,A,IA,JA,LUJA,A_da,N1,N2)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Re-order elements of the LU matrix in CSR format into a
+! ****** custom, optimized format for use with lusol().
+! ****** (Eventually, this could be merged with ilu0 and/or diacsr.)
+!
+!-----------------------------------------------------------------------
+!
+      use number_types
+      use number_types_pc
+!
+!-----------------------------------------------------------------------
+!
+      implicit none
+!
+!-----------------------------------------------------------------------
+!
+      integer :: N,M
+      integer :: JA(M),LUJA(M),IA(N+1),A_da(N)
+      integer :: N1(N),N2(N)
+      real(r_typ_pc) :: A(M),LU(M)
+!
+!-----------------------------------------------------------------------
+!
+      integer :: i,k,ii
+!
+!-----------------------------------------------------------------------
+!
+      ii=0
+!
+      do i=1,N
+        do k=IA(i),A_da(i)-1
+          ii=ii+1
+          LU(ii)=A(k)
+          LUJA(ii)=JA(k)
+        enddo
+!
+! Store k1 and k2 ranges for lusol:
+!
+        N1(i)=A_da(i)-1-IA(i)
+        N2(i)=IA(i+1)-2-A_da(i)
+      enddo
+!
+      do i=N,1,-1
+        do k=A_da(i)+1,IA(i+1)-1
+          ii=ii+1
+          LU(ii)=A(k)
+          LUJA(ii)=JA(k)
+        enddo
+      enddo
+!
+end subroutine
+!#######################################################################
+subroutine diacsr (N,M,Adia,ioff,Acsr,JA,IA,Adptr)
+!
+!-----------------------------------------------------------------------
+!
+! *** DIACSR converts a solver matrix in a MAS-style
+!     diagonal format to standard compressed sparse row (CSR)
+!     format, including periodic coefficients when nproc_p=1.
+!
+!     Author of original diacsr: Youcef Saad
+!     Modifications for MAS: RM Caplan
+!
+!     Input:
+!        N:              Size of the matrix (NxN)
+!        M:              Number of non-zero entries in matrix
+!                        (computed with getM())
+!        Adia(N,IDIAG):  The matrix in modified "DIA" format
+!        ioff(IDIAG):    Offsets of the diagonals in A.
+!
+!     Output:
+!        Acsr(M), JA(M), IA(N+1): The matrix A in CSR.
+!        Adptr(N):       Pointers to diag elements in A,
+!                        [e.g. A(i,i) == A(Adptr(i))]
+!
+!-----------------------------------------------------------------------
+!
+      use number_types
+      use number_types_pc
+      use local_dims
+      use mpidefs
+!
+!-----------------------------------------------------------------------
+!
+      implicit none
+!
+!-----------------------------------------------------------------------
+!
+      integer, parameter :: IDIAG=7
+!
+!-----------------------------------------------------------------------
+!
+      real (r_typ_pc) :: Acsr(M)
+      real (r_typ) :: Adia(N,IDIAG)
+      integer :: N,M
+      integer :: Adptr(N)
+      integer :: IA(N+1)
+      integer :: JA(M)
+      integer :: ioff(IDIAG)
+!
+!-----------------------------------------------------------------------
+!
+      integer :: i,j,jj,mi,mj,mk,ko,x
+      integer :: ioffok(IDIAG)
+!
+!-----------------------------------------------------------------------
+!
+      x=0
+!
+      IA(1)=1
+      ko=1
+      i=0
+!
+      do mk=2,npm1
+        do mj=2,ntm1
+          do mi=2,nrm1
+! ********* Set index of value and column indices array:
+            i=i+1
+!
+! ********* Do not add coefs that multiply boundaries:
+!           For each boundary, there is a sub-set of coefs in the
+!           matrix row that should not be added.
+!           This makes "local" matrices have no bc info.
+!
+! ********* Reset "i-offset-ok-to-use-coef-jj" array:
+!
+            ioffok(:)=1
+!
+            if (mi.eq.2) then
+              ioffok(3)=0
+            endif
+!
+            if (mi.eq.nrm1) then
+              ioffok(5)=0
+            endif
+!
+            if (mj.eq.2) then
+              ioffok(2)=0
+            endif
+!
+            if (mj.eq.ntm1) then
+              ioffok(6)=0
+            endif
+!
+! ********* Eliminate periodic coefs in the case nproc_p>1:
+!
+            if (nproc_p.gt.1) then
+              if (mk.eq.2) then
+                ioffok(1)=0
+              endif
+              if (mk.eq.npm1) then
+                ioffok(7)=0
+              endif
+            endif
+!
+! ********* To handle the periodicity of phi in the nproc_p=1 case,
+!           the CSR matrix must stay in column order, so the
+!           coefficients are swept in three passes to avoid sorting:
+!
+! ********* Add periodic coefs of "right side":
+!
+            do jj=1,IDIAG
+              if (ioffok(jj).eq.1) then
+                j=i+ioff(jj)-x
+                if (j.gt.N-x) then
+                  j=j-N
+                  Acsr(ko)=real(Adia(i,jj),r_typ_pc)
+                  JA(ko)=j
+                  ko=ko+1
+                endif
+              endif
+            enddo
+!
+! ********* Now do non-periodic coefs:
+!
+            do jj=1,IDIAG
+              if (ioffok(jj).eq.1) then
+                j=i+ioff(jj)-x
+                if (j.ge.1.and.j.le.N-x) then
+! Store pointer to diagonal elements in A:
+                  if (jj.eq.4) Adptr(i)=ko
+                  Acsr(ko)=real(Adia(i,jj),r_typ_pc)
+                  JA(ko)=j
+                  ko=ko+1
+                endif
+              endif
+            enddo
+!
+! ********* Now do periodic coefs of "left side":
+!
+            do jj=1,IDIAG
+              if (ioffok(jj).eq.1) then
+                j=i+ioff(jj)-x
+                if (j.lt.1) then
+                  j=N+j
+                  Acsr(ko)=real(Adia(i,jj),r_typ_pc)
+                  JA(ko)=j
+                  ko=ko+1
+                endif
+              endif
+            enddo
+!
+! ********* Set row offset:
+!
+            IA(i+1)=ko-x
+          enddo
+        enddo
+      enddo
+!
+end subroutine
+!#######################################################################
+subroutine getM (N, ioff, M)
+!
+!-----------------------------------------------------------------------
+!
+! *** This routine computes the number of non-zeros in the
+!     solver matrix for use in allocating the matrices.
+!     See diacsr() for a description of the inputs.
+!
+!     Output:  M  # of nonzeros.
+!
+!-----------------------------------------------------------------------
+!
+      use mpidefs
+      use local_dims
+!
+!-----------------------------------------------------------------------
+!
+      implicit none
+!
+!-----------------------------------------------------------------------
+!
+      integer, parameter :: IDIAG=7
+      integer :: N,M,i,j,jj,ko,mi,mj,mk,x
+      integer :: ioff(IDIAG)
+      integer :: ioffok(IDIAG)
+!
+      x=0
+!
+      ko=1
+      i=0
+!
+      do mk=2,npm1
+        do mj=2,ntm1
+          do mi=2,nrm1
+!
+            ioffok(:)=1
+!
+            if (mi.eq.2) then
+              ioffok(3)=0
+            endif
+!
+            if (mi.eq.nrm1) then
+              ioffok(5)=0
+            endif
+!
+            if (mj.eq.2) then
+              ioffok(2)=0
+            endif
+!
+            if (mj.eq.ntm1) then
+              ioffok(6)=0
+            endif
+!
+! ********* Eliminate periodic coefs in the case nproc_p>1:
+!
+            if (nproc_p.gt.1) then
+              if (mk.eq.2) then
+                ioffok(1)=0
+              endif
+              if (mk.eq.npm1) then
+                ioffok(7)=0
+              endif
+            endif
+!
+            do jj=1,IDIAG
+              if (ioffok(jj).eq.1) then
+                j=i+ioff(jj)-x
+                if (j.gt.N-x) then
+                  ko=ko+1
+                endif
+              endif
+            enddo
+!
+            do jj=1,IDIAG
+              if (ioffok(jj).eq.1) then
+                j=i+ioff(jj)-x
+                if (j.ge.1.and.j.le.N-x) then
+                  ko=ko+1
+                endif
+              endif
+            enddo
+!
+            do jj=1,IDIAG
+              if (ioffok(jj).eq.1) then
+                j=i+ioff(jj)-x
+                if (j.lt.1) then
+                  ko=ko+1
+                endif
+              endif
+            enddo
+          enddo
+        enddo
+      enddo
+!
+! *** Save number of non-zeros of matrix:
+!
+      M=ko-1
+!
+end subroutine
+!#######################################################################
+subroutine ax (x,y,N)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Set y = A * x.
+!
+!-----------------------------------------------------------------------
+!
+      use number_types
+      use local_dims
+      use fields, ONLY : x_ax
+      use timing
+!
+!-----------------------------------------------------------------------
+!
+      implicit none
+!
+!-----------------------------------------------------------------------
+!
+      real(r_typ), parameter :: zero=0._r_typ
+!
+!-----------------------------------------------------------------------
+!
+      integer :: N
+      real(r_typ), dimension(N) :: x,y
+!
+!-----------------------------------------------------------------------
+!
+! ****** Expand X array to allow for boundary and seam values.
+!
+      call unpack_scalar (x_ax,x)
+!
+! ****** Set the boundary values of X.
+!
+      call set_boundary_points (x_ax,zero)
+!
+! ****** Seam along edges between processors.
+!
+      call seam_hhh (x_ax)
+!
+! ****** Get the matrix-vector product.
+!
+      call timer_on
+!
+      call delsq (x_ax,y)
+!
+      call timer_off (t_ax)
+!
+end subroutine
+!#######################################################################
+subroutine prec_inv (x)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Apply preconditioner:  x := M(inv) * x.
+!
+!-----------------------------------------------------------------------
+!
+      use number_types
+      use number_types_pc
+      use cgcom
+      use solve_params
+      use matrix_storage_pot3d_solve
+      use, intrinsic :: iso_c_binding
+      use cusparse_interface
+      use timing
+!
+!-----------------------------------------------------------------------
+!
+      implicit none
+!
+!-----------------------------------------------------------------------
+!
+      real(r_typ), dimension(N) :: x
+      real(r_typ_pc), dimension(N) :: x_32
+      integer :: i
+!
+!-----------------------------------------------------------------------
+!
+      if (ifprec.eq.0) return
+!
+      call timer_on
+!
+      if (ifprec.eq.1) then
+!
+! ****** Point-Jacobi (diagonal scaling):
+!
+        do concurrent (i=1:N)
+          x(i)=a_i(i)*x(i)
+        enddo
+!
+      elseif (ifprec.eq.2) then
+!
+! ****** ILU0 Partial-Block-Jacobi:
+!
+#ifdef CUSPARSE
+        call lusol_cusparse(C_LOC(x(1)))
+!
+#else
+!
+! ****** Convert input array to single precision.
+!
+        do i=1,N
+          x_32(i) = real(x(i),r_typ_pc)
+        enddo
+!
+        call lusol (N,M,x_32,lu_csr,lu_csr_ja,a_N1,a_N2,a_csr_d)
+!
+! ****** Convert result back to double precision.
+!
+        do i=1,N
+          x(i) = real(x_32(i),r_typ)
+        enddo
+!
+#endif
+      endif
+!
+      call timer_off (t_pc)
+!
+end subroutine
+!#######################################################################
+subroutine lusol (N,M,x,LU,LU_ja,N1,N2,LUd_i)
+!
+!-----------------------------------------------------------
+!
+! Performs a forward and a backward solve for the sparse system
+! (LU) x=y where LU is in an optimized custom CSR format
+! (see lu2luopt()).
+!
+! For use where LU is an ILU or SSOR/SGS factorization.
+!
+! Author of original lusol: Yousef Saad
+! Iterative Methods for Sparse Linear Systems 2nd Ed. pg. 299
+!
+! Modified by RM Caplan to include optimized memory access
+! as described in
+! B. Smith, H. Zhang  Inter. J. of High Perf. Comp. Appl.
+! Vol. 25 #4 pg. 386-391 (2011)
+!
+!-----------------------------------------------------------
+! PARAMETERS:
+!   N     : Dimension of problem
+!   x     : At input, x is rhs (y); at output, x is the solution.
+!   LU    : Values of the LU matrix.  L and U are stored together
+!           in order of access in this routine.
+!   LU_ja : Column indices of elements in LU.
+!   N1    : Per-row counts (minus one) of the L entries, used as
+!           loop bounds in the forward solve.
+!   N2    : Per-row counts (minus one) of the off-diagonal U
+!           entries, used as loop bounds in the backward solve.
+!   LUd_i : Inverse diagonal elements of U
+!------------------------------------------------------------
+!
+      use number_types
+      use number_types_pc
+!
+!-----------------------------------------------------------------------
+!
+      implicit none
+!
+!-----------------------------------------------------------------------
+!
+      real(r_typ_pc) :: x(N),LUd_i(N),LU(M)
+      integer :: N1(N),N2(N),LU_ja(M)
+      integer :: N,M
+!
+!-----------------------------------------------------------------------
+!
+      integer :: i,k,k1,k2
+!
+!-----------------------------------------------------------------------
+!
+! ****** FORWARD SOLVE:  Solve L x'=y
+!
+      k2=0
+      do i=1,N
+! Compute x(i) := x(i) - sum L(i,j) * x(j)
+        k1=k2+1
+        k2=k1+N1(i)
+        do k=k1,k2
+          x(i)=x(i)-LU(k)*x(LU_ja(k))
+        enddo
+! The diagonal of L is always 1, so no division is necessary here.
+      enddo
+!
+! ****** BACKWARD SOLVE:  Solve U x=x'
+!
+      do i=N,1,-1
+! Compute x(i) := x(i) - sum U(i,j) * x(j)
+        k1=k2+1
+        k2=k1+N2(i)
+        do k=k1,k2
+          x(i)=x(i)-LU(k)*x(LU_ja(k))
+        enddo
+! Compute x(i) := x(i) / U(i,i)
+        x(i)=x(i)*LUd_i(i)
+      enddo
+!
+end subroutine
+!#######################################################################
+subroutine unpack_scalar (s,x)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Unpack the 1D solver vector x into the three-dimensional
+! ****** array s, leaving room for the boundary points.
+!
+!-----------------------------------------------------------------------
+!
+      use number_types
+      use local_dims
+!
+!-----------------------------------------------------------------------
+!
+      implicit none
+!
+!-----------------------------------------------------------------------
+!
+      real(r_typ), dimension(nr,nt,np) :: s
+      real(r_typ), dimension(2:nrm1,2:ntm1,2:npm1) :: x
+!
+!-----------------------------------------------------------------------
+!
+      integer :: i,j,k
+!
+!----------------------------------------------------------------------- +! + do concurrent (k=2:npm1, j=2:ntm1, i=2:nrm1) + s(i,j,k)=x(i,j,k) + enddo +! +end subroutine +!####################################################################### +subroutine delsq (x,y) +! +!----------------------------------------------------------------------- +! +! ****** Set Y = - (dV * del-squared X) at the internal points. +! +!----------------------------------------------------------------------- +! + use number_types + use local_dims + use matrix_storage_pot3d_solve +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), dimension(nr,nt,np) :: x + real(r_typ), dimension(2:nrm1,2:ntm1,2:npm1) :: y +! +!----------------------------------------------------------------------- +! + integer :: i,j,k +! +!----------------------------------------------------------------------- +! + do concurrent (k=2:npm1, j=2:ntm1, i=2:nrm1) + y(i,j,k)=a(i,j,k,1)*x(i ,j ,k-1) & + +a(i,j,k,2)*x(i ,j-1,k ) & + +a(i,j,k,3)*x(i-1,j ,k ) & + +a(i,j,k,4)*x(i ,j ,k ) & + +a(i,j,k,5)*x(i+1,j ,k ) & + +a(i,j,k,6)*x(i ,j+1,k ) & + +a(i,j,k,7)*x(i ,j ,k+1) + enddo +! +end subroutine +!####################################################################### +subroutine set_boundary_points (x,vmask) +! +!----------------------------------------------------------------------- +! +! ****** Set boundary points of X at the physical boundaries. +! +!----------------------------------------------------------------------- +! + use number_types + use global_mesh + use local_dims + use local_mesh + use fields + use solve_params + use vars, ONLY : validation_run +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), dimension(nr,nt,np) :: x + real(r_typ) :: vmask,temp_sum0 +! +!----------------------------------------------------------------------- +! + real(r_typ), parameter :: two=2._r_typ +! +!----------------------------------------------------------------------- +! + integer :: i,j,k +! +!----------------------------------------------------------------------- +! +! ****** Set X at the radial boundaries. +! + if (rb0) then + do concurrent (k=2:npm1, j=2:ntm1) + x( 1,j,k)=x(2,j,k)-vmask*br0(j,k)*dr1 + enddo + end if +! + if (rb1) then + if (validation_run) then + do concurrent (k=2:npm1, j=2:ntm1) + x(nr,j,k)=x(nrm1,j,k)+vmask*br1(j,k)*drn + enddo + else + do concurrent (k=2:npm1, j=2:ntm1) + x(nr,j,k)=pm_r1*x(nrm1,j,k) + enddo + end if + end if +! +! ****** If this processor does not contain any points at the +! ****** pole, return. +! + if (.not.(tb0.or.tb1)) return +! +! ****** Get the m=0 component of X at the poles. +! + if (tb0) then + do concurrent (i=1:nr) + sum0(i)=0 + enddo + do concurrent (i=1:nr) + temp_sum0=0. + do concurrent (k=2:npm1) reduce(+:temp_sum0) + temp_sum0=temp_sum0+x(i,2,k)*dph(k)*pl_i + enddo + sum0(i)=temp_sum0 + enddo + end if +! + if (tb1) then + do concurrent (i=1:nr) + sum1(i)=0 + enddo + do concurrent (i=1:nr) + temp_sum0=0. + do concurrent (k=2:npm1) reduce(+:temp_sum0) + temp_sum0=temp_sum0+x(i,ntm1,k)*dph(k)*pl_i + enddo + sum1(i)=temp_sum0 + enddo + end if +! +! ****** Sum over all processors. +! + call sum_over_phi (nr,sum0,sum1) +! +! ****** Set X to have only an m=0 component at the poles. +! 
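+! ****** The assignments below make the average of the polar ghost
+! ****** value and the first interior value equal to the phi-averaged
+! ****** (m=0) component for every K, e.g. at theta=0:
+! ******
+! ******    (X(I,1,K)+X(I,2,K))/2 = SUM0(I),
+! ******
+! ****** which removes any m>0 content at the pole.
+!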
+ if (tb0) then
+ do concurrent (k=2:npm1, i=1:nr)
+ x(i,1,k)=two*sum0(i)-x(i,2,k)
+ enddo
+ end if
+!
+ if (tb1) then
+ do concurrent (k=2:npm1, i=1:nr)
+ x(i,nt,k)=two*sum1(i)-x(i,ntm1,k)
+ enddo
+ end if
+!
+ return
+end subroutine
+!#######################################################################
+subroutine sum_over_phi (n,a0,a1)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Sum the contribution over all processors in the phi
+! ****** dimension (only for processors with points on the poles).
+!
+! ****** The sum is performed for all N points in the vectors
+! ****** A0(N) and A1(N), at the North and South pole,
+! ****** respectively.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use local_dims
+ use mpidefs
+ use timing
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ integer :: n
+ real(r_typ), dimension(n) :: a0,a1
+!
+!-----------------------------------------------------------------------
+!
+! ****** MPI error return.
+!
+ integer :: ierr
+!
+!-----------------------------------------------------------------------
+!
+ call timer_on
+!
+ if (tb0) then
+ call MPI_Allreduce (MPI_IN_PLACE,a0,n,ntype_real, &
+ MPI_SUM,comm_phi,ierr)
+ end if
+!
+ if (tb1) then
+ call MPI_Allreduce (MPI_IN_PLACE,a1,n,ntype_real, &
+ MPI_SUM,comm_phi,ierr)
+ end if
+!
+ call timer_off (c_sumphi)
+!
+end subroutine
+!#######################################################################
+subroutine zero_boundary_points (x)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Set the boundary points at the physical boundaries
+! ****** of X to zero.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use local_dims
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ), dimension(nr,nt,np) :: x
+!
+!-----------------------------------------------------------------------
+!
+ if (rb0) x( 1,:,:)=0.
+ if (rb1) x(nr,:,:)=0.
+ if (tb0) x(:, 1,:)=0.
+ if (tb1) x(:,nt,:)=0.
+!
+end subroutine
+!#######################################################################
+function cgdot (x,y,N)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Get the dot product of the vectors X and Y.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ integer :: N,i
+ real(r_typ) :: cgdot
+ real(r_typ), dimension(N) :: x,y
+!
+!-----------------------------------------------------------------------
+!
+ cgdot=0.
+!
+ do concurrent (i=1:N) reduce(+:cgdot)
+ cgdot=cgdot+x(i)*y(i)
+ enddo
+!
+! ****** Sum over all the processors.
+!
+ call global_sum (cgdot)
+!
+ return
+end function
+!#######################################################################
+subroutine global_sum (x)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Overwrite X with its sum over all processors.
+!
+!-----------------------------------------------------------------------
+!
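+! ****** Note: this is a blocking collective over COMM_ALL; every
+! ****** rank must call it, and MPI_IN_PLACE is used so the summed
+! ****** value overwrites X on all ranks.
+!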
+ use number_types
+ use mpidefs
+ use timing
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ) :: x
+!
+!-----------------------------------------------------------------------
+!
+! ****** MPI error return.
+!
+ integer :: ierr
+!
+!-----------------------------------------------------------------------
+!
+ call timer_on
+!
+! ****** Take the sum over all the processors.
+!
+ call MPI_Allreduce (MPI_IN_PLACE,x,1,ntype_real, &
+ MPI_SUM,comm_all,ierr)
+!
+ call timer_off (c_cgdot)
+!
+end subroutine
+!#######################################################################
+subroutine seam_setup
+!
+!-----------------------------------------------------------------------
+!
+! ****** Allocate the seam buffers.
+! ****** Allocating them once here avoids repeated GPU
+! ****** allocations and deallocations.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use cgcom
+ use local_mesh
+ use local_dims
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ allocate(sbuf_rt1(nr,nt))
+ allocate(sbuf_rt2(nr,nt))
+ allocate(sbuf_tp1(nt,np))
+ allocate(sbuf_tp2(nt,np))
+ allocate(sbuf_rp1(nr,np))
+ allocate(sbuf_rp2(nr,np))
+!
+ allocate(rbuf_rt1(nr,nt))
+ allocate(rbuf_rt2(nr,nt))
+ allocate(rbuf_tp1(nt,np))
+ allocate(rbuf_tp2(nt,np))
+ allocate(rbuf_rp1(nr,np))
+ allocate(rbuf_rp2(nr,np))
+!
+end subroutine
+!#######################################################################
+subroutine seam_hhh (a)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Seam the boundary points of 3D (r,t,p) array A between
+! ****** adjacent processors.
+!
+! ****** This routine assumes that there is a two-point
+! ****** overlap between processors in each dimension.
+!
+!-----------------------------------------------------------------------
+!
+! ****** This version uses non-blocking MPI sends and receives
+! ****** whenever possible in order to overlap communications.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use mpidefs
+ use timing
+ use local_mesh
+ use local_dims
+ use cgcom
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ), dimension(nr,nt,np) :: a
+!
+!-----------------------------------------------------------------------
+!
+! ****** MPI error return.
+!
+ integer :: ierr
+!
+! ****** MPI tag for MPI_ISEND and MPI_IRECV (not tagged).
+!
+ integer :: tag=0
+!
+!-----------------------------------------------------------------------
+!
+ integer :: lbuf,i,j
+ integer :: reqs(4)
+!
+!-----------------------------------------------------------------------
+!
+ call timer_on
+!
+! ****** Seam the third (periodic) dimension.
+! ****** Since halo data is stride-1, no need for buffers.
+!
+ lbuf=nr*nt
+!
+ call MPI_Isend (a(:,:,np-1),lbuf,ntype_real,iproc_pp,tag, &
+ comm_all,reqs(1),ierr)
+!
+ call MPI_Isend (a(:,:, 2),lbuf,ntype_real,iproc_pm,tag, &
+ comm_all,reqs(2),ierr)
+!
+ call MPI_Irecv (a(:,:, 1),lbuf,ntype_real,iproc_pm,tag, &
+ comm_all,reqs(3),ierr)
+!
+ call MPI_Irecv (a(:,:,np),lbuf,ntype_real,iproc_pp,tag, &
+ comm_all,reqs(4),ierr)
+!
+ call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr)
+!
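+! ****** At this point a(:,:,1) holds the phi-minus neighbor's
+! ****** a(:,:,np-1) and a(:,:,np) holds the phi-plus neighbor's
+! ****** a(:,:,2), completing the periodic wrap in phi.
+!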
+! ****** Seam the first dimension. +! + if (nproc_r.gt.1) then +! + lbuf=nt*np +! + do concurrent (j=1:np, i=1:nt) + sbuf_tp1(i,j)=a(nr-1,i,j) + sbuf_tp2(i,j)=a( 2,i,j) + enddo +! + call MPI_Isend (sbuf_tp1,lbuf,ntype_real,iproc_rp,tag, & + comm_all,reqs(1),ierr) +! + call MPI_Isend (sbuf_tp2,lbuf,ntype_real,iproc_rm,tag, & + comm_all,reqs(2),ierr) +! + call MPI_Irecv (rbuf_tp1,lbuf,ntype_real,iproc_rm,tag, & + comm_all,reqs(3),ierr) +! + call MPI_Irecv (rbuf_tp2,lbuf,ntype_real,iproc_rp,tag, & + comm_all,reqs(4),ierr) +! + call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) +! + if (iproc_rm.ne.MPI_PROC_NULL) then + do concurrent (j=1:np, i=1:nt) + a( 1,i,j)=rbuf_tp1(i,j) + enddo + end if +! + if (iproc_rp.ne.MPI_PROC_NULL) then + do concurrent (j=1:np, i=1:nt) + a(nr,i,j)=rbuf_tp2(i,j) + enddo + end if + end if +! +! ****** Seam the second dimension. +! + if (nproc_t.gt.1) then +! + lbuf=nr*np +! + do concurrent (j=1:np, i=1:nr) + sbuf_rp1(i,j)=a(i,nt-1,j) + sbuf_rp2(i,j)=a(i, 2,j) + enddo +! + call MPI_Isend (sbuf_rp1,lbuf,ntype_real,iproc_tp,tag, & + comm_all,reqs(1),ierr) +! + call MPI_Isend (sbuf_rp2,lbuf,ntype_real,iproc_tm,tag, & + comm_all,reqs(2),ierr) +! + call MPI_Irecv (rbuf_rp1,lbuf,ntype_real,iproc_tm,tag, & + comm_all,reqs(3),ierr) +! + call MPI_Irecv (rbuf_rp2,lbuf,ntype_real,iproc_tp,tag, & + comm_all,reqs(4),ierr) +! + call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) +! + if (iproc_tm.ne.MPI_PROC_NULL) then + do concurrent (j=1:np, i=1:nr) + a(i, 1,j)=rbuf_rp1(i,j) + enddo + end if +! + if (iproc_tp.ne.MPI_PROC_NULL) then + do concurrent (j=1:np, i=1:nr) + a(i,nt,j)=rbuf_rp2(i,j) + enddo + end if +! + end if +! + call timer_off (c_seam) +! +end subroutine +!####################################################################### +subroutine seam_gen (a,n1,n2,n3) +! +!----------------------------------------------------------------------- +! +! ****** Seam the boundary points of 3D (r,t,p) array A between +! ****** adjacent processors. +! +! ****** This routine assumes that there is a two-point +! ****** overlap between processors in each dimension. +! +!----------------------------------------------------------------------- +! +! ****** This version uses non-blocking MPI sends and receives +! ****** whenever possible in order to overlap communications. +! +!----------------------------------------------------------------------- +! + use number_types + use mpidefs + use timing +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! + real(r_typ), dimension(n1,n2,n3) :: a +! +!----------------------------------------------------------------------- +! + real(r_typ), dimension(n2,n3) :: sbuf11,rbuf11 + real(r_typ), dimension(n2,n3) :: sbuf12,rbuf12 + real(r_typ), dimension(n1,n3) :: sbuf21,rbuf21 + real(r_typ), dimension(n1,n3) :: sbuf22,rbuf22 +! +!----------------------------------------------------------------------- +! +! ****** MPI error return. +! + integer :: ierr +! +! ****** MPI tag for MPI_ISEND and MPI_IRECV (not tagged). +! + integer :: tag=0 +! +!----------------------------------------------------------------------- +! + integer :: lbuf,i,j + integer :: n1,n2,n3 + integer :: reqs(4) +! +!----------------------------------------------------------------------- +! + call timer_on +! +! ****** Seam the third (periodic) dimension. +! ****** Since halo data is stride-1, no need for buffers. +! + lbuf=n1*n2 +! 
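+! ****** (With the two-point overlap, slabs 2 and n3-1 are the first
+! ****** and last owned phi-slabs; slabs 1 and n3 are halo slabs
+! ****** filled from the neighbors' owned slabs.)
+!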
+ call MPI_Isend (a(:,:,n3-1),lbuf,ntype_real,iproc_pp,tag, & + comm_all,reqs(1),ierr) +! + call MPI_Isend (a(:,:, 2),lbuf,ntype_real,iproc_pm,tag, & + comm_all,reqs(2),ierr) +! + call MPI_Irecv (a(:,:, 1),lbuf,ntype_real,iproc_pm,tag, & + comm_all,reqs(3),ierr) +! + call MPI_Irecv (a(:,:,n3),lbuf,ntype_real,iproc_pp,tag, & + comm_all,reqs(4),ierr) +! + call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) +! +! ****** Seam the first dimension. +! + if (nproc_r.gt.1) then +! + lbuf=n2*n3 +! + do concurrent (j=1:n3, i=1:n2) + sbuf11(i,j)=a(n1-1,i,j) + sbuf12(i,j)=a( 2,i,j) + enddo +! + call MPI_Isend (sbuf11,lbuf,ntype_real,iproc_rp,tag, & + comm_all,reqs(1),ierr) +! + call MPI_Isend (sbuf12,lbuf,ntype_real,iproc_rm,tag, & + comm_all,reqs(2),ierr) +! + call MPI_Irecv (rbuf11,lbuf,ntype_real,iproc_rm,tag, & + comm_all,reqs(3),ierr) +! + call MPI_Irecv (rbuf12,lbuf,ntype_real,iproc_rp,tag, & + comm_all,reqs(4),ierr) +! + call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) +! + if (iproc_rm.ne.MPI_PROC_NULL) then + do concurrent (j=1:n3, i=1:n2) + a( 1,i,j)=rbuf11(i,j) + enddo + end if +! + if (iproc_rp.ne.MPI_PROC_NULL) then + do concurrent (j=1:n3, i=1:n2) + a(n1,i,j)=rbuf12(i,j) + enddo + end if +! + end if +! +! ****** Seam the second dimension. +! + if (nproc_t.gt.1) then +! + lbuf=n1*n3 +! + do concurrent (j=1:n3, i=1:n1) + sbuf21(i,j)=a(i,n2-1,j) + sbuf22(i,j)=a(i, 2,j) + enddo +! + call MPI_Isend (sbuf21,lbuf,ntype_real,iproc_tp,tag, & + comm_all,reqs(1),ierr) +! + call MPI_Isend (sbuf22,lbuf,ntype_real,iproc_tm,tag, & + comm_all,reqs(2),ierr) +! + call MPI_Irecv (rbuf21,lbuf,ntype_real,iproc_tm,tag, & + comm_all,reqs(3),ierr) +! + call MPI_Irecv (rbuf22,lbuf,ntype_real,iproc_tp,tag, & + comm_all,reqs(4),ierr) +! + call MPI_Waitall (4,reqs,MPI_STATUSES_IGNORE,ierr) +! + if (iproc_tm.ne.MPI_PROC_NULL) then + do concurrent (j=1:n3, i=1:n1) + a(i, 1,j)=rbuf21(i,j) + enddo + end if +! + if (iproc_tp.ne.MPI_PROC_NULL) then + do concurrent (j=1:n3, i=1:n1) + a(i,n2,j)=rbuf22(i,j) + enddo + end if +! + end if +! + call timer_off (c_seam) +! +end subroutine +!####################################################################### +subroutine write_solution +! +!----------------------------------------------------------------------- +! +! ****** Write the global solution. +! +!----------------------------------------------------------------------- +! + use number_types + use global_dims + use global_mesh + use fields + use vars + use solve_params + use mpidefs + use decomposition + use assemble_array_interface + use timing +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! +! ****** Declaration for the global arrays. +! ****** These arrays are only allocated on processor IPROC0. +! + real(r_typ), dimension(:,:,:), allocatable :: phi_g + real(r_typ), dimension(:,:,:), allocatable :: br_g + real(r_typ), dimension(:,:,:), allocatable :: bt_g + real(r_typ), dimension(:,:,:), allocatable :: bp_g +! +!----------------------------------------------------------------------- +! + integer :: ierr + character(256) :: fname +! +!----------------------------------------------------------------------- +! + call timer_on +! +! ****** Potential. +! + if (phifile.ne.'') then +! +! ****** Allocate the global array PHI_G (on processor IPROC0). +! + if (iamp0) then + allocate (phi_g(nr_g,nt_g,np_g)) + else + allocate (phi_g(1,1,1)) + end if +! +! ****** Assemble the global PHI array. +! 
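+! ****** (The map_rh/map_th/map_ph structures hold each rank's local
+! ****** index range and its offset in the global mesh; rank IPROC0
+! ****** gathers all local sections into PHI_G using these maps.)
+!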
+ call assemble_array (map_rh,map_th,map_ph,phi,phi_g)
+!
+ fname=phifile
+!
+! ****** Write out the potential to a file.
+!
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### COMMENT from WRITE_SOLUTION:'
+ write (*,*)
+ write (*,*) 'Writing the potential to file: ',trim(fname)
+ call wrhdf_3d (fname,.true.,nr_g,nt_g,np_g, &
+ phi_g,rh_g,th_g,ph_g,hdf32,ierr)
+ end if
+!
+ deallocate (phi_g)
+!
+ end if
+!
+! ****** Br.
+!
+ if (brfile.ne.'') then
+!
+ fname=brfile
+!
+ if (iamp0) then
+ allocate (br_g(nrm1_g,nt_g,np_g))
+ else
+ allocate (br_g(1,1,1))
+ end if
+!
+! ****** Assemble the global Br array.
+!
+ call assemble_array (map_rm,map_th,map_ph,br,br_g)
+!
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### COMMENT from WRITE_SOLUTION:'
+ write (*,*)
+ write (*,*) 'Writing Br to file: ',trim(fname)
+ call wrhdf_3d (fname,.true.,nrm1_g,nt_g,np_g, &
+ br_g,r_g,th_g,ph_g,hdf32,ierr)
+ end if
+!
+ deallocate (br_g)
+!
+ end if
+!
+! ****** Bt.
+!
+ if (btfile.ne.'') then
+!
+ fname=btfile
+!
+ if (iamp0) then
+ allocate (bt_g(nr_g,ntm1_g,np_g))
+ else
+ allocate (bt_g(1,1,1))
+ end if
+!
+! ****** Assemble the global Bt array.
+!
+ call assemble_array (map_rh,map_tm,map_ph,bt,bt_g)
+!
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### COMMENT from WRITE_SOLUTION:'
+ write (*,*)
+ write (*,*) 'Writing Bt to file: ',trim(fname)
+ call wrhdf_3d (fname,.true.,nr_g,ntm1_g,np_g, &
+ bt_g,rh_g,t_g,ph_g,hdf32,ierr)
+!
+ end if
+!
+ deallocate (bt_g)
+!
+ end if
+!
+! ****** Bp.
+!
+ if (bpfile.ne.'') then
+!
+ fname=bpfile
+!
+ if (iamp0) then
+ allocate (bp_g(nr_g,nt_g,npm1_g))
+ else
+ allocate (bp_g(1,1,1))
+ end if
+!
+! ****** Assemble the global Bp array.
+!
+ call assemble_array (map_rh,map_th,map_pm,bp,bp_g)
+!
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### COMMENT from WRITE_SOLUTION:'
+ write (*,*)
+ write (*,*) 'Writing Bp to file: ',trim(fname)
+ call wrhdf_3d (fname,.true.,nr_g,nt_g,npm1_g, &
+ bp_g,rh_g,th_g,p_g,hdf32,ierr)
+!
+ end if
+!
+ deallocate (bp_g)
+!
+ end if
+!
+ call timer_off (t_io)
+!
+end subroutine
+!#######################################################################
+subroutine getb
+!
+!-----------------------------------------------------------------------
+!
+! ****** Calculate B from grad-phi.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use global_dims
+ use global_mesh
+ use vars
+ use fields
+ use local_dims
+ use local_mesh
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ integer :: i,j,k
+!
+!-----------------------------------------------------------------------
+!
+! ****** Get Br.
+!
+ do concurrent (k=1:np, j=1:nt, i=1:nrm1)
+ br(i,j,k)=(phi(i+1,j,k)-phi(i,j,k))/dr(i)
+ enddo
+!
+! ****** Get Bt.
+!
+ do concurrent (k=1:np, j=1:ntm1, i=1:nr)
+ bt(i,j,k)=(phi(i,j+1,k)-phi(i,j,k))/(rh(i)*dt(j))
+ enddo
+!
+! ****** Get Bp.
+!
+ do concurrent (k=1:npm1, j=1:nt, i=1:nr)
+ bp(i,j,k)=(phi(i,j,k+1)-phi(i,j,k))/(rh(i)*sth(j)*dp(k))
+ enddo
+!
+end subroutine
+!#######################################################################
+subroutine magnetic_energy
+!
+!-----------------------------------------------------------------------
+!
+! ****** Calculate magnetic energy from B.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use global_dims
+ use global_mesh
+ use vars
+ use fields
+ use mpidefs
+ use local_dims
+ use local_mesh
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ), parameter :: half=.5_r_typ
+!
+!-----------------------------------------------------------------------
+!
+ integer :: i,j,k,ierr
+ real(r_typ) :: brav,btav,bpav,dv
+ real(r_typ) :: wr,wt,wp
+ real(r_typ), dimension(3) :: w
+ character(32) :: fmtstr
+!
+!-----------------------------------------------------------------------
+!
+ if (hdf32) then
+ fmtstr="(A, ES14.8)"
+ else
+ fmtstr="(A,ES22.16)"
+ end if
+!
+ wr=0.
+ wt=0.
+ wp=0.
+ do concurrent (i=2:nrm1,j=2:ntm1,k=2:npm1) reduce(+:wr,wt,wp)
+ dv=rh(i)**2*drh(i)*dth(j)*sth(j)*dph(k)
+ brav=half*(br(i,j,k)+br(i-1,j,k))
+ btav=half*(bt(i,j,k)+bt(i,j-1,k))
+ bpav=half*(bp(i,j,k)+bp(i,j,k-1))
+ wr=wr+half*brav**2*dv
+ wt=wt+half*btav**2*dv
+ wp=wp+half*bpav**2*dv
+ enddo
+!
+! ****** Sum over all processors to obtain the final values, and print.
+!
+ w(1)=wr
+ w(2)=wt
+ w(3)=wp
+ call MPI_Allreduce(MPI_IN_PLACE,w,3,ntype_real, &
+ MPI_SUM,comm_all,ierr)
+!
+ if (iamp0) then
+ write (*,*)
+ write (*,*) '### COMMENT from MAGNETIC_ENERGY:'
+ write (*,*) '### Magnetic energy diagnostic:'
+ write (*,*)
+ write (*,trim(fmtstr)) 'Energy in Br**2 = ',w(1)
+ write (*,trim(fmtstr)) 'Energy in Bt**2 = ',w(2)
+ write (*,trim(fmtstr)) 'Energy in Bp**2 = ',w(3)
+ write (*,trim(fmtstr)) 'Magnetic energy = ',SUM(w)
+ write (9,*)
+ write (9,*) '### COMMENT from MAGNETIC_ENERGY:'
+ write (9,*) '### Magnetic energy diagnostic:'
+ write (9,*)
+ write (9,trim(fmtstr)) 'Energy in Br**2 = ',w(1)
+ write (9,trim(fmtstr)) 'Energy in Bt**2 = ',w(2)
+ write (9,trim(fmtstr)) 'Energy in Bp**2 = ',w(3)
+ write (9,trim(fmtstr)) 'Magnetic energy = ',SUM(w)
+ end if
+!
+end subroutine
+!#######################################################################
+subroutine assemble_array (map_r,map_t,map_p,a,a_g)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Assemble a global array (into A_G) on processor IPROC0 by
+! ****** fetching the local sections (A) from all the processors.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use decomposition
+ use mpidefs
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ type(map_struct), dimension(0:nproc-1) :: map_r,map_t,map_p
+ real(r_typ), dimension(:,:,:) :: a,a_g
+!
+!-----------------------------------------------------------------------
+!
+! ****** Storage for the buffers.
+!
+ integer :: lbuf,lsbuf
+ real(r_typ), dimension(:), allocatable :: sbuf
+ real(r_typ), dimension(:), allocatable :: rbuf
+!
+!-----------------------------------------------------------------------
+!
+ integer :: tag=0
+ integer :: irank,l1,l2,l3,i,j,k,ii
+ integer :: i0,j0,k0,i1,j1,k1
+ integer :: i0g,j0g,k0g
+ integer :: ierr
+!
+!-----------------------------------------------------------------------
+!
+ l1=map_r(iproc)%n
+ l2=map_t(iproc)%n
+ l3=map_p(iproc)%n
+ lsbuf=l1*l2*l3
+!
+ i0=map_r(iproc)%i0
+ i1=map_r(iproc)%i1
+ j0=map_t(iproc)%i0
+ j1=map_t(iproc)%i1
+ k0=map_p(iproc)%i0
+ k1=map_p(iproc)%i1
+!
+! ****** Extract a 1D array of non-overlapping values from the local array.
+!
+ allocate (sbuf(lsbuf))
+!
+ sbuf=reshape(a(i0:i1,j0:j1,k0:k1),(/lsbuf/))
+!
+! ****** If proc0, receive and store the local arrays into the global array.
+!
+ if (iamp0) then
+ do irank=0,nproc-1
+!
+ l1=map_r(irank)%n
+ l2=map_t(irank)%n
+ l3=map_p(irank)%n
+ lbuf=l1*l2*l3
+!
+ i0g=map_r(irank)%offset
+ j0g=map_t(irank)%offset
+ k0g=map_p(irank)%offset
+!
+! ****** If proc0 is the current rank in the loop, simply copy the local array.
+ if (iproc==irank) then
+ do k=1,l3
+ do j=1,l2
+ do i=1,l1
+ ii=l2*l1*(k-1)+l1*(j-1)+i
+ a_g(i0g+i-1,j0g+j-1,k0g+k-1)=sbuf(ii)
+ enddo
+ enddo
+ enddo
+! ****** Otherwise, receive the data:
+ else
+ allocate (rbuf(lbuf))
+ call MPI_Recv (rbuf,lbuf,ntype_real,irank,tag, &
+ comm_all,MPI_STATUS_IGNORE,ierr)
+ do k=1,l3
+ do j=1,l2
+ do i=1,l1
+ ii=l2*l1*(k-1)+l1*(j-1)+i
+ a_g(i0g+i-1,j0g+j-1,k0g+k-1)=rbuf(ii)
+ enddo
+ enddo
+ enddo
+ deallocate(rbuf)
+ end if
+ enddo
+ else
+!
+! ****** Send local array to iproc0.
+!
+ call MPI_Ssend (sbuf,lsbuf,ntype_real,iproc0,tag,comm_all,ierr)
+!
+ end if
+ deallocate (sbuf)
+!
+end subroutine
+!#######################################################################
+subroutine timer_on
+!
+!-----------------------------------------------------------------------
+!
+! ****** Push an entry onto the timing stack and initialize
+! ****** a timing event.
+!
+!-----------------------------------------------------------------------
+!
+! ****** This routine can be called in a nested way to measure
+! ****** multiple timing events. Calls to TIMER_ON and TIMER_OFF
+! ****** need to be nested like do-loops in FORTRAN.
+!
+!-----------------------------------------------------------------------
+!
+ use mpidefs
+ use timer
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ if (istack.ge.nstack) then
+ write (*,*)
+ write (*,*) '### WARNING from TIMER_ON:'
+ write (*,*) '### Timing stack depth exceeded.'
+ write (*,*) 'This may signal an incorrect nesting of '// &
+ 'TIMER_ON/TIMER_OFF calls.'
+ write (*,*) 'Timing information will not be valid.'
+ return
+ else
+ istack=istack+1
+ end if
+!
+ tstart(istack)=MPI_Wtime()
+!
+end subroutine
+!#######################################################################
+subroutine timer_off (tused)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Increment the wall-clock time used since the call to
+! ****** TIMER_ON in variable TUSED, and pop an entry off the
+! ****** timing stack.
+!
+!-----------------------------------------------------------------------
+!
+! ****** This routine can be called in a nested way to measure
+! ****** multiple timing events. Calls to TIMER_ON and TIMER_OFF
+! ****** need to be nested like do-loops in FORTRAN.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use mpidefs
+ use timer
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ) :: tused
+!
+!-----------------------------------------------------------------------
+!
+ if (istack.le.0) then
+ write (*,*)
+ write (*,*) '### WARNING from TIMER_OFF:'
+ write (*,*) '### Timing stack cannot be popped.'
+ write (*,*) 'This may signal an incorrect nesting of '// &
+ 'TIMER_ON/TIMER_OFF calls.'
+ write (*,*) 'Timing information will not be valid.'
+ return
+ else
+ istack=istack-1
+ end if
+!
+ tused=tused+MPI_Wtime()-tstart(istack+1) +! +end subroutine +!####################################################################### +subroutine write_timing +! +!----------------------------------------------------------------------- +! +! ****** Write out the timing info. +! +!----------------------------------------------------------------------- +! + use number_types + use mpidefs + use timing +! +!----------------------------------------------------------------------- +! + implicit none +! +!----------------------------------------------------------------------- +! +! ****** Timing buffers. +! + integer, parameter :: lbuf=10 + real(r_typ), dimension(lbuf) :: sbuf + real(r_typ), dimension(lbuf,0:nproc-1) :: tbuf +! +! ****** Timing statistics. +! + real(r_typ), dimension(lbuf) :: tmin,tmax,tavg,tsdev +! +!----------------------------------------------------------------------- +! + integer :: ierr,irank + real(r_typ) :: t_tot_avg,c_tot_avg,c_tot +! + character(80) :: tfile='timing.out' +! +!----------------------------------------------------------------------- +! +! ****** Gather the timing information for all processors into TBUF. +! + sbuf(1)=t_solve + sbuf(2)=t_startup + sbuf(3)=t_io + sbuf(4)=t_ax + sbuf(5)=t_pc_load + sbuf(6)=t_pc + sbuf(7)=c_seam + sbuf(8)=c_cgdot + sbuf(9)=c_sumphi + sbuf(10)=t_wall +! + call MPI_Allgather (sbuf,lbuf,ntype_real, & + tbuf,lbuf,ntype_real,comm_all,ierr) +! +! ****** Calculate the timing statistics. +! + tavg=sum(tbuf,dim=2)/nproc + tmin=minval(tbuf,dim=2) + tmax=maxval(tbuf,dim=2) +! + tsdev(:)=0. + do irank=0,nproc-1 + tsdev(:)=tsdev(:)+(tbuf(:,irank)-tavg(:))**2 + enddo + tsdev(:)=sqrt(tsdev(:)/nproc) +! + t_tot_avg=tavg(10) + c_tot_avg=tavg(7)+tavg(8)+tavg(9) +! + if (iamp0) then +! + call ffopen (1,tfile,'rw',ierr) +! + if (ierr.ne.0) then + write (*,*) + write (*,*) '### WARNING from WRITE_TIMING:' + write (*,*) '### Could not create the timing file.' + write (*,*) 'File name: ',trim(tfile) + end if +! + do irank=0,nproc-1 + c_tot=tbuf(7,irank)+tbuf(8,irank)+tbuf(9,irank) + write (1,*) + write (1,100) + write (1,*) + write (1,*) 'Processor id = ',irank + write (1,*) + write (1,200) 'Comm. time in SEAM = ',tbuf(7,irank) + write (1,200) 'Comm. time in CGDOT = ',tbuf(8,irank) + write (1,200) 'Comm. time in SUMPHI = ',tbuf(9,irank) + write (1,*) '------------------------------------' + write (1,200) 'Total comm. time = ',c_tot + write (1,*) + write (1,200) 'Time used in start-up = ',tbuf(2,irank) + write (1,200) 'Time used in i/o = ',tbuf(3,irank) + write (1,200) 'Time used in POTFLD = ',tbuf(1,irank) + write (1,200) 'Time used in AX = ',tbuf(4,irank) + write (1,200) 'Time used in PCLOAD = ',tbuf(5,irank) + write (1,200) 'Time used in PC = ',tbuf(6,irank) + write (1,*) '------------------------------------' + write (1,200) 'Total time used = ',tbuf(10,irank) + 100 format (80('-')) + 200 format (1x,a,f12.6) + enddo + write (1,*) + write (1,100) +! + write (1,*) + write (1,*) 'Average times:' + write (1,*) '-------------' + write (1,*) + write (1,300) ' Avg Min Max S. Dev' + write (1,300) ' --- --- --- ------' + write (1,400) 'Avg comm. time in SEAM = ', & + tavg(7),tmin(7),tmax(7),tsdev(7) + write (1,400) 'Avg comm. time in CGDOT = ', & + tavg(8),tmin(8),tmax(8),tsdev(8) + write (1,400) 'Avg comm. 
time in SUMPHI = ', &
+ tavg(9),tmin(9),tmax(9),tsdev(9)
+ write (1,400) 'Avg time used in start-up = ', &
+ tavg(2),tmin(2),tmax(2),tsdev(2)
+ write (1,400) 'Avg time used in i/o = ', &
+ tavg(3),tmin(3),tmax(3),tsdev(3)
+ write (1,400) 'Avg time used in POTFLD = ', &
+ tavg(1),tmin(1),tmax(1),tsdev(1)
+ write (1,400) 'Avg time used in AX = ', &
+ tavg(4),tmin(4),tmax(4),tsdev(4)
+ write (1,400) 'Avg time used in PCLOAD = ', &
+ tavg(5),tmin(5),tmax(5),tsdev(5)
+ write (1,400) 'Avg time used in PC = ', &
+ tavg(6),tmin(6),tmax(6),tsdev(6)
+ write (1,400) 'Avg total time = ', &
+ tavg(10),tmin(10),tmax(10),tsdev(10)
+ 300 format (1x,33x,a)
+ 400 format (1x,a,4f12.3)
+!
+ write (1,*)
+ write (1,200) 'Average time used per proc = ',t_tot_avg
+ write (1,200) 'Average comm. time per proc = ',c_tot_avg
+ write (1,*)
+ write (1,100)
+ write (1,*)
+!
+ close (1)
+!
+ write(*,*)
+ write(*,200) 'Total time: ',tmax(10)
+ write(*,200) 'Max Comm/Imbalance time: ', &
+ tmax(7)+tmax(8)+tmax(9)
+ write(*,*)
+!
+ end if
+!
+end subroutine
+!#######################################################################
+subroutine readbr (fname,br0_g,ierr)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Read in the radial magnetic field at the photosphere
+! ****** and interpolate it into array BR0_G.
+!
+! ****** FNAME is the name of the file to read.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+ use global_dims
+ use global_mesh
+ use vars
+ use rdhdf_2d_interface
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ character(*) :: fname
+ real(r_typ), dimension(nt_g,np_g) :: br0_g
+ integer :: ierr
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ), parameter :: two=2._r_typ
+!
+!-----------------------------------------------------------------------
+!
+! ****** The Br array read in, and its scales.
+!
+ real(r_typ), dimension(:,:), pointer :: bn
+ real(r_typ), dimension(:), pointer :: tn,pn
+!
+!-----------------------------------------------------------------------
+!
+ integer :: ntn,npn,j,k
+ logical :: scale
+ real(r_typ) :: sum0,sum1,area,fluxp,fluxm,da,br00err
+!
+!-----------------------------------------------------------------------
+!
+ ierr=0
+!
+! ****** Read in the normal field.
+!
+ write (*,*)
+ write (*,*) '### COMMENT from READBR:'
+ write (*,*) '### Reading Br file: ',trim(fname)
+!
+ call rdhdf_2d (fname,scale,ntn,npn,bn,tn,pn,ierr)
+!
+ if (ierr.ne.0) then
+ write (*,*)
+ write (*,*) '### ERROR in READBR:'
+ write (*,*) '### The flux file has the wrong format.'
+ write (*,*) 'IERR (from RDHDF_2D) = ',ierr
+ write (*,*) 'File name: ',trim(fname)
+ ierr=1
+ return
+ end if
+!
+! ****** Check that the array has scales.
+!
+ if (.not.scale) then
+ write (*,*)
+ write (*,*) '### ERROR in READBR:'
+ write (*,*) '### The flux file does not have scales.'
+ write (*,*) 'File name: ',trim(fname)
+ ierr=2
+ return
+ end if
+!
+! ****** Interpolate the field to the code mesh (into array BR0_G).
+!
+ call intrp2d (ntn,npn,tn,pn,bn, &
+ nt_g-2,np_g-2,th_g(2:ntm1_g),ph_g(2:npm1_g), &
+ br0_g(2:ntm1_g,2:npm1_g),ierr)
+!
+ if (ierr.ne.0) then
+ write (*,*)
+ write (*,*) '### ERROR in READBR:'
+ write (*,*) '### The scales in the Br file are invalid.'
+ write (*,*) 'File name: ',trim(fname)
+ ierr=3
+ return
+ end if
+!
+! ****** De-allocate the memory for the BN array and its scales.
+!
+ deallocate (bn)
+ deallocate (tn)
+ deallocate (pn)
+!
+! ****** Set Br to be periodic.
+!
+ br0_g(:,1)=br0_g(:,npm1_g)
+ br0_g(:,np_g)=br0_g(:,2)
+!
+! ****** Set BCs at the poles.
+!
+! ****** Br has only an m=0 component there.
+!
+ sum0=sum(br0_g( 2,2:npm1_g)*dph_g(2:npm1_g))*pl_i
+ sum1=sum(br0_g(ntm1_g,2:npm1_g)*dph_g(2:npm1_g))*pl_i
+!
+ br0_g(1 ,:)=two*sum0-br0_g( 2,:)
+ br0_g(nt_g,:)=two*sum1-br0_g(ntm1_g,:)
+!
+! ****** Calculate the total flux.
+!
+ area=0.
+ fluxp=0.
+ fluxm=0.
+ do j=2,ntm1_g
+ do k=2,npm1_g
+ da=sth_g(j)*dth_g(j)*dph_g(k)
+ if (br0_g(j,k).gt.0.) then
+ fluxp=fluxp+br0_g(j,k)*da
+ else
+ fluxm=fluxm+br0_g(j,k)*da
+ end if
+ area=area+da
+ enddo
+ enddo
+!
+ write (*,*)
+ write (*,*) '### COMMENT from READBR:'
+ write (*,*) '### Computed flux balance:'
+ write (*,*)
+ write (*,*) 'Positive flux = ',fluxp
+ write (*,*) 'Negative flux = ',fluxm
+!
+! ****** Fix the magnetic field so that the total flux is zero
+! ****** (unless flux balancing has been disabled by the user).
+!
+ if (.not.((option.eq.'ss'.or.option.eq.'open') &
+ .and.do_not_balance_flux)) then
+!
+ br00err=(fluxp+fluxm)/area
+!
+ do k=1,np_g
+ do j=1,nt_g
+ br0_g(j,k)=br0_g(j,k)-br00err
+ enddo
+ enddo
+!
+ write (*,*)
+ write (*,*) '### COMMENT from READBR:'
+ write (*,*) '### Flux balance correction:'
+ write (*,*)
+ write (*,*) 'BR00 (monopole Br field magnitude) = ',br00err
+!
+ end if
+!
+end subroutine
+!#######################################################################
+subroutine intrp2d (nxi,nyi,xi,yi,fi,nx,ny,x,y,f,ierr)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Interpolate a 2D field from array FI(NXI,NYI), defined
+! ****** on the mesh XI(NXI) x YI(NYI), into the array F(NX,NY),
+! ****** defined on the mesh X(NX) x Y(NY).
+!
+! ****** Note that if a data point is outside the bounds of
+! ****** the XI x YI mesh, IERR=2 is returned.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ integer :: nxi,nyi
+ real(r_typ), dimension(nxi) :: xi
+ real(r_typ), dimension(nyi) :: yi
+ real(r_typ), dimension(nxi,nyi) :: fi
+ integer :: nx,ny
+ real(r_typ), dimension(nx) :: x
+ real(r_typ), dimension(ny) :: y
+ real(r_typ), dimension(nx,ny) :: f
+ integer :: ierr
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ), parameter :: zero=0._r_typ
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ), external :: flint
+!
+!-----------------------------------------------------------------------
+!
+ integer :: i,j,ii,jj
+ real(r_typ) :: dum(1)=0.
+ real(r_typ) :: dummy,xv,yv,ax,ay
+!
+!-----------------------------------------------------------------------
+!
+ ierr=0
+!
+! ****** Check that the scales XI and YI are monotonic.
+!
+ dummy=flint(zero,nxi,xi,dum,1,ierr)
+ if (ierr.ne.0) go to 900
+!
+ dummy=flint(zero,nyi,yi,dum,1,ierr)
+ if (ierr.ne.0) go to 900
+!
+! ****** Interpolate the data.
+!
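+! ****** For each target point (xv,yv), INTERP returns the cell
+! ****** index and the linear weight in each dimension, and the
+! ****** value is the standard bilinear combination:
+! ****** f = (1-ax)*(1-ay)*fi(ii,jj) + ax*(1-ay)*fi(ii+1,jj)
+! ****** + (1-ax)*ay*fi(ii,jj+1) + ax*ay*fi(ii+1,jj+1)
+!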
+ do j=1,ny
+ yv=y(j)
+ if (yv.lt.yi(1).or.yv.gt.yi(nyi)) then
+ go to 910
+ end if
+ call interp (yi,nyi,yv,jj,ay)
+ do i=1,nx
+ xv=x(i)
+ if (xv.lt.xi(1).or.xv.gt.xi(nxi)) then
+ go to 910
+ end if
+ call interp (xi,nxi,xv,ii,ax)
+ f(i,j)= (1.-ax)*((1.-ay)*fi(ii ,jj )+ay*fi(ii ,jj+1)) &
+ + ax *((1.-ay)*fi(ii+1,jj )+ay*fi(ii+1,jj+1))
+ enddo
+ enddo
+!
+ return
+!
+! ****** Error exit: invalid scales.
+!
+ 900 continue
+!
+ write (*,*)
+ write (*,*) '### ERROR in INTRP2D:'
+ write (*,*) '### Scales are not monotonically increasing.'
+ ierr=1
+!
+ return
+!
+! ****** Error exit: data outside range of scales.
+!
+ 910 continue
+!
+ write (*,*)
+ write (*,*) '### ERROR in INTRP2D:'
+ write (*,*) '### An interpolation was attempted outside the'// &
+ ' range of the defined scales.'
+ ierr=2
+!
+end subroutine
+!#######################################################################
+function flint (x,n,xn,fn,icheck,ierr)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Interpolate a function linearly.
+!
+!-----------------------------------------------------------------------
+!
+! ****** The function is defined at N nodes, with values given by
+! ****** FN(N) at positions XN(N). The function value returned is
+! ****** the linear interpolant at X.
+!
+! ****** Note that if X.lt.XN(1), the function value returned
+! ****** is FN(1), and if X.gt.XN(N), the function value returned
+! ****** is FN(N).
+!
+! ****** Call once with ICHECK.ne.0 to check that the values
+! ****** in XN(N) are monotonically increasing. In this mode
+! ****** the array XN(N) is checked, and X and FN(N) are not
+! ****** accessed. If the check is passed, IERR=0 is returned.
+! ****** Otherwise, IERR=1 is returned.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ real(r_typ) :: flint
+ real(r_typ) :: x
+ integer :: n
+ real(r_typ), dimension(n) :: xn,fn
+ integer :: icheck
+ integer :: ierr
+!
+!-----------------------------------------------------------------------
+!
+ integer :: i,j
+ real(r_typ) :: x1,x2,alpha
+!
+!-----------------------------------------------------------------------
+!
+ ierr=0
+ flint=0.
+!
+! ****** If ICHECK.ne.0, check the abscissa table.
+!
+ if (icheck.ne.0) then
+ if (n.le.0) then
+ write (*,*)
+ write (*,*) '### ERROR in FLINT:'
+ write (*,*) '### Bad dimension of abscissa table.'
+ write (*,*) 'N = ',n
+ ierr=1
+ return
+ end if
+ do 100 i=1,n-1
+ if (xn(i+1).le.xn(i)) then
+ write (*,*)
+ write (*,*) '### ERROR in FLINT:'
+ write (*,*) '### Bad data in abscissa table.'
+ write (*,*) 'N = ',n
+ write (*,*) 'XN = '
+ write (*,*) (xn(j),j=1,n)
+ ierr=1
+ return
+ end if
+ 100 continue
+ return
+ end if
+!
+! ****** Get the interpolated value.
+!
+ if (x.le.xn(1)) then
+ flint=fn(1)
+ else if (x.gt.xn(n)) then
+ flint=fn(n)
+ else
+ do 200 i=1,n-1
+ if (x.ge.xn(i).and.x.lt.xn(i+1)) go to 300
+ 200 continue
+ 300 continue
+ x1=xn(i)
+ x2=xn(i+1)
+ alpha=(x-x1)/(x2-x1)
+ flint=fn(i)*(1.-alpha)+fn(i+1)*alpha
+ end if
+!
+ return
+end function
+!#######################################################################
+subroutine interp (x,n,xv,i,alpha)
+!
+!-----------------------------------------------------------------------
+!
+! ****** Get interpolation factor ALPHA and table index I.
+!
+! ****** This routine does not do the actual interpolation. Use the
+! ****** returned values of I and ALPHA to get the interpolated value.
+!
+!-----------------------------------------------------------------------
+!
+ use number_types
+!
+!-----------------------------------------------------------------------
+!
+ implicit none
+!
+!-----------------------------------------------------------------------
+!
+ integer :: n
+ real(r_typ), dimension(n) :: x
+ real(r_typ) :: xv
+ integer :: i
+ real(r_typ) :: alpha
+!
+!-----------------------------------------------------------------------
+!
+ do 100 i=1,n-1
+ if (xv.ge.x(i).and.xv.le.x(i+1)) then
+ alpha=(xv-x(i))/(x(i+1)-x(i))
+ go to 200
+ end if
+ 100 continue
+!
+! ****** Value not found --- signal error and stop.
+!
+ write (*,*)
+ write (*,*) '### ERROR in INTERP:'
+ write (*,*) '### Value not found in table.'
+ write (*,*) 'Value requested = ',xv
+ write (*,*) 'Min table value = ',x(1)
+ write (*,*) 'Max table value = ',x(n)
+ call endrun (.true.)
+!
+ 200 continue
+!
+end subroutine
+!#######################################################################
+!
+! ****** Revision history:
+!
+! ### Version 1.00, 03/02/2006, file pot3d.f, modified by ZM:
+!
+! - Cleaned up the previous version of POT3D.
+!
+! ### Version 1.01, 03/06/2006, file pot3d.f, modified by ZM:
+!
+! - Added the ability to do a "source-surface plus
+! current-sheet" solution. Select this by setting
+! OPTION='ss+cs'.
+!
+! ### Version 1.02, 06/18/2007, file pot3d.f, modified by ZM:
+!
+! - Fixed a bug that caused the code to hang when an error
+! was encountered (when running a parallel job).
+!
+! ### Version 1.03, 03/17/2009, file pot3d.f, modified by ZM:
+!
+! - Added the ability to write the boundary flux before the
+! sign flip for current-sheet solutions (i.e., OPTION='open').
+! Set the variable BR_PHOTO_ORIGINAL_FILE to the desired
+! file name to request this.
+!
+! ### Version 1.50, 01/25/2016, file pot3d.f, modified by RC:
+!
+! - Added new (much faster) BILU0 preconditioner to CG solver.
+! To activate, set ifprec=2 in pot3d.dat file.
+! - Modified CG solve to use 1D arrays
+! for SAXPY and DOT operations.
+!
+! ### Version 2.00, 06/06/2017, file pot3d.f, modified by RC:
+!
+! - Added OpenACC support to the code.
+! - OpenACC support is currently ONLY on 'standard'
+! pot3d runs (not inner-outer-iterative mode)
+! and is only efficient on GPUs when using ifprec=1.
+! - OpenACC adds support for running the code on
+! Nvidia GPUs/Intel KNL/x86-multicore/OpenPower.
+! - To use OpenACC, simply compile the code with a compiler
+! that supports OpenACC with the correct flags activated.
+! - Multi-gpu support is included by using the new
+! ngpus_per_node input parameter. This should be set
+! to the number of GPUs available per node.
+! The number of MPI ranks per node should match the
+! number of gpus per node. This can be launched with
+! "mpiexec -np -ntasks-per-node ".
+! - The GPU features of the code are fully portable, i.e.
+! the code can be compiled/used as before on CPUs with no
+! changes in compilation or run-time.
+! - Modified some routines to be "nicer" for OpenACC
+! and optimized some MPI communications.
+! - Added wall-clock timer and corrected placement of
+! MPI_Finalize(). The wall clock timer now reflects the
+! true runtime.
+!
+! ### Version 2.01, 10/02/2017, file pot3d.f, modified by RC:
+!
+! - Optimized OpenACC.
+! - Renamed cgsolv() to cgsolve().
+!
+! ### Version 2.10, 01/15/2018, file pot3d.f, modified by ZM+RC:
+!
+!
- Added the ability to skip the balancing of the flux
+! when doing a PFSS or OPEN field. To invoke this, set
+! DO_NOT_BALANCE_FLUX=.true..
+! - Changed some pointers to allocatables for better
+! vectorization.
+!
+! ### Version 2.11, 03/19/2018, file pot3d.f, modified by RC:
+!
+! - Added 'fmt' input parameter to set output file type.
+! Set fmt='h5' to write out HDF5 (default is 'hdf').
+!
+! ### Version 2.12, 10/08/2018, file pot3d.f, modified by RC:
+!
+! - COMPATIBILITY CHANGE! Renamed gpus_per_node to gpn.
+! gpn defaults to 0, which will set gpn to the number of
+! MPI ranks in the local node.
+! Setting gpn manually is not recommended and is only used
+! for oversubscribing the GPUs.
+! - Added MPI shared communicator for automatically setting gpn.
+! This requires an MPI-3 capable MPI library.
+! - Changed layout of matrix coefficient arrays to be more
+! vector-friendly instead of cache-friendly.
+!
+! ### Version 2.13, 11/19/2018, file pot3d.f, modified by RC:
+!
+! - Small modifications to polar boundary condition calculations.
+! - Updated array ops and ACC clauses to be F2003 optimized.
+!
+! ### Version 2.20, 01/09/2019, file pot3d.f, modified by RC:
+!
+! - Added double precision output option.
+! Set hdf32=.false. to activate 64-bit output.
+! - Updated magnetic field computation. B is now computed
+! in parallel. 3D output fields now gathered to rank 0
+! using Sends and Receives instead of Gatherv in order
+! to allow very large resolutions.
+! - Added automatic topology. Now, nprocs is optional.
+! One can specify one or more topology dimensions and
+! use the flag value "-1" to indicate dimensions to auto-set.
+! It is recommended to simply delete nprocs from input files.
+! - Added output file flushing so CG iterations can be monitored.
+! - Added new MPI rank diagnostics including
+! estimated load imbalance.
+! - Processor topology and magnetic energy output now written to
+! pot3d.out as well as the terminal.
+!
+! ### Version 2.21, 01/11/2019, file pot3d.f, modified by RC:
+!
+! - Small updates to magnetic_energy routine.
+!
+! ### Version 2.22, 11/27/2019, file pot3d.f, modified by RC:
+!
+! - Optimized some OpenACC directives. Expanded some
+! array-syntax lines to full loops.
+!
+! ### Version 2.23, 08/11/2020, file pot3d.f, modified by RC:
+!
+! - Small bug fix for default output file names and
+! format option fmt.
+!
+! ### Version 3.0.0, 02/10/2021, file pot3d.f, modified by RC:
+!
+! - Changed version number scheme to semantic versioning.
+!
+! ### Version 3.1.0, 08/11/2020, file pot3d.f, modified by RC:
+!
+! - Removed ss+cs inner-outer iterative mode.
+! It had not been used in many years and never worked
+! quite right. The algorithm is saved in previous releases
+! on GitHub and SVN.
+! - Added a validation run mode. To activate, set:
+! VALIDATION_RUN=.TRUE.
+! This will set up a tilted dipole exact solution, write it
+! out, and then solve for it. It will ignore input br files.
+! To change the angle of the dipole, set DIPOLE_ANGLE
+! (default is pi/4).
+! - Added NCGFLUSH parameter to flush the output file every
+! NCGFLUSH iterations (default 25) during the solve when
+! NCGHIST>0.
+! This is useful for tracking the progress of the solver.
+!
+! ### Version 3.1.1, 11/02/2021, modified by RC:
+!
+! - Fixed default output name mechanics. The default names
+! were ignoring "fmt", making "fmt" almost useless.
+!
+! ### Version 3.2.0, 12/21/2021, modified by RC:
+!
+! - GPN is now deprecated. Now, the # of GPUs per node is assumed
+! to be equal to the # of MPI ranks per node.
+! This removes the ability to oversubscribe GPUs, but this
+! was never being used, so it is no great loss.
+! This change was done to allow setting the gpu device
+! number before any allocations so that it works correctly
+! when using NVIDIA unified managed memory.
+!
+! ### Version 3.3.0, 02/14/2022, modified by RC:
+!
+! - Added new timers for AX, PC, PCLOAD, and modified some timers
+! to be more useful.
+! - Updated ILU0 PC2 preconditioner to use single precision.
+! This speeds up the solve both on CPUs and GPUs.
+!
+! ### Version 4.0.0, 03/02/2022, modified by RC/MS:
+!
+! - Added cuSparse library calls for ifprec=2 on NVIDIA GPUs.
+! - Added #ifdef
+! preprocessor directives which compile either a cuSparse
+! GPU version of the code or a CPU-compatible version
+! with no cuSparse calls or C code references.
+!
+! ### Version 4.0.1, 03/31/2022, modified by RC:
+!
+! - Added 'target' attribute to arrays used with C_LOC.
+! This fixes a compiler error with GFORTRAN when
+! using cuSparse.
+!
+! ### Version 4.1.0, 10/24/2022, modified by RC:
+!
+! - Removed NCGFLUSH input parameter. The output is
+! flushed at every write now.
+! - Changed behavior of solver residual reporting
+! to also output to terminal.
+! - Removed flush_output_file() routine in favor of Fortran 2003
+! intrinsic FLUSH.
+!
+! ### Version 4.2.0, 08/21/2024, modified by MS/RC:
+!
+! - Replaced OpenACC with OpenMP Target directives.
+! There remains a single OpenACC directive for device selection
+! in order to work with nvfortran.
+!
+! ### Version 4.2.1, 09/19/2024, modified by RC:
+!
+! - Changed array reduction loops to use
+! OpenMP reductions instead of atomics.
+! - Updated OpenMP target directives to use the
+! more streamlined "loop" syntax.
+!
+! ### Version 4.2.2, 12/05/2024, modified by RC:
+!
+! - Fixed bug with OpenMP target loops.
+! Was using 'target loop' instead of 'target teams loop';
+! nvfortran seemed to allow this, but ifx does not.
+!
+! ### Version 4.3.0, 12/05/2024, modified by RC:
+!
+! - Converted code to free-format and renamed file from
+! pot3d.F to pot3d.F90.
+!
+! ### Version 4.3.1, 12/05/2024, modified by RC:
+!
+! - Code cleanup of modules.
+! - Made solver seam buffers global to avoid repeated
+! GPU allocations and deallocations.
+!
+!#######################################################################