diff --git a/.github/workflows/Build.yml b/.github/workflows/Build.yml
index 8b6d4ea20..2e4f69818 100644
--- a/.github/workflows/Build.yml
+++ b/.github/workflows/Build.yml
@@ -42,20 +42,19 @@ jobs:
         ./configure --prefix=$(pwd)/installed
         make all install
 
-    - name: Build Incompact3d
+    - name: Build Xcompact3d
       run: |
         export PATH=$(pwd)/openmpi-4.1.4/installed/bin/:$PATH
-        make BUILD=debug
-
-    - name: Run TGV case
-      run: |
-        cp test/data/Taylor-Green-Vortex/reference_input.i3d input.i3d
-        export PATH=$(pwd)/openmpi-4.1.4/installed/bin/:$PATH
-        mpirun -np 2 xcompact3d
+        export FC=mpif90
+        cmake -S . -B build 
+        cmake --build build -j 2
+        cmake --install build
+        ctest --test-dir build
 
     - name: Compare output time evolution to reference output
       run: |
         pip install numpy
-        python test/compare_TGV_time_evolution.py \
+        cd build/RunTests/TGV
+        python compare_TGV_time_evolution.py \
             --input time_evol.dat \
-            --reference test/data/Taylor-Green-Vortex/reference_time_evol.dat
+            --reference reference_time_evol.dat
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b56e7316c..73e64d436 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,11 +1,14 @@
-cmake_minimum_required(VERSION 3.0.2)                        
-cmake_policy(SET CMP0048 NEW)
+cmake_minimum_required(VERSION 3.20)                        
+cmake_policy(SET CMP0074 NEW)
 
-project(xcompact3d LANGUAGES Fortran)
+project(Xcompact3d LANGUAGES Fortran)
+set(version 5.0)
+if (IO_BACKEND MATCHES "adios2")
+  # Can be useful to also activate CXX, sometimes is needed by packages
+  enable_language(C CXX)
+endif (IO_BACKEND MATCHES "adios2")
 
-set(AUTHOR "Stefano Rolfo;Charles Moulinec")
-set(AUTHOR_DETAILS "stefano.rolfo@stfc.ac.uk;charles.moulinec@stfc.ac.uk")
-set(DESCRIPTION "Building xcompact3d using cmake")
+set(DESCRIPTION "Building Xcompact3d using cmake")
 
 message(STATUS "building ${PROJECT_NAME}")
 
@@ -20,163 +23,50 @@ endif()
 INCLUDE(CMakeDependentOption)
 INCLUDE(CMakeParseArguments)
 
-# Find the modules included with Xcompact
-#SET(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
-
 # make sure that the default is a RELEASE
 if (NOT CMAKE_BUILD_TYPE)
   set (CMAKE_BUILD_TYPE RELEASE CACHE STRING
-      "Choose the type of build, options are: None Debug Release."
-      FORCE)
+       "Choose the type of build, options are: NONE DEV DEBUG RELEASE."
+       FORCE)
 endif (NOT CMAKE_BUILD_TYPE)
-
-set(Fortran_COMPILER_NAME ${CMAKE_Fortran_COMPILER_ID} )
-message(STATUS "COMP ID ${Fortran_COMPILER_NAME}")
-message(STATUS "Fortran compiler name ${Fortran_COMPILER_NAME}")
-message(STATUS "Fortran compiler version ${CMAKE_Fortran_COMPILER_VERSION}")
-if (Fortran_COMPILER_NAME MATCHES "GNU")
-  # gfortran
-  message(STATUS "Setting gfortran flags")
-  set(CMAKE_Fortran_FLAGS "-cpp -funroll-loops -floop-optimize -g -Warray-bounds -fcray-pointer -fbacktrace -ffree-line-length-none")
-  if (CMAKE_Fortran_COMPILER_VERSION GREATER_EQUAL "10")
-    message(STATUS "Set New Fortran basic flags")
-    set(CMAKE_Fortran_FLAGS "-cpp -funroll-loops -floop-optimize -g -Warray-bounds -fcray-pointer -fbacktrace -ffree-line-length-none -fallow-argument-mismatch")
-  endif (CMAKE_Fortran_COMPILER_VERSION GREATER_EQUAL "10")
-  set(CMAKE_Fortran_FLAGS_RELEASE "-funroll-all-loops -fno-f2c -O3")
-  set(CMAKE_Fortran_FLAGS_DEBUG   "-DDEBG -O0 -g")
-elseif (Fortran_COMPILER_NAME MATCHES "Intel")
-  message(STATUS "Setting ifort flags")
-  set(CMAKE_Fortran_FLAGS "-fpp -xHost -heaparrays -safe-cray-ptr -g -traceback")
-  set (CMAKE_Fortran_FLAGS_RELEASE "-O3 -ipo")
-  set (CMAKE_Fortran_FLAGS_DEBUG   "-O0 -g -DDEBG")
-  #set(CMAKE_Fortran_FLAGS "-cpp xSSE4.2 -axAVX,CORE-AVX-I,CORE-AVX2 -ipo -fp-model fast=2 -mcmodel=large -safe-cray-ptr")
-elseif (Fortran_COMPILER_NAME MATCHES "NAG")
-  message(STATUS "Setting nagfor flags")
-  set(CMAKE_Fortran_FLAGS "-fpp")
-  set (CMAKE_Fortran_FLAGS_RELEASE "-O3")
-  set (CMAKE_Fortran_FLAGS_DEBUG   "-O0 -g")
-elseif (Fortran_COMPILER_NAME MATCHES "Cray")
-  message(STATUS "Setting cray fortran flags")
-  set(CMAKE_Fortran_FLAGS "-eF -g -N 1023")
-  set (CMAKE_Fortran_FLAGS_RELEASE "-O3")
-  set (CMAKE_Fortran_FLAGS_DEBUG   "-O0 -g")
-elseif (Fortran_COMPILER_NAME MATCHES "NVHPC")
-  message(STATUS "Setting NVHPC fortran flags")
-  set(CMAKE_Fortran_FLAGS "-cpp -Mfree -Kieee -g")
-  set (CMAKE_Fortran_FLAGS_RELEASE "-O3 -fast")
-  set (CMAKE_Fortran_FLAGS_DEBUG   "-O0 -DDEBG")
-elseif (Fortran_COMPILER_NAME MATCHES "Fujitsu")
-  message(STATUS "Setting Fujitsu fortran flags")
-  set (CMAKE_Fortran_FLAGS "-Cpp")
-  set (CMAKE_Fortran_FLAGS_RELEASE "-O3")
-  set (CMAKE_Fortran_FLAGS_DEBUG "-O0")
-else (Fortran_COMPILER_NAME MATCHES "GNU")
-  message ("CMAKE_Fortran_COMPILER full path: " ${CMAKE_Fortran_COMPILER})
-  message ("Fortran compiler: " ${Fortran_COMPILER_NAME})
-  message ("No optimized Fortran compiler flags are known, we just try -O2...")
-  set (CMAKE_Fortran_FLAGS_RELEASE "-O2")
-  set (CMAKE_Fortran_FLAGS_DEBUG   "-O0 -g")
-endif (Fortran_COMPILER_NAME MATCHES "GNU")
-
-if (CMAKE_BUILD_TYPE MATCHES "DEBUG")
-  add_definitions("-DDEBG")
-endif (CMAKE_BUILD_TYPE MATCHES "DEBUG")
-
-
-find_package(MPI REQUIRED)
-# Stop if there is no MPI_Fortran_Compiler
-if (MPI_Fortran_COMPILER)
-    message(STATUS "MPI_Fortran_COMPILER found: ${MPI_Fortran_COMPILER}")
-else (MPI_Fortran_COMPILER)
-    message(SEND_ERROR "This application cannot compile without MPI")
-endif(MPI_Fortran_COMPILER)
-# Warning if Include are not found => can be fixed with more recent cmake version
-if (MPI_FOUND)
-    message(STATUS "MPI FOUND: ${MPI_FOUND}")
-    include_directories(SYSTEM ${MPI_INCLUDE_PATH})
-    message(STATUS "MPI INCL ALSO FOUND: ${MPI_INCLUDE_PATH}")
-else (MPI_FOUND)
-    message(STATUS "NO MPI include have been found. The executable won't be targeted with MPI include")
-    message(STATUS "Code will compile but performaces can be compromised")
-    message(STATUS "Using a CMake vers > 3.10 should solve the problem")
-    message(STATUS "Alternatively use ccmake to manually set the include if available")
-endif (MPI_FOUND)
-
-execute_process(
-  COMMAND git describe --tag --long --always
-  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
-  OUTPUT_VARIABLE GIT_VERSION
-  OUTPUT_STRIP_TRAILING_WHITESPACE
-)
-add_definitions("-DVERSION=\"${GIT_VERSION}\"")
-option(DOUBLE_PRECISION "Build Xcompact with double precision" ON)
-if (DOUBLE_PRECISION)
-  add_definitions("-DDOUBLE_PREC")
-endif()
-
-option(SINGLE_PRECISION_OUTPUT "Build XCompact with output in single precision" OFF)
-if (SINGLE_PRECISION_OUTPUT)
-  add_definitions("-DSAVE_SINGLE")
-endif()
-
-# FFT options
-set(FFT_Choice "generic" CACHE STRING "FFT for XCompact3d project (generic is the default)")
-set_property(CACHE FFT_Choice PROPERTY STRINGS generic fftw3 mkl)
-
-# Look for ADIOS2 library
-option(USE_ADIOS2 "Build XCompact with ADIOS2 library" OFF)
-if(USE_ADIOS2)
-  #find_package(ADIOS2 COMPONENTS fortran)
-  find_package(ADIOS2)
-endif()
-if(ADIOS2_FOUND)
-  message(STATUS "ADIOS INCLUDE ${ADIOS2_INCLUDE_DIRS}")
-  include_directories(${ADIOS2_INCLUDE_DIRS})
-  add_definitions("-DADIOS2")
-else(ADIOS_FOUND)
-  message(STATUS "ADIOS2 not found")
-endif()
-
-# Create a static library for the fft
-add_subdirectory(decomp2d)
-
-# Create the Xcompact3d executable
+string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE)
+if (CMAKE_BUILD_TYPE MATCHES "NONE")
+  message (STATUS "Selected build type : None")
+elseif (CMAKE_BUILD_TYPE MATCHES "RELEASE")
+  message (STATUS "Selected build type : Release")
+elseif (CMAKE_BUILD_TYPE MATCHES "DEBUG")
+  message (STATUS "Selected build type : Debug")
+elseif (CMAKE_BUILD_TYPE MATCHES "DEV")
+  message (FATAL_ERROR "DEV build is a work in progress and currently not supported by Xcompact3d")
+else (CMAKE_BUILD_TYPE MATCHES "NONE")
+  message (FATAL_ERROR "Invalid build type, options are: NONE DEV DEBUG RELEASE.")
+endif (CMAKE_BUILD_TYPE MATCHES "NONE")
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake" "${CMAKE_SOURCE_DIR}/cmake/compilers" "${CMAKE_SOURCE_DIR}/cmake/fft")
+find_package(DECOMP2D REQUIRED)
+
+# Despite IO is handled by 2DECOMP&FFT we need to set it
+set (IO_BACKEND "mpi" CACHE STRING "Default IO backend (mpi (default) or adios2)")
+set_property(CACHE IO_BACKEND PROPERTY STRINGS mpi adios2)
+
+include(X3D_MPI)
+
+include(X3D_Compilers)
+
+# Add the sources
 add_subdirectory(src)
 
-# Create the Xcompact3d executable
-add_subdirectory(post_vtk)
-
-# Create an example dir with all input.i3d example files
-option(BUILD_TESTING "Build with tests" ON)
-set(test_dir "${PROJECT_BINARY_DIR}/Test")
-add_subdirectory(examples)
-message(STATUS "Before test main ${test_dir}")
+# Add tests
+option(BUILD_TESTING "Build with test and use only TGV case" ON)
+option(BUILD_TESTING_FULL "Build all tests" OFF)
 if (${BUILD_TESTING})
   include(CTest)
-  message(STATUS "MPI INCL ALSO FOUND: ${MPI_INCLUDE_PATH}")
-  message(STATUS "MPI EXEC: ${MPIEXEC_EXECUTABLE}")
-  # ABL
-  add_test(NAME ABL-Convective-old COMMAND ${MPIEXEC_EXECUTABLE} -n ${MPIEXEC_MAX_NUMPROCS} ${CMAKE_INSTALL_PREFIX}/bin/xcompact3d input_convective_test.i3d WORKING_DIRECTORY ${test_dir}/ABL-Convective-old)
-  add_test(NAME ABL-Neutral-old COMMAND ${MPIEXEC_EXECUTABLE} -n ${MPIEXEC_MAX_NUMPROCS} ${CMAKE_INSTALL_PREFIX}/bin/xcompact3d input_neutral_test.i3d WORKING_DIRECTORY ${test_dir}/ABL-Neutral-old)
-  add_test(NAME ABL-Stable-old COMMAND ${MPIEXEC_EXECUTABLE} -n ${MPIEXEC_MAX_NUMPROCS} ${CMAKE_INSTALL_PREFIX}/bin/xcompact3d input_stable_test.i3d WORKING_DIRECTORY ${test_dir}/ABL-Stable-old)
-  add_test(NAME ABL-Neutral COMMAND ${MPIEXEC_EXECUTABLE} -n ${MPIEXEC_MAX_NUMPROCS} ${CMAKE_INSTALL_PREFIX}/bin/xcompact3d input_neutral_test.i3d WORKING_DIRECTORY ${test_dir}/ABL-Neutral)
-  # Channel
-  add_test(NAME Channel-Flow-X COMMAND ${MPIEXEC_EXECUTABLE} -n ${MPIEXEC_MAX_NUMPROCS} ${CMAKE_INSTALL_PREFIX}/bin/xcompact3d input_test_x.i3d WORKING_DIRECTORY ${test_dir}/Channel-Flow-X)
-  add_test(NAME Channel-Flow-Z COMMAND ${MPIEXEC_EXECUTABLE} -n ${MPIEXEC_MAX_NUMPROCS} ${CMAKE_INSTALL_PREFIX}/bin/xcompact3d input_test_z.i3d WORKING_DIRECTORY ${test_dir}/Channel-Flow-Z)
-  # Cylinder
-  if(NOT ${USE_ADIOS2})
-    add_test(NAME Cylinder COMMAND ${MPIEXEC_EXECUTABLE} -n ${MPIEXEC_MAX_NUMPROCS} ${CMAKE_INSTALL_PREFIX}/bin/xcompact3d input_test.i3d WORKING_DIRECTORY ${test_dir}/Cylinder)
-    add_test(NAME CylinderMoving COMMAND ${MPIEXEC_EXECUTABLE} -n ${MPIEXEC_MAX_NUMPROCS} ${CMAKE_INSTALL_PREFIX}/bin/xcompact3d input_test_moving.i3d WORKING_DIRECTORY ${test_dir}/CylinderMoving)
-  endif()
-  # Lock excahnge
-  add_test(NAME Lock-exchange COMMAND ${MPIEXEC_EXECUTABLE} -n ${MPIEXEC_MAX_NUMPROCS} ${CMAKE_INSTALL_PREFIX}/bin/xcompact3d input_test.i3d WORKING_DIRECTORY ${test_dir}/Lock-exchange)
-  # Mixing Layer
-  add_test(NAME Mixing-layer COMMAND ${MPIEXEC_EXECUTABLE} -n ${MPIEXEC_MAX_NUMPROCS} ${CMAKE_INSTALL_PREFIX}/bin/xcompact3d input_test.i3d WORKING_DIRECTORY ${test_dir}/Mixing-layer)
-  # TGV
-  add_test(NAME TGvortex COMMAND ${MPIEXEC_EXECUTABLE} -n ${MPIEXEC_MAX_NUMPROCS} ${CMAKE_INSTALL_PREFIX}/bin/xcompact3d input_test.i3d WORKING_DIRECTORY ${test_dir}/TGV)
-  # TBL
-  add_test(NAME TBL COMMAND ${MPIEXEC_EXECUTABLE} -n ${MPIEXEC_MAX_NUMPROCS} ${CMAKE_INSTALL_PREFIX}/bin/xcompact3d input_test.i3d WORKING_DIRECTORY ${test_dir}/TBL)
-  # Wind Turbines
-  add_test(NAME Wind-Turbine COMMAND ${MPIEXEC_EXECUTABLE} -n ${MPIEXEC_MAX_NUMPROCS} ${CMAKE_INSTALL_PREFIX}/bin/xcompact3d input_test.i3d WORKING_DIRECTORY ${test_dir}/Wind-Turbine)
-endif()
+  add_subdirectory(examples)
+  add_subdirectory(test) 
+endif (${BUILD_TESTING})
+
+# Add a prettify target
+#add_custom_target(format sh ${CMAKE_SOURCE_DIR}/scripts/format.sh
+#  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
+
 
diff --git a/INSTALL.md b/INSTALL.md
new file mode 100644
index 000000000..6dcc26239
--- /dev/null
+++ b/INSTALL.md
@@ -0,0 +1,159 @@
+Xcompact3d installation documentation
+=====================================
+
+## Source Download and Compilation
+
+Xcompact3d sources can be acquired by cloning the git repository: 
+```
+$ git clone https://github.com/xcompact3d/Incompact3d
+```
+If you are behind a firewall, you may need to use the `https` protocol instead of the `git` protocol:
+```
+$ git config --global url."https://".insteadOf git@
+```
+Be sure to also configure your system to use the appropriate proxy settings, 
+e.g. by setting the `https_proxy` and `http_proxy` variables.
+
+### The compiling process
+
+The build system for Xcompact3d is based on CMake. 
+It is good practice to directly point to the 
+MPI Fortran wrapper that you would like to use to guarantee consistency 
+between Fortran compiler and MPI. 
+This can be done by setting the default Fortran environmental variable 
+```
+$ export FC=my_mpif90
+```
+To generate the build system run 
+```
+$ cmake -S $path_to_sources -B $path_to_build_directory -DOPTION1 -DOPTION2 ... 
+```
+for example 
+```
+$ cmake -S . -B build  
+```
+By defult the build system will also download 2DECOMP&FFT 
+and perform the build install using the
+Generic FFT backend. Version 2.0.3 is the default for Xcompact3d building
+and all tests are performed against this specific version.
+If the directory does not exist it will be generated and it will contain the configuration files.
+The configuration can be further
+edited by using the `ccmake` utility as
+```
+$ ccmake $path_to_build_directory
+```
+To compile the sources 
+```
+$ cmake --build $path_to_build_directory -j <nproc>
+```
+appending `-v` will display additional information about the build, such as compiler flags.
+
+After building the library can be tested. Please see the section [Testing](#testing-and-examples)
+for the available options. 
+Finally the code can be installed using 
+```
+$ cmake --install $path_to_build_directory
+```
+By default the installation directory is located under 
+```
+$ $path_to_build_directory/opt
+```
+To change the default location the `CMAKE_INSTALL_PREFIX` can be modified using 
+```
+$ cmake --build $path_to_build_directory -DCMAKE_INSTALL_PREFIX=$path_to_my_opt
+```
+or via the `ccmake` interface. 
+
+The installation is finally completed by typing 
+```
+$ cmake --install $path_to_build_directory
+```
+The installation directory `opt` will cointain:
+* The *bin* directory with the execulables **xcompact3d** for the main execution of the code;
+* The *example* directory with some examples of input *.i3d* files.
+
+### Testing
+The testing suite for the **xcompact3d** solver is composed by 14 tests as follows 
+
+1. Atmospheric Boundary layer (ABL) in neutral conditions (new set-up)
+1. Atmospheric Boundary layer (ABL) in neutral conditions (old set-up)
+1. Atmospheric Boundary layer (ABL) in convective conditions (old set-up)
+1. Atmospheric Boundary layer (ABL) in stable conditions (old set-up)
+1. Differentially heated cavity
+1. Turbulent Channel Flow with X as streamwise direction
+1. Turbulent Channel Flow with Z as streamwise direction
+1. Flow around a circular cylinder
+1. Flow around a moving circular cylinder
+1. Lock exchange
+1. Mixing Layer
+1. Turbulent Boundary Layer (TBL)
+1. Wind Turbine
+1. Taylor Green Vortex (TGV)
+
+By default only the  Taylor Green Vortex case is activated, while the full 
+testing suite needs to be enable by using the `BUILD_TESTING_FULL` flag as 
+```
+$ cmake --build $path_to_build_directory -DBUILD_TESTING_FULL=ON 
+```
+or by using `ccmake`.
+
+The tests are performed using `CTest` as  
+```
+$ ctest --test-dir $path_to_build_directory
+```
+
+Every test is performed in a dedicated working directory that is located under the following path 
+```
+$ /path/to/build/RunTests
+```
+All standard outputs from all test runs are collated under the file
+```
+$ /path/to/build/Testing/Temporary/LastTest.log
+```
+together with additional files detailing additional informations such as 
+the elapse time for the different tests and the eventual failed cases. 
+
+### Build with an already present 2DECOMP&FFT
+If different options from the default 
+(i.e. Generic FFT backend and double precision) are necessary, 
+2DECOMP&FFT needs to be pre-installed as described [here](https://github.com/2decomp-fft/2decomp-fft/blob/dev/INSTALL.md).
+Alternative available options are: 
+* FFTW or MKL for the FFT backend engine;
+* ADIOS2 instead of MPI-IO for the IO operations;
+* SINGLE precision for the build.
+
+2DECOMP&FFT installation provides CMake configuration file that can be used to find the installation directory. 
+To allow the `find_package` of `CMake` to work the following variable needs to be set as
+```
+$ export decomp2d_DIR=/path/to/2decomp/install/opt/lib/decomp2d 
+```
+Depending on the system *lib* can be *lib64*.
+
+***Note***
+Some of the alternative options for FFT and IO backends required additional input
+* For MKL FFT the location of the MKL libraires needs to be passed to the configure as 
+for the 2DECOMP&FFT installation with 
+```
+$ export MKL_DIR=${MKLROOT}/lib/cmake/mkl
+```
+
+* For ADIOS the installation directory needs to be passes to the configure as
+```
+$ cmake -S . -B ./build -DIO_BACKEND=adios2 -Dadios2_DIR=/path/to/adios2/install/lib/cmake/adios2
+```
+
+Both steps are necessary for correct linking of the target **xcompact3d** with the libraries 
+
+## Known issues
+The tests performed under `CTest` rely on the `CMake` ability to properly find the MPI executable *mpirun*. 
+The build system will try to enforce consistency between the MPI Fortran used and the MPI executable, 
+for the first iteration of the configure step. 
+In case no MPI executable is not found or correct please modify manually the `MPIEXEC_EXECUTABLE` by using 
+```
+$ cmake -S . -B build -DMPIEXEC_EXECUTABLE=/correct/path/to/mpirun
+```
+or by using 
+```
+$ ccmake $path_to_build_directory
+```
+
diff --git a/INSTALL_CMAKE.md b/INSTALL_CMAKE.md
deleted file mode 100644
index 6bdb1a7d8..000000000
--- a/INSTALL_CMAKE.md
+++ /dev/null
@@ -1,70 +0,0 @@
-XCompact3d installation documentation
-=====================================
-
-## XCompact3d installation documentation
-
-
-XCompact3d can be installed using the Makefile provided at the top of this repository or alternatively it can 
-be build using CMake. 
-Instructions on installations using the Makefile provided are given in the README file.
-This document provides more detailed instructions on the CMake building and installation together with 
-some of the knows issues. 
-
-If you cloned Xcompact3d to `/path/to/Xcompact3d/root` a simple configuration/build is performed by:
-```
-  $ export FC=mpif90
-  $ mkdir build
-  $ cd build
-  $ cmake /path/to/Xcompact3d/root
-  $ make -j n
-  $ make install 
-```
-where n is the number of tasks that you would like to use and are available on your installation system. 
-The default installation will be located under 
-
-    $ /path/to/build/opt
-
-The installation directory will cointain:
-* The *bin* directory with two execulables: **xcompaxt3d** for the main execution of the code and **xcompact3d_paraview_vtk** to convert the *.bin* files into vtk text format. The converter is useful when the default *xdmf* format is not working with Paraview
-* The *example* directory with few example of input *.i3d* files for **Xcompact3d**
-* The *lib* directory with the archive for the **decomp2d** library
-
-## CTest
-To test your installation you can also type in the terminal from your *build* directory
-
-    $ make test
-
-Four tests are performed:
-* Taylor Green Vortex (TGV)
-* Turbulent Channel Flow with x as streamwise direction
-* Turbulent Channel Flow with z as streamwise direction
-* Flow around a circular cylinder
-
-The simulations results are located under 
-
-    $ /path/to/build/Test
-
-and the standard output from the simulations is in 
-
-    $ /path/to/build/Testing/Temporary/
-
-## Known issues
-* Sometimes the *CMake* find MPI function does not properly locate the *mpiexec* for the given compiler. Please look at the output of 
-
-     $ cmake /path/to/Xcompact3d/root
-
-and make sure that the path to *mpiexec* is the correct one
-
-     $ -- MPI EXEC: /path/to/correct/mpiexec
-If the path is not correct you might have problems in running *CTest*.
-To solve the issue do the following 
-  * run *ccmake* at the root of the build directory
-
-     $ ccmake . 
-
-  * Toggle the advance mode by hitting *t*
-  * Look for the *MPIEXEC_EXECUTABLE* and set it up pointing to the correct *mpiexec*
-
-
-
-
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 96548b9f9..000000000
--- a/Makefile
+++ /dev/null
@@ -1,134 +0,0 @@
-#=======================================================================
-# Makefile for Xcompact3D
-#=======================================================================
-# Choose pre-processing options
-#   -DDOUBLE_PREC - use double-precision
-#   -DSAVE_SINGLE - Save 3D data in single-precision
-#   -DDEBG        - debuggin xcompact3d.f90
-# generate a Git version string
-GIT_VERSION := $(shell git describe --tag --long --always)
-
-DEFS = -DDOUBLE_PREC -DVERSION=\"$(GIT_VERSION)\"
-
-LCL = local# local,lad,sdu,archer
-IVER = 17# 15,16,17,18
-CMP = gcc# intel,gcc
-FFT = generic# generic,fftw3,mkl
-
-BUILD ?=
-
-#######CMP settings###########
-ifeq ($(CMP),intel)
-FC = mpiifort
-#FFLAGS = -fpp -O3 -xHost -heap-arrays -shared-intel -mcmodel=large -safe-cray-ptr -g -traceback
-FFLAGS = -fpp -O3 -xSSE4.2 -axAVX,CORE-AVX-I,CORE-AVX2 -ipo -fp-model fast=2 -mcmodel=large -safe-cray-ptr -I$(MPI_ROOT)/lib
-##debuggin test: -check all -check bounds -chintel eck uninit -gen-interfaces -warn interfaces
-else ifeq ($(CMP),gcc)
-FC = mpif90
-#FFLAGS = -O3 -funroll-loops -floop-optimize -g -Warray-bounds -fcray-pointer -x f95-cpp-input
-ifeq ($(BUILD),debug)
-FFLAGS = -cpp -g3 -Og
-FFLAGS += -ffpe-trap=invalid,zero -fcheck=bounds -fimplicit-none
-else
-FFLAGS = -cpp -O3 -funroll-loops -floop-optimize -g
-endif
-FFLAGS += -Warray-bounds -fcray-pointer -fbacktrace -ffree-line-length-none
-ifeq "$(shell expr `gfortran -dumpversion | cut -f1 -d.` \>= 10)" "1"
-FFLAGS += -fallow-argument-mismatch
-endif
-else ifeq ($(CMP),nagfor)
-FC = mpinagfor
-FFLAGS = -fpp
-else ifeq ($(CMP),cray)
-FC = ftn
-FFLAGS = -eF -g -O3 -N 1023
-endif
-
-
-MODDIR = ./mod
-DECOMPDIR = ./decomp2d
-SRCDIR = ./src
-TURBDIR = ./src
-
-### List of files for the main code
-SRCDECOMP = $(DECOMPDIR)/decomp_2d.f90 $(DECOMPDIR)/glassman.f90 $(DECOMPDIR)/fft_$(FFT).f90 $(DECOMPDIR)/io.f90
-OBJDECOMP = $(SRCDECOMP:%.f90=%.o)
-SRC = $(SRCDIR)/module_param.f90 $(SRCDIR)/variables.f90 $(SRCDIR)/poisson.f90 $(SRCDIR)/derive.f90 $(SRCDIR)/implicit.f90 $(SRCDIR)/schemes.f90 $(SRCDIR)/parameters.f90 $(SRCDIR)/*.f90
-OBJ = $(SRC:%.f90=%.o)
-SRC = $(SRCDIR)/module_param.f90 $(SRCDIR)/variables.f90 $(SRCDIR)/poisson.f90 $(SRCDIR)/ibm.f90 $(SRCDIR)/derive.f90 $(SRCDIR)/implicit.f90 $(SRCDIR)/schemes.f90 $(SRCDIR)/forces.f90 $(SRCDIR)/probes.f90 $(SRCDIR)/navier.f90 $(SRCDIR)/tools.f90 $(SRCDIR)/visu.f90 $(SRCDIR)/BC-TBL.f90 $(SRCDIR)/BC-ABL.f90 $(SRCDIR)/les_models.f90 $(SRCDIR)/BC-Lock-exchange.f90 $(SRCDIR)/time_integrators.f90 $(SRCDIR)/filters.f90 $(SRCDIR)/parameters.f90 $(SRCDIR)/BC-User.f90 $(SRCDIR)/BC-TGV.f90 $(SRCDIR)/BC-Channel-flow.f90 $(SRCDIR)/BC-Periodic-hill.f90 $(SRCDIR)/BC-Cylinder.f90 $(SRCDIR)/BC-Mixing-layer.f90 $(SRCDIR)/BC-Sandbox.f90 $(SRCDIR)/BC-Uniform.f90 $(SRCDIR)/BC-Cavity.f90 $(SRCDIR)/BC-Pipe-flow.f90 $(TURBDIR)/constants.f90 $(TURBDIR)/acl_utils.f90 $(TURBDIR)/airfoils.f90 $(TURBDIR)/dynstall.f90 $(TURBDIR)/dynstall_legacy.f90 $(TURBDIR)/acl_elem.f90 $(TURBDIR)/acl_controller.f90 $(TURBDIR)/acl_turb.f90 $(TURBDIR)/acl_out.f90 $(TURBDIR)/acl_farm_controller.f90 $(TURBDIR)/acl_model.f90 $(TURBDIR)/acl_source.f90 $(TURBDIR)/adm.f90 $(TURBDIR)/turbine.f90 $(SRCDIR)/statistics.f90 $(SRCDIR)/case.f90 $(SRCDIR)/transeq.f90 $(SRCDIR)/genepsi3d.f90 $(SRCDIR)/xcompact3d.f90
-
-
-#######FFT settings##########
-ifeq ($(FFT),fftw3)
-  #FFTW3_PATH=/usr
-  #FFTW3_PATH=/usr/lib64
-  FFTW3_PATH=/usr/local/Cellar/fftw/3.3.7_1
-  INC:=-I$(FFTW3_PATH)/include
-  LIBFFT=-L$(FFTW3_PATH) -lfftw3 -lfftw3f
-else ifeq ($(FFT),fftw3_f03)
-  FFTW3_PATH=/usr                                #ubuntu # apt install libfftw3-dev
-  #FFTW3_PATH=/usr/lib64                         #fedora # dnf install fftw fftw-devel
-  #FFTW3_PATH=/usr/local/Cellar/fftw/3.3.7_1     #macOS  # brew install fftw
-  INC:=-I$(FFTW3_PATH)/include
-  LIBFFT=-L$(FFTW3_PATH)/lib -lfftw3 -lfftw3f
-else ifeq ($(FFT),generic)
-  INC:=
-  LIBFFT=
-else ifeq ($(FFT),mkl)
-  SRCDECOMP := $(DECOMPDIR)/mkl_dfti.f90 $(SRCDECOMP)
-  LIBFFT=-Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_sequential.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -lpthread
-	INC=-I$(MKLROOT)/include
-else ifeq ($(FFT),ffte)
-  INC:=
-  LIBFFT:=-L$(FFTE_DIR)/lib -lffte
-endif
-
-#######OPTIONS settings###########
-OPT := -I$(SRCDIR) -I$(DECOMPDIR) $(FFLAGS)
-LINKOPT := $(FFLAGS)
-
-LIBIO :=
-ADIOS2DIR :=
-ifeq ($(IO),adios2)
-  ifeq ($(ADIOS2DIR),)
-    $(error Set ADIOS2DIR=/path/to/adios2/install/)
-  endif
-  OPT := -DADIOS2 $(OPT)
-  INC := $(INC) $(shell $(ADIOS2DIR)/bin/adios2-config --fortran-flags) #$(patsubst $(shell $(ADIOS2DIR)/bin/adios2-config --fortran-libs),,$(shell $(ADIOS2DIR)/bin/adios2-config -f))
-  LIBIO := $(shell $(ADIOS2DIR)/bin/adios2-config --fortran-libs)
-endif
-
-#-----------------------------------------------------------------------
-# Normally no need to change anything below
-
-all: xcompact3d
-
-xcompact3d : $(OBJDECOMP) $(OBJ)
-	$(FC) -o $@ $(LINKOPT) $(OBJDECOMP) $(OBJ) $(LIBFFT) $(LIBIO)
-
-$(OBJDECOMP):$(DECOMPDIR)%.o : $(DECOMPDIR)%.f90
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(DEFS2) $(INC) -c $<
-	mv $(@F) ${DECOMPDIR}
-	#mv *.mod ${DECOMPDIR}
-
-
-$(OBJ):$(SRCDIR)%.o : $(SRCDIR)%.f90
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(DEFS2) $(INC) -c $<
-	mv $(@F) ${SRCDIR}
-	#mv *.mod ${SRCDIR}
-
-## This %.o : %.f90 doesn't appear to be called...
-%.o : %.f90
-	$(FC) $(FFLAGS) $(DEFS) $(DEFS2) $(INC) -c $<
-
-.PHONY: clean
-
-
-clean:
-	rm -f $(DECOMPDIR)/*.o $(DECOMPDIR)/*.mod $(DECOMPDIR)/*.smod
-	rm -f $(SRCDIR)/*.o $(SRCDIR)/*.mod $(SRCDIR)/*.smod
-	rm -f *.o *.mod *.smod xcompact3d
-
-.PHONY: cleanall
-cleanall: clean
-	rm -f *~ \#*\# out/* data/* stats/* planes/* *.xdmf *.log *.out nodefile core sauve*
diff --git a/README.md b/README.md
index a600e0a6a..df298a1e1 100644
--- a/README.md
+++ b/README.md
@@ -3,10 +3,45 @@
 
 ## The Xcompact3d code
 
-Xcompact3d is a Fortran-based framework of high-order finite-difference flow solvers dedicated to the study of turbulent flows. Dedicated to Direct and Large Eddy Simulations (DNS/LES) for which the largest turbulent scales are simulated, it can combine the versatility of industrial codes with the accuracy of spectral codes. Its user-friendliness, simplicity, versatility, accuracy, scalability, portability and efficiency makes it an attractive tool for the Computational Fluid Dynamics community.
-
-XCompact3d is currently able to solve the incompressible and low-Mach number variable density Navier-Stokes equations using sixth-order compact finite-difference schemes with a spectral-like accuracy on a monobloc Cartesian mesh.  It was initially designed in France in the mid-90's for serial processors and later converted to HPC systems. It can now be used efficiently on hundreds of thousands CPU cores to investigate turbulence and heat transfer problems thanks to the open-source library 2DECOMP&FFT (a Fortran-based 2D pencil decomposition framework to support building large-scale parallel applications on distributed memory systems using MPI; the library has a Fast Fourier Transform module).
-When dealing with incompressible flows, the fractional step method used to advance the simulation in time requires to solve a Poisson equation. This equation is fully solved in spectral space via the use of relevant 3D Fast Fourier transforms (FFTs), allowing the use of any kind of boundary conditions for the velocity field. Using the concept of the modified wavenumber (to allow for operations in the spectral space to have the same accuracy as if they were performed in the physical space), the divergence free condition is ensured up to machine accuracy. The pressure field is staggered from the velocity field by half a mesh to avoid spurious oscillations created by the implicit finite-difference schemes. The modelling of a fixed or moving solid body inside the computational domain is performed with a customised Immersed Boundary Method. It is based on a direct forcing term in the Navier-Stokes equations to ensure a no-slip boundary condition at the wall of the solid body while imposing non-zero velocities inside the solid body to avoid discontinuities on the velocity field. This customised IBM, fully compatible with the 2D domain decomposition and with a possible mesh refinement at the wall, is based on a 1D expansion of the velocity field from fluid regions into solid regions using Lagrange polynomials or spline reconstructions. In order to reach high velocities in a context of LES, it is possible to customise the coefficients of the second derivative schemes (used for the viscous term) to add extra numerical dissipation in the simulation as a substitute of the missing dissipation from the small turbulent scales that are not resolved. 
+Xcompact3d is a Fortran-based framework of high-order finite-difference flow solvers 
+dedicated to the study of turbulent flows using high fidelity modelling such as 
+Direct and Large Eddy Simulations (DNS/LES), for which the largest turbulent scales are simulated.
+Xcompact3d can combine the versatility of industrial codes with the accuracy of spectral codes by using 
+the Immersed Boundary Method (IBM) to simulate comples geometries, while retaining high order accuracy. 
+Its user-friendliness, simplicity, versatility, accuracy, scalability, portability and efficiency 
+makes it an attractive tool for the Computational Fluid Dynamics community.
+
+Xcompact3d is currently able to solve the incompressible and low-Mach number variable density 
+Navier-Stokes equations up to a sixth-order accuracy using compact finite-difference schemes 
+with a spectral-like accuracy on a monobloc Cartesian mesh.  
+It was initially designed in France in the mid-90's for serial processors and later ported to HPC systems. 
+It can now be used efficiently on hundreds of thousands CPU cores to investigate turbulence 
+and heat transfer problems thanks to the open-source library 2DECOMP&FFT, 
+which is a Fortran-based 2D pencil/1D slabs decomposition framework to support building 
+large-scale parallel applications on distributed memory systems using MPI. 
+The library has a distributed Fast Fourier Transform module as well as I/O capabilities.
+
+Fractional time stepping is used for the time advancement, solving a Poisson equation to enforce the incompressible condition. 
+The Poisson equation is fully solved in spectral space via the use of relevant 3D Fast Fourier transforms (FFTs),
+allowing the use of any kind of boundary conditions for the velocity field. 
+Using the concept of the modified wavenumber (to allow for operations in the spectral space 
+to have the same accuracy as if they were performed in the physical space), 
+the divergence free condition is ensured up to machine accuracy. 
+The pressure field is staggered from the velocity field by half a mesh point 
+to avoid spurious oscillations created by the implicit finite-difference schemes. 
+The modelling of a fixed or moving solid body inside the computational domain is performed 
+with a customised Immersed Boundary Method. 
+It is based on a direct forcing term in the Navier-Stokes equations to ensure a no-slip boundary condition 
+at the wall of the solid body while imposing non-zero velocities inside the solid body 
+to avoid discontinuities on the velocity field. 
+This customised IBM, fully compatible with the 2D domain decomposition 
+and with a possible mesh refinement at the wall, 
+is based on a 1D expansion of the velocity field from fluid regions into solid regions 
+using Lagrange polynomials or spline reconstructions. 
+In order to reach high Reynolds numbers in a context of LES, 
+it is possible to customise the coefficients of the second derivative schemes (used for the viscous term) 
+to add extra numerical dissipation in the simulation as a substitute of the missing dissipation 
+from the small turbulent scales that are not resolved. 
 
 
 ### External Resources
@@ -15,54 +50,19 @@ When dealing with incompressible flows, the fractional step method used to advan
 
 ## Source Download and Compilation
 
-First, make sure you have all the [required dependencies](#required-build-tools-and-external-libraries) installed.
-Then, acquire the source code by cloning the git repository:
-
-    git clone https://github.com/xcompact3d/Incompact3d
-
-(If you are behind a firewall, you may need to use the `https` protocol instead of the `git` protocol:
-
-    git config --global url."https://".insteadOf git@
-
-Be sure to also configure your system to use the appropriate proxy settings, e.g. by setting the `https_proxy` and `http_proxy` variables.)
-
-By default you will be building the latest unstable version of Incompact3d. However, most users should use the most recent stable version of Incompact3d, which is currently the `4.0` series of releases. You can get this version by changing to the Incompact3d directory and running
-
-    git checkout v4.0
-
-Now run `make` to build the `Incompact3d` executable. To perform a parallel build, use `make -j N` and supply the maximum number of concurrent processes. (See [Platform Specific Build Notes] for details.)
-This takes a while, but only has to be done once. If the defaults in the build do not work for you, and you need to set specific make parameters, you can save them in `Make.user`. The build will automatically check for the existence of `Makefile` and use it if it exists.
-Building Incompact3d requires very little of disk space and virtual memory.
+The current V5 release of the code is using only `CMake` for building and installing. 
+Moreover, the 2DECOMP&FFT library is now distributed independently and can be downloaded from 
+[this repository](http2s://github.com/2decomp-fft/2decomp-fft). 
+Please have a look at [INSTALL.md](INSTALL.md) for the instructions on how to download, build and install 
+the code.
+If you want to keep using the previous version V4.1 of the code with Make for the buding system and V1.4 for 
+2DECOMP&FFT you can find the archived sources at this  [page](https://github.com/xcompact3d/Incompact3d/releases/tag/V4.1) or alternatevely
+```
+$ git clone --branch v4.1 git@github.com:xcompact3d/Incompact3d.git 
+```
 
-**Note:** The compiling process
-
-In the Incompact3d, once you have selected the correct options for your Fortran compiler, you just need to do
-
-    make clean
-    
-to make sure that you will be compiling all the files, and then
-
-    make 
-   
-Once it is built, you just need to go in one of the examples directories, for instance https://github.com/xcompact3d/Incompact3d/tree/master/examples/Taylor-Green-Vortex and from there use the input.i3d file to configure your simulation. To get to know the code, you can start with a ready-to-run input file, see as an example https://github.com/xcompact3d/Incompact3d/blob/master/examples/Taylor-Green-Vortex/input_DNS_Re1600.i3d which can be use to run the Taylor-Green case in a DNS set-up at Re=1600. Using 16 CPU cores, this simulation should last less than 5 minutes. The command to launch the simulation is
-    
-    mpirun -np 16 ../../xcompact3d 
-
-or
-
-    nohup mpirun -np 16 ../../xcompact3d > output.out &
 
 ### Releases
 
 Releases are available via git tags, also on the main Github, or via the [Zenodo DOI](https://zenodo.org/badge/latestdoi/127266756) (see top of README page on Github).
 
-### Optional ADIOS2 I/O backend
-
-As part of the ARCHER2 eCSE0302 project, an optional I/O backend using ADIOS2 has been added to the 2DECOMP&FFT library distributed with Xcompact3d.
-This is enabled at compile time, by default the original MPIIO backend will be used, to enable the ADIOS2 backend build as
-
-    make clean
-    make IO=adios2 ADIOS2DIR=${ADIOS2_DIR}
-    
-where `${ADIOS2_DIR}` points to the install location of ADIOS2.
-ADIOS2 enables configuring the I/O behaviour at runtime using an `xml` configuration file - see the example at `examples/Taylor-Green-Vortex/adios2_config.xml`.
diff --git a/cmake/FindDECOMP2D.cmake b/cmake/FindDECOMP2D.cmake
new file mode 100644
index 000000000..4801c645e
--- /dev/null
+++ b/cmake/FindDECOMP2D.cmake
@@ -0,0 +1,30 @@
+# - Find the 2decomp-fft library
+find_package(decomp2d
+             PATHS ${CMAKE_SOURCE_DIR}/decomp2d/build)
+if (decomp2d_FOUND)
+  message(STATUS "2decomp-fft FOUND")
+else(decomp2d_FOUND)
+  message(STATUS "2decomp-fft PATH not available we'll try to download and install")
+  configure_file(${CMAKE_SOURCE_DIR}/cmake/decomp2d/downloadBuild2decomp.cmake.in decomp2d-build/CMakeLists.txt)
+  #message("Second CMAKE_GENERATOR ${CMAKE_GENERATOR}") 
+  execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
+          RESULT_VARIABLE result
+          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/decomp2d-build )
+  if(result)
+      message(FATAL_ERROR "CMake step for 2decomp-fft failed: ${result}")
+  else()
+      message("CMake step for 2decomp-fft completed (${result}).")
+  endif()
+  execute_process(COMMAND ${CMAKE_COMMAND} --build .
+         RESULT_VARIABLE result
+          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/decomp2d-build )
+  if(result)
+      message(FATAL_ERROR "Build step for 2decomp-fft failed: ${result}")
+  endif()
+  set(D2D_ROOT ${CMAKE_CURRENT_BINARY_DIR}/decomp2d-build/downloadBuild2decomp-prefix/src/downloadBuild2decomp-build)
+  find_package(decomp2d REQUIRED
+	  PATHS ${D2D_ROOT})
+endif(decomp2d_FOUND)
+
+
+
diff --git a/cmake/X3D_Compilers.cmake b/cmake/X3D_Compilers.cmake
new file mode 100644
index 000000000..6a6420cdf
--- /dev/null
+++ b/cmake/X3D_Compilers.cmake
@@ -0,0 +1,90 @@
+# Compilers CMakeLists
+
+set(Fortran_COMPILER_NAME ${CMAKE_Fortran_COMPILER_ID} )
+message(STATUS "COMP ID ${Fortran_COMPILER_NAME}")
+message(STATUS "Fortran compiler name ${Fortran_COMPILER_NAME}")
+message(STATUS "Fortran compiler version ${CMAKE_Fortran_COMPILER_VERSION}")
+
+
+
+if (Fortran_COMPILER_NAME MATCHES "GNU")
+  # gfortran
+  message(STATUS "Setting gfortran flags")
+  include(X3D_flags_gnu)
+elseif (Fortran_COMPILER_NAME MATCHES "Intel")
+  message(STATUS "Setting ifort flags")
+  include(X3D_flags_intel)
+elseif (Fortran_COMPILER_NAME MATCHES "Cray")
+  message(STATUS "Setting cray fortran flags")
+  include(X3D_flags_cray)
+elseif (Fortran_COMPILER_NAME MATCHES "NVHPC")
+  message(STATUS "Setting NVHPC fortran flags")
+  include(X3D_flags_nvidia)
+#   set(CMAKE_Fortran_FLAGS "-cpp -std=f2008" CACHE STRING
+elseif (Fortran_COMPILER_NAME MATCHES "Fujitsu")
+  message(STATUS "Setting Fujitsu fortran flags")
+  include(X3D_flags_fujitsu)
+else (Fortran_COMPILER_NAME MATCHES "GNU")
+  message ("CMAKE_Fortran_COMPILER full path: " ${CMAKE_Fortran_COMPILER})
+  message ("Fortran compiler: " ${Fortran_COMPILER_NAME})
+  message ("No optimized Fortran compiler flags are known, we just try -O2...")
+  set(X3D_FFLAGS_RELEASE "-O2")
+  set(X3D_FFLAGS_DEBUG   "-O0 -g")
+endif (Fortran_COMPILER_NAME MATCHES "GNU")
+
+if (NOT FLAGS_SET)
+  set(CMAKE_Fortran_FLAGS ${X3D_FFLAGS} CACHE STRING 
+	"Base FFLAGS for build" FORCE)
+  set(CMAKE_Fortran_FLAGS_RELEASE ${X3D_FFLAGS_RELEASE} CACHE STRING
+  	"Additional FFLAGS for Release (optimised) build" FORCE)
+  set(CMAKE_Fortran_FLAGS_DEBUG ${X3D_FFLAGS_DEBUG} CACHE STRING
+  	"Additional FFLAGS for Debug build" FORCE)
+  set(CMAKE_Fortran_FLAGS_DEV ${X3D_FFLAGS_DEV} CACHE STRING
+  	"Additional FFLAGS for Dev build" FORCE)
+  set(FLAGS_SET 1 CACHE INTERNAL "Flags are set")
+endif()
+
+if (CMAKE_BUILD_TYPE MATCHES "DEBUG")
+  add_definitions("-DDEBUG")
+endif (CMAKE_BUILD_TYPE MATCHES "DEBUG")
+
+if (CMAKE_BUILD_TYPE MATCHES "DEV")
+  message(FATAL_ERROR "The code is not ready for DEV builds")
+endif (CMAKE_BUILD_TYPE MATCHES "DEV")
+
+if (ENABLE_INPLACE)
+  add_definitions("-DOVERWRITE")
+endif ()
+
+execute_process(
+  COMMAND git describe --tag --long --always
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  OUTPUT_VARIABLE GIT_VERSION
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+add_definitions("-DVERSION=\"${GIT_VERSION}\"")
+option(DOUBLE_PRECISION "Build Xcompact with double precision" ON)
+if (DOUBLE_PRECISION)
+  add_definitions("-DDOUBLE_PREC")
+endif()
+
+option(SINGLE_PRECISION_OUTPUT "Build XCompact with output in single precision" OFF)
+if (SINGLE_PRECISION_OUTPUT)
+  add_definitions("-DSAVE_SINGLE")
+endif()
+
+
+if (IO_BACKEND MATCHES "mpi")
+  message(STATUS "Using mpi (default) IO backend")
+elseif (IO_BACKEND MATCHES "adios2")
+  message(STATUS "Using ADIOS2 IO backend")
+  find_package(adios2 REQUIRED)
+  if (NOT ADIOS2_HAVE_MPI)
+    message(FATAL_ERROR "MPI support is missing in the provided ADIOS2 build")
+  endif (NOT ADIOS2_HAVE_MPI)
+  if (NOT ADIOS2_HAVE_Fortran)
+    message(FATAL_ERROR "Fortran support is missing in the provided ADIOS2 build")
+  endif (NOT ADIOS2_HAVE_Fortran)
+else (IO_BACKEND MATCHES "mpi")
+  message(FATAL_ERROR "Invalid value for CMake variable IO_BACKEND")
+endif (IO_BACKEND MATCHES "mpi")
diff --git a/cmake/X3D_MPI.cmake b/cmake/X3D_MPI.cmake
new file mode 100644
index 000000000..194d53210
--- /dev/null
+++ b/cmake/X3D_MPI.cmake
@@ -0,0 +1,48 @@
+# MPI CMakeLists
+
+find_package(MPI REQUIRED)
+# Stop if there is no MPI_Fortran_Compiler
+if (MPI_Fortran_COMPILER)
+    message(STATUS "X3D MPI_Fortran_COMPILER found: ${MPI_Fortran_COMPILER}")
+    message(STATUS "X3D MPI_VERSION found: ${MPI_VERSION}")
+    # Try to guess the MPI type to adapt compilation flags if necessary
+    string(FIND "${MPI_Fortran_COMPILER}" "mpich" pos)
+    if(pos GREATER_EQUAL "0")
+      set(FIND_MPICH TRUE)
+      message(STATUS "X3D MPI is MPICH type")
+    endif()
+    string(FIND "${MPI_Fortran_COMPILER}" "openmpi" pos)
+    if(pos GREATER_EQUAL "0")
+      set(FIND_OMPI TRUE)
+      message(STATUS "X3D MPI is openMPI type")
+    endif()
+else (MPI_Fortran_COMPILER)
+    message(SEND_ERROR "This application cannot compile without MPI")
+endif(MPI_Fortran_COMPILER)
+# Warning if Include are not found => can be fixed with more recent cmake version
+if (MPI_FOUND)
+  message(STATUS "X3D MPI FOUND: ${MPI_FOUND}")
+  include_directories(SYSTEM ${MPI_INCLUDE_PATH})
+  message(STATUS "X3D MPI INCL ALSO FOUND: ${MPI_INCLUDE_PATH}")
+  # Force the mpirun to be coherent with the mpifortran
+  # We do this only at first iteration of configure after user will take charge
+  # This is the same workaround used for 2DECOMP&FFT -> it is necessary to have a new
+  # variable name for the PATH_TO_MPIRUN to differentiate when 2D&FFT is build autonatically via CMake
+  if (NOT X3D_PATH_TO_MPIRUN)
+    string(REGEX REPLACE "mpif90" "mpirun" X3D_PATH_TO_MPIRUN "${MPI_Fortran_COMPILER}")
+    string(REPLACE "mpif90" "mpirun" X3D_PATH_TO_MPIRUN "${X3D_PATH_TO_MPIRUN}")
+    string(REPLACE "mpiifort" "mpirun" X3D_PATH_TO_MPIRUN "${X3D_PATH_TO_MPIRUN}")
+    string(REPLACE "mpiifx" "mpirun" X3D_PATH_TO_MPIRUN "${X3D_PATH_TO_MPIRUN}")
+    message(STATUS "X3D Path to mpirun ${X3D_PATH_TO_MPIRUN}")
+    set(MPIEXEC_EXECUTABLE "${X3D_PATH_TO_MPIRUN}" CACHE STRING
+        "Force MPIRUN to be consistent with MPI_Fortran_COMPILER" FORCE)
+    # Save this variable to avoid repeting the substitution
+    set(X3D_PATH_TO_MPIRUN "${X3D_PATH_TO_MPIRUN}" CACHE STRING
+	"Force MPIRUN to be consistent with MPI_Fortran_COMPILER" FORCE)
+  endif (NOT X3D_PATH_TO_MPIRUN)
+else (MPI_FOUND)
+  message(STATUS "NO MPI include have been found. The executable won't be targeted with MPI include")
+  message(STATUS "Code will compile but performaces can be compromised")
+  message(STATUS "Alternatively use ccmake to manually set the include if available")
+endif (MPI_FOUND)
+
diff --git a/cmake/compilers/X3D_flags_cray.cmake b/cmake/compilers/X3D_flags_cray.cmake
new file mode 100644
index 000000000..2457d84b0
--- /dev/null
+++ b/cmake/compilers/X3D_flags_cray.cmake
@@ -0,0 +1,6 @@
+#Compilers Flags for Cray
+
+set(X3D_FFLAGS "-eF -g -N 1023")
+set(X3D_FFLAGS_RELEASE "-O3")
+set(X3D_FFLAGS_DEBUG   "-O0 -g")
+set(X3D_FFLAGS_DEV     "${X3D_FFLAGS_DEBUG}")
diff --git a/cmake/compilers/X3D_flags_fujitsu.cmake b/cmake/compilers/X3D_flags_fujitsu.cmake
new file mode 100644
index 000000000..13fafb7d7
--- /dev/null
+++ b/cmake/compilers/X3D_flags_fujitsu.cmake
@@ -0,0 +1,6 @@
+# Compilers flags for Fujitsu
+
+set(X3D_FFLAGS "-Cpp")
+set(X3D_FFLAGS_RELEASE "-O3")
+set(X3D_FFLAGS_DEBUG   "-O0")
+set(X3D_FFLAGS_DEV     "${X3D_FFLAGS_DEBUG}")
diff --git a/cmake/compilers/X3D_flags_gnu.cmake b/cmake/compilers/X3D_flags_gnu.cmake
new file mode 100644
index 000000000..fe5ee3820
--- /dev/null
+++ b/cmake/compilers/X3D_flags_gnu.cmake
@@ -0,0 +1,22 @@
+# Flags for GNU compiler
+#
+set(X3D_FFLAGS "-cpp -ffree-line-length-none -Warray-bounds -fcray-pointer -fbacktrace")
+if (CMAKE_Fortran_COMPILER_VERSION GREATER_EQUAL "10")
+  message(STATUS "Set New Fortran basic flags")
+  set(X3D_FFLAGS "${X3D_FFLAGS} -fallow-argument-mismatch")
+  set(X3D_GNU10 TRUE)
+else (CMAKE_Fortran_COMPILER_VERSION GREATER_EQUAL "10")
+  set(X3D_GNU10 FALSE)
+endif (CMAKE_Fortran_COMPILER_VERSION GREATER_EQUAL "10")
+# Flags extracted from the Makefile
+set(X3D_FFLAGS_RELEASE "-O3 -funroll-loops -floop-optimize -g") #"-O3 -funroll-loops -floop-optimize -march=native")
+set(X3D_FFLAGS_DEBUG "-g3 -Og -ffpe-trap=invalid,zero -fcheck=bounds -fimplicit-none") #"-DDEBUG -g3 -Og -ffpe-trap=invalid,zero -fcheck=all -fimplicit-none")
+# Dev flag below is new and not working yet
+if (FIND_MPICH AND X3D_GNU10)
+  set(X3D_FFLAGS_DEV     "${X3D_FFLAGS_DEBUG} -Wall -Wno-unused-function -Wno-integer-division")
+else()
+  set(X3D_FFLAGS_DEV     "${X3D_FFLAGS_DEBUG} -Wall -Wpedantic -Wno-unused-function -Werror -Wno-integer-division")
+endif()
+if (FIND_OMPI)
+  set(X3D_FFLAGS_DEV     "${X3D_FFLAGS_DEV} -Wimplicit-procedure -Wimplicit-interface")
+endif()
diff --git a/cmake/compilers/X3D_flags_intel.cmake b/cmake/compilers/X3D_flags_intel.cmake
new file mode 100644
index 000000000..60cd1c7a0
--- /dev/null
+++ b/cmake/compilers/X3D_flags_intel.cmake
@@ -0,0 +1,11 @@
+# Compilers Flags for Intel
+
+set(X3D_FFLAGS "-fpp -xHost -heaparrays -safe-cray-ptr -g -traceback")
+set(X3D_FFLAGS_RELEASE "-O3 -ipo")
+set(X3D_FFLAGS_DEBUG   "-g -O0 -debug extended -traceback -DDEBUG")
+set(X3D_FFLAGS_DEV     "${X3D_FFLAGS_DEBUG} -warn all,noexternal")
+#set(CMAKE_Fortran_FLAGS "-cpp xSSE4.2 -axAVX,CORE-AVX-I,CORE-AVX2 -ipo -fp-model fast=2 -mcmodel=large -safe-cray-ptr")
+# 
+set(MKL_INTERFACE "lp64")
+set(MKL_THREADING "sequential")
+find_package(MKL CONFIG REQUIRED)
diff --git a/cmake/compilers/X3D_flags_nvidia.cmake b/cmake/compilers/X3D_flags_nvidia.cmake
new file mode 100644
index 000000000..fa82e6c1e
--- /dev/null
+++ b/cmake/compilers/X3D_flags_nvidia.cmake
@@ -0,0 +1,7 @@
+#Compilers Flags for NVIDIA
+
+set(X3D_FFLAGS "-cpp -Mfree -Kieee")
+set(X3D_FFLAGS_RELEASE "-O3 -fast -march=native")
+set(X3D_FFLAGS_DEBUG   "-O0 -g -traceback -Mbounds -Mchkptr -Ktrap=fp")
+set(X3D_FFLAGS_DEV     "${X3D_FFLAGS_DEBUG}")
+
diff --git a/cmake/decomp2d/downloadBuild2decomp.cmake.in b/cmake/decomp2d/downloadBuild2decomp.cmake.in
new file mode 100644
index 000000000..5af4e115a
--- /dev/null
+++ b/cmake/decomp2d/downloadBuild2decomp.cmake.in
@@ -0,0 +1,26 @@
+# downloadBuild2decomp.cmake.in
+#
+cmake_minimum_required(VERSION 3.0.2)
+
+project(downloadBuild2decomp NONE)
+
+include(ExternalProject)
+
+ExternalProject_Add(downloadBuild2decomp
+    GIT_REPOSITORY    "https://github.com/xcompact3d/2decomp-fft"
+    GIT_TAG           "v2.0.3"
+    SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/decomp2d-src"
+    INSTALL_DIR       "${CMAKE_CURRENT_BINARY_DIR}/decomp2d-opt"
+    TEST_COMMAND      ""
+)
+
+#ExternalProject_Add(downloadBuild2decomp
+#    GIT_REPOSITORY    "https://github.com/xcompact3d/2decomp-fft"
+#    GIT_TAG           "main"
+#    CONFIGURE_COMMAND "cmake -S ${CMAKE_CURRENT_BINARY_DIR}/decomp2d-src "
+#    BUILD_COMMAND     ""
+#    INSTALL_COMMAND   ""
+#    TEST_COMMAND      ""
+#    SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/decomp2d-src"
+#    BINARY_DIR        ""
+#    INSTALL_DIR       ""
diff --git a/decomp2d/CMakeLists.txt b/decomp2d/CMakeLists.txt
deleted file mode 100644
index ef9bb9d6b..000000000
--- a/decomp2d/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-file(GLOB files_decomp decomp_2d.f90
-                       glassman.f90
-		       io.f90)
-include_directories(${CMAKE_SOURCE_DIR}/decomp2d)
-if(${FFT_Choice} MATCHES "generic")
-  file(GLOB files_fft fft_generic.f90)
-endif(${FFT_Choice} MATCHES "generic")
-
-set(SRCFILES ${files_decomp} ${files_fft})
-
-add_library(decomp2d STATIC ${SRCFILES})
-if (MPI_FOUND)
-  target_link_libraries(decomp2d PRIVATE MPI::MPI_Fortran)
-endif (MPI_FOUND) 
-if(ADIOS2_FOUND)
-  target_link_libraries(decomp2d PRIVATE adios2::fortran_mpi)
-endif()
-
-install(TARGETS decomp2d
-        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} 
-)
diff --git a/decomp2d/LICENSE b/decomp2d/LICENSE
deleted file mode 100644
index eae989e3b..000000000
--- a/decomp2d/LICENSE
+++ /dev/null
@@ -1,22 +0,0 @@
-Copyright (c) 2011, The Numerical Algorithms Group (NAG) All rights reserved.
-Copyright (c) 2021, The Univeristy of Edinburgh (UoE)
-
-Redistribution and use in source and binary forms, with or without modification, are permitted
-provided that the following conditions are met:
-
-    Redistributions of source code must retain the above copyright notice, this list of conditions
-    and the following disclaimer.
-    Redistributions in binary form must reproduce the above copyright notice, this list of
-    conditions and the following disclaimer in the documentation and/or other materials provided
-    with the distribution.
-    Neither the name of the copyright owner nor the names of its contributors may be used to endorse
-    or promote products derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
-FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
-IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/decomp2d/alloc.inc b/decomp2d/alloc.inc
deleted file mode 100644
index c01561750..000000000
--- a/decomp2d/alloc.inc
+++ /dev/null
@@ -1,277 +0,0 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2012 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-! Utility routine to help allocate 3D arrays
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-! X-pencil real arrays
-subroutine alloc_x_real(var, opt_decomp, opt_global)
-
-implicit none
-
-real(mytype), allocatable, dimension(:,:,:) :: var
-TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-logical, intent(IN), optional :: opt_global
-
-TYPE(DECOMP_INFO) :: decomp
-logical :: global
-integer :: alloc_stat, errorcode
-
-if (present(opt_decomp)) then
-decomp = opt_decomp
-else
-decomp = decomp_main
-end if
-
-if (present(opt_global)) then
-global = opt_global
-else
-global = .false.
-end if
-
-if (global) then
-allocate(var(decomp%xst(1):decomp%xen(1), &
-decomp%xst(2):decomp%xen(2), decomp%xst(3):decomp%xen(3)), &
-stat=alloc_stat)
-else
-allocate(var(decomp%xsz(1),decomp%xsz(2),decomp%xsz(3)), &
-stat=alloc_stat)
-end if
-
-if (alloc_stat /= 0) then
-errorcode = 8
-call decomp_2d_abort(errorcode, &
-'Memory allocation failed when creating new arrays')
-end if
-
-return
-end subroutine alloc_x_real
-
-
-! X-pencil complex arrays
-subroutine alloc_x_complex(var, opt_decomp, opt_global)
-
-implicit none
-
-complex(mytype), allocatable, dimension(:,:,:) :: var
-TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-logical, intent(IN), optional :: opt_global
-
-TYPE(DECOMP_INFO) :: decomp
-logical :: global
-integer :: alloc_stat, errorcode
-
-if (present(opt_decomp)) then
-decomp = opt_decomp
-else
-decomp = decomp_main
-end if
-
-if (present(opt_global)) then
-global = opt_global
-else
-global = .false.
-end if
-
-if (global) then
-allocate(var(decomp%xst(1):decomp%xen(1), &
-decomp%xst(2):decomp%xen(2), decomp%xst(3):decomp%xen(3)), &
-stat=alloc_stat)
-else
-allocate(var(decomp%xsz(1),decomp%xsz(2),decomp%xsz(3)), &
-stat=alloc_stat)
-end if
-
-if (alloc_stat /= 0) then
-errorcode = 8
-call decomp_2d_abort(errorcode, &
-'Memory allocation failed when creating new arrays')
-end if
-
-return
-end subroutine alloc_x_complex
-
-
-! Y-pencil real arrays
-subroutine alloc_y_real(var, opt_decomp, opt_global)
-
-implicit none
-
-real(mytype), allocatable, dimension(:,:,:) :: var
-TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-logical, intent(IN), optional :: opt_global
-
-TYPE(DECOMP_INFO) :: decomp
-logical :: global
-integer :: alloc_stat, errorcode
-
-if (present(opt_decomp)) then
-decomp = opt_decomp
-else
-decomp = decomp_main
-end if
-
-if (present(opt_global)) then
-global = opt_global
-else
-global = .false.
-end if
-
-if (global) then
-allocate(var(decomp%yst(1):decomp%yen(1), &
-decomp%yst(2):decomp%yen(2), decomp%yst(3):decomp%yen(3)), &
-stat=alloc_stat)
-else
-allocate(var(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3)), &
-stat=alloc_stat)
-end if
-
-if (alloc_stat /= 0) then
-errorcode = 8
-call decomp_2d_abort(errorcode, &
-'Memory allocation failed when creating new arrays')
-end if
-
-return
-end subroutine alloc_y_real
-
-
-! Y-pencil complex arrays
-subroutine alloc_y_complex(var, opt_decomp, opt_global)
-
-implicit none
-
-complex(mytype), allocatable, dimension(:,:,:) :: var
-TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-logical, intent(IN), optional :: opt_global
-
-TYPE(DECOMP_INFO) :: decomp
-logical :: global
-integer :: alloc_stat, errorcode
-
-if (present(opt_decomp)) then
-decomp = opt_decomp
-else
-decomp = decomp_main
-end if
-
-if (present(opt_global)) then
-global = opt_global
-else
-global = .false.
-end if
-
-if (global) then
-allocate(var(decomp%yst(1):decomp%yen(1), &
-decomp%yst(2):decomp%yen(2), decomp%yst(3):decomp%yen(3)), &
-stat=alloc_stat)
-else
-allocate(var(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3)), &
-stat=alloc_stat)
-end if
-
-if (alloc_stat /= 0) then
-errorcode = 8
-call decomp_2d_abort(errorcode, &
-'Memory allocation failed when creating new arrays')
-end if
-
-return
-end subroutine alloc_y_complex
-
-
-! Z-pencil real arrays
-subroutine alloc_z_real(var, opt_decomp, opt_global)
-
-implicit none
-
-real(mytype), allocatable, dimension(:,:,:) :: var
-TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-logical, intent(IN), optional :: opt_global
-
-TYPE(DECOMP_INFO) :: decomp
-logical :: global
-integer :: alloc_stat, errorcode
-
-if (present(opt_decomp)) then
-decomp = opt_decomp
-else
-decomp = decomp_main
-end if
-
-if (present(opt_global)) then
-global = opt_global
-else
-global = .false.
-end if
-
-if (global) then
-allocate(var(decomp%zst(1):decomp%zen(1), &
-decomp%zst(2):decomp%zen(2), decomp%zst(3):decomp%zen(3)), &
-stat=alloc_stat)
-else
-allocate(var(decomp%zsz(1),decomp%zsz(2),decomp%zsz(3)), &
-stat=alloc_stat)
-end if
-
-if (alloc_stat /= 0) then
-errorcode = 8
-call decomp_2d_abort(errorcode, &
-'Memory allocation failed when creating new arrays')
-end if
-
-return
-end subroutine alloc_z_real
-
-
-! Z-pencil complex arrays
-subroutine alloc_z_complex(var, opt_decomp, opt_global)
-
-implicit none
-
-complex(mytype), allocatable, dimension(:,:,:) :: var
-TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-logical, intent(IN), optional :: opt_global
-
-TYPE(DECOMP_INFO) :: decomp
-logical :: global
-integer :: alloc_stat, errorcode
-
-if (present(opt_decomp)) then
-decomp = opt_decomp
-else
-decomp = decomp_main
-end if
-
-if (present(opt_global)) then
-global = opt_global
-else
-global = .false.
-end if
-
-if (global) then
-allocate(var(decomp%zst(1):decomp%zen(1), &
-decomp%zst(2):decomp%zen(2), decomp%zst(3):decomp%zen(3)), &
-stat=alloc_stat)
-else
-allocate(var(decomp%zsz(1),decomp%zsz(2),decomp%zsz(3)), &
-stat=alloc_stat)
-end if
-
-if (alloc_stat /= 0) then
-errorcode = 8
-call decomp_2d_abort(errorcode, &
-'Memory allocation failed when creating new arrays')
-end if
-
-return
-end subroutine alloc_z_complex
diff --git a/decomp2d/decomp_2d.f90 b/decomp2d/decomp_2d.f90
deleted file mode 100644
index aa459801a..000000000
--- a/decomp2d/decomp_2d.f90
+++ /dev/null
@@ -1,1659 +0,0 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2012 Ning Li, the Numerical Algorithms Group (NAG)
-! Copyright (C) 2021               the University of Edinburgh (UoE)
-!
-!=======================================================================
-
-! This is the main 2D pencil decomposition module
-
-module decomp_2d
-
-  use MPI
-  
-  implicit none
-
-  private        ! Make everything private unless declared public
-
-#ifdef DOUBLE_PREC
-  integer, parameter, public :: mytype = KIND(0.0D0)
-  integer, parameter, public :: real_type = MPI_DOUBLE_PRECISION
-  integer, parameter, public :: real2_type = MPI_2DOUBLE_PRECISION
-  integer, parameter, public :: complex_type = MPI_DOUBLE_COMPLEX
-#ifdef SAVE_SINGLE
-  integer, parameter, public :: mytype_single = KIND(0.0)
-  integer, parameter, public :: real_type_single = MPI_REAL
-#else
-  integer, parameter, public :: mytype_single = KIND(0.0D0)
-  integer, parameter, public :: real_type_single = MPI_DOUBLE_PRECISION
-#endif
-#else
-  integer, parameter, public :: mytype = KIND(0.0)
-  integer, parameter, public :: real_type = MPI_REAL
-  integer, parameter, public :: real2_type = MPI_2REAL
-  integer, parameter, public :: complex_type = MPI_COMPLEX
-  integer, parameter, public :: mytype_single = KIND(0.0)
-  integer, parameter, public :: real_type_single = MPI_REAL
-#endif
-
-  integer, save, public :: mytype_bytes
-
-  ! some key global variables
-  integer, save, public :: nx_global, ny_global, nz_global  ! global size
-
-  integer, save, public :: nrank  ! local MPI rank 
-  integer, save, public :: nproc  ! total number of processors
-
-  ! parameters for 2D Cartesian topology 
-  integer, save, dimension(2) :: dims, coord
-  logical, save, dimension(2) :: periodic
-  integer, save, public :: DECOMP_2D_COMM_CART_X, &
-       DECOMP_2D_COMM_CART_Y, DECOMP_2D_COMM_CART_Z 
-  integer, save :: DECOMP_2D_COMM_ROW, DECOMP_2D_COMM_COL
-
-  ! define neighboring blocks (to be used in halo-cell support)
-  !  first dimension 1=X-pencil, 2=Y-pencil, 3=Z-pencil
-  ! second dimension 1=east, 2=west, 3=north, 4=south, 5=top, 6=bottom 
-  integer, save, dimension(3,6) :: neighbour 
-
-  ! flags for periodic condition in three dimensions
-  logical, save :: periodic_x, periodic_y, periodic_z
-
-#ifdef SHM
-  ! derived type to store shared-memory info
-  TYPE, public :: SMP_INFO
-     integer MPI_COMM          ! SMP associated with this communicator
-     integer NODE_ME           ! rank in this communicator
-     integer NCPU              ! size of this communicator
-     integer SMP_COMM          ! communicator for SMP-node masters
-     integer CORE_COMM         ! communicator for cores on SMP-node
-     integer SMP_ME            ! SMP-node id starting from 1 ... NSMP
-     integer NSMP              ! number of SMP-nodes in this communicator
-     integer CORE_ME           ! core id starting from 1 ... NCORE
-     integer NCORE             ! number of cores on this SMP-node
-     integer MAXCORE           ! maximum no. cores on any SMP-node
-     integer N_SND             ! size of SMP shared memory buffer
-     integer N_RCV             ! size of SMP shared memory buffer
-     integer(8) SND_P          ! SNDBUF address (cray pointer), for real 
-     integer(8) RCV_P          ! RCVBUF address (cray pointer), for real
-     integer(8) SND_P_c        ! for complex
-     integer(8) RCV_P_c        ! for complex
-  END TYPE SMP_INFO
-#endif
-
-  ! derived type to store decomposition info for a given global data size
-  TYPE, public :: DECOMP_INFO
-     ! staring/ending index and size of data held by current processor
-     integer, dimension(3) :: xst, xen, xsz  ! x-pencil
-     integer, dimension(3) :: yst, yen, ysz  ! y-pencil
-     integer, dimension(3) :: zst, zen, zsz  ! z-pencil
-
-     ! in addition to local information, processors also need to know 
-     ! some global information for global communications to work 
-
-     ! how each dimension is distributed along pencils
-     integer, allocatable, dimension(:) :: &
-          x1dist, y1dist, y2dist, z2dist
-
-     ! send/receive buffer counts and displacements for MPI_ALLTOALLV
-     integer, allocatable, dimension(:) :: &
-          x1cnts, y1cnts, y2cnts, z2cnts
-     integer, allocatable, dimension(:) :: &
-          x1disp, y1disp, y2disp, z2disp
-
-     ! buffer counts for MPI_ALLTOALL: either for evenly distributed data
-     ! or for padded-alltoall
-     integer :: x1count, y1count, y2count, z2count
-
-     ! evenly distributed data
-     logical :: even
-
-#ifdef SHM
-     ! For shared-memory implementation
-
-     ! one instance of this derived type for each communicator
-     ! shared moemory info, such as which MPI rank belongs to which node
-     TYPE(SMP_INFO) :: ROW_INFO, COL_INFO
-
-     ! shared send/recv buffers for ALLTOALLV
-     integer, allocatable, dimension(:) :: x1cnts_s, y1cnts_s, &
-          y2cnts_s, z2cnts_s
-     integer, allocatable, dimension(:) :: x1disp_s, y1disp_s, &
-          y2disp_s, z2disp_s
-     ! A copy of original buffer displacement (will be overwriten)
-     integer, allocatable, dimension(:) :: x1disp_o, y1disp_o, &
-          y2disp_o, z2disp_o
-#endif
-  END TYPE DECOMP_INFO
-
-  ! main (default) decomposition information for global size nx*ny*nz
-  TYPE(DECOMP_INFO), save :: decomp_main
-  TYPE(DECOMP_INFO), save, public :: phG,ph1,ph2,ph3,ph4
-
-  ! staring/ending index and size of data held by current processor
-  ! duplicate 'decomp_main', needed by apps to define data structure 
-  integer, save, dimension(3), public :: xstart, xend, xsize  ! x-pencil
-  integer, save, dimension(3), public :: ystart, yend, ysize  ! y-pencil
-  integer, save, dimension(3), public :: zstart, zend, zsize  ! z-pencil
-
-  ! These are the buffers used by MPI_ALLTOALL(V) calls
-  integer, save :: decomp_buf_size = 0
-  real(mytype),    allocatable, dimension(:) :: work1_r, work2_r
-  complex(mytype), allocatable, dimension(:) :: work1_c, work2_c
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! To define smaller arrays using every several mesh points
-  integer, save, dimension(3), public :: xszS,yszS,zszS,xstS,ystS,zstS,xenS,yenS,zenS
-  integer, save, dimension(3), public :: xszV,yszV,zszV,xstV,ystV,zstV,xenV,yenV,zenV
-  integer, save, dimension(3), public :: xszP,yszP,zszP,xstP,ystP,zstP,xenP,yenP,zenP
-  logical, save :: coarse_mesh_starts_from_1
-  integer, save :: iskipS, jskipS, kskipS
-  integer, save :: iskipV, jskipV, kskipV
-  integer, save :: iskipP, jskipP, kskipP
-
-  ! public user routines
-  public :: decomp_2d_init, decomp_2d_finalize, &
-       transpose_x_to_y, transpose_y_to_z, &
-       transpose_z_to_y, transpose_y_to_x, &
-#ifdef OCC
-       transpose_x_to_y_start, transpose_y_to_z_start, &
-       transpose_z_to_y_start, transpose_y_to_x_start, &
-       transpose_x_to_y_wait, transpose_y_to_z_wait, &
-       transpose_z_to_y_wait, transpose_y_to_x_wait, &
-       transpose_test, &
-#endif
-       decomp_info_init, decomp_info_finalize, partition, &
-       init_coarser_mesh_statS,fine_to_coarseS,&
-       init_coarser_mesh_statV,fine_to_coarseV,&
-       init_coarser_mesh_statP,fine_to_coarseP,&
-       alloc_x, alloc_y, alloc_z, &
-       update_halo, decomp_2d_abort, &
-       get_decomp_info
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! These are routines to perform global data transpositions
-  ! 
-  !   Four combinations are available, enough to cover all situations
-  !    - transpose_x_to_y (X-pencil --> Y-pencil)
-  !    - transpose_y_to_z (Y-pencil --> Z-pencil)
-  !    - transpose_z_to_y (Z-pencil --> Y-pencil)
-  !    - transpose_y_to_x (Y-pencil --> X-pencil)
-  !
-  !   Generic interface provided here to support multiple data types
-  !    - real and complex types supported through generic interface
-  !    - single/double precision supported through pre-processing
-  !       * see 'mytype' variable at the beginning
-  !    - an optional argument can be supplied to transpose data whose 
-  !      global size is not the default nx*ny*nz 
-  !       * as the case in fft r2c/c2r interface 
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-  interface transpose_x_to_y
-     module procedure transpose_x_to_y_real
-     module procedure transpose_x_to_y_complex
-  end interface transpose_x_to_y
-
-  interface transpose_y_to_z
-     module procedure transpose_y_to_z_real
-     module procedure transpose_y_to_z_complex
-  end interface transpose_y_to_z
-
-  interface transpose_z_to_y
-     module procedure transpose_z_to_y_real
-     module procedure transpose_z_to_y_complex
-  end interface transpose_z_to_y
-
-  interface transpose_y_to_x
-     module procedure transpose_y_to_x_real
-     module procedure transpose_y_to_x_complex
-  end interface transpose_y_to_x
-
-#ifdef OCC
-  interface transpose_x_to_y_start
-     module procedure transpose_x_to_y_real_start
-     module procedure transpose_x_to_y_complex_start
-  end interface transpose_x_to_y_start
-
-  interface transpose_y_to_z_start
-     module procedure transpose_y_to_z_real_start
-     module procedure transpose_y_to_z_complex_start
-  end interface transpose_y_to_z_start
-
-  interface transpose_z_to_y_start
-     module procedure transpose_z_to_y_real_start
-     module procedure transpose_z_to_y_complex_start
-  end interface transpose_z_to_y_start
-
-  interface transpose_y_to_x_start
-     module procedure transpose_y_to_x_real_start
-     module procedure transpose_y_to_x_complex_start
-  end interface transpose_y_to_x_start
-
-  interface transpose_x_to_y_wait
-     module procedure transpose_x_to_y_real_wait
-     module procedure transpose_x_to_y_complex_wait
-  end interface transpose_x_to_y_wait
-
-  interface transpose_y_to_z_wait
-     module procedure transpose_y_to_z_real_wait
-     module procedure transpose_y_to_z_complex_wait
-  end interface transpose_y_to_z_wait
-
-  interface transpose_z_to_y_wait
-     module procedure transpose_z_to_y_real_wait
-     module procedure transpose_z_to_y_complex_wait
-  end interface transpose_z_to_y_wait
-
-  interface transpose_y_to_x_wait
-     module procedure transpose_y_to_x_real_wait
-     module procedure transpose_y_to_x_complex_wait
-  end interface transpose_y_to_x_wait
-#endif
-
-  interface update_halo
-     module procedure update_halo_real
-     module procedure update_halo_complex
-  end interface update_halo
-
-  interface alloc_x
-     module procedure alloc_x_real
-     module procedure alloc_x_complex
-  end interface alloc_x
-
-  interface alloc_y
-     module procedure alloc_y_real
-     module procedure alloc_y_complex
-  end interface alloc_y
-
-  interface alloc_z
-     module procedure alloc_z_real
-     module procedure alloc_z_complex
-  end interface alloc_z
-
-contains
-
-#ifdef SHM_DEBUG
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! For debugging, print the shared-memory structure
-  subroutine print_smp_info(s)
-    TYPE(SMP_INFO) :: s
-    write(10,*) 'size of current communicator:', s%NCPU
-    write(10,*) 'rank in current communicator:', s%NODE_ME
-    write(10,*) 'number of SMP-nodes in this communicator:', s%NSMP
-    write(10,*) 'SMP-node id (1 ~ NSMP):', s%SMP_ME
-    write(10,*) 'NCORE - number of cores on this SMP-node', s%NCORE
-    write(10,*) 'core id (1 ~ NCORE):', s%CORE_ME
-    write(10,*) 'maximum no. cores on any SMP-node:', s%MAXCORE
-    write(10,*) 'size of SMP shared memory SND buffer:', s%N_SND
-    write(10,*) 'size of SMP shared memory RCV buffer:', s%N_RCV
-  end subroutine print_smp_info
-#endif
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Routine to be called by applications to initialise this library
-  !   INPUT:
-  !     nx, ny, nz   - global data dimension
-  !     p_row, p_col - 2D processor grid
-  !   OUTPUT:
-  !     all internal data structures initialised properly
-  !     library ready to use
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine decomp_2d_init(nx,ny,nz,p_row,p_col,periodic_bc)
-    
-    implicit none
-
-    integer, intent(IN) :: nx,ny,nz,p_row,p_col
-    logical, dimension(3), intent(IN), optional :: periodic_bc
-
-    integer :: errorcode, ierror, row, col
-
-#ifdef SHM_DEBUG
-    character(len=80) fname
-#endif
-
-    nx_global = nx
-    ny_global = ny
-    nz_global = nz
-
-    if (present(periodic_bc)) then
-       periodic_x = periodic_bc(1)
-       periodic_y = periodic_bc(2)
-       periodic_z = periodic_bc(3)
-    else
-       periodic_x = .false.
-       periodic_y = .false.
-       periodic_z = .false.
-    end if
-
-    if (p_row==0 .and. p_col==0) then
-       ! determine the best 2D processor grid
-       call best_2d_grid(nproc, row, col)
-    else
-       if (nproc /= p_row*p_col) then
-          errorcode = 1
-          call decomp_2d_abort(errorcode, &
-               'Invalid 2D processor grid - nproc /= p_row*p_col')
-       else
-          row = p_row
-          col = p_col
-       end if
-    end if
-
-    ! Create 2D Catersian topology
-    ! Note that in order to support periodic B.C. in the halo-cell code,
-    ! need to create multiple topology objects: DECOMP_2D_COMM_CART_?,
-    ! corresponding to three pencil orientations. They contain almost
-    ! identical topological information but allow different combinations
-    ! of periodic conditions.
-    dims(1) = row
-    dims(2) = col
-    periodic(1) = periodic_y
-    periodic(2) = periodic_z
-    call MPI_CART_CREATE(MPI_COMM_WORLD,2,dims,periodic, &
-         .false., &  ! do not reorder rank
-         DECOMP_2D_COMM_CART_X, ierror)
-    periodic(1) = periodic_x
-    periodic(2) = periodic_z
-    call MPI_CART_CREATE(MPI_COMM_WORLD,2,dims,periodic, &
-         .false., DECOMP_2D_COMM_CART_Y, ierror)
-    periodic(1) = periodic_x
-    periodic(2) = periodic_y
-    call MPI_CART_CREATE(MPI_COMM_WORLD,2,dims,periodic, &
-         .false., DECOMP_2D_COMM_CART_Z, ierror)
-
-    call MPI_CART_COORDS(DECOMP_2D_COMM_CART_X,nrank,2,coord,ierror)
-
-    ! derive communicators defining sub-groups for ALLTOALL(V)
-    call MPI_CART_SUB(DECOMP_2D_COMM_CART_X,(/.true.,.false./), &
-         DECOMP_2D_COMM_COL,ierror)
-    call MPI_CART_SUB(DECOMP_2D_COMM_CART_X,(/.false.,.true./), &
-         DECOMP_2D_COMM_ROW,ierror)
-
-    ! gather information for halo-cell support code
-    call init_neighbour
-
-    ! actually generate all 2D decomposition information
-    call decomp_info_init(nx,ny,nz,decomp_main)
-
-    ! make a copy of the decomposition information associated with the
-    ! default global size in these global variables so applications can
-    ! use them to create data structures 
-    xstart = decomp_main%xst
-    ystart = decomp_main%yst
-    zstart = decomp_main%zst
-    xend   = decomp_main%xen
-    yend   = decomp_main%yen
-    zend   = decomp_main%zen
-    xsize  = decomp_main%xsz
-    ysize  = decomp_main%ysz
-    zsize  = decomp_main%zsz
-
-#ifdef SHM_DEBUG
-    ! print out shared-memory information
-    write(fname,99) nrank
-99  format('log',I2.2)
-    open(10,file=fname)
-    write(10,*)'I am mpi rank ', nrank, 'Total ranks ', nproc
-    write(10,*)' '
-    write(10,*)'Global data size:'
-    write(10,*)'nx*ny*nz', nx,ny,nz
-    write(10,*)' '
-    write(10,*)'2D processor grid:'
-    write(10,*)'p_row*p_col:', dims(1), dims(2)
-    write(10,*)' '
-    write(10,*)'Portion of global data held locally:'
-    write(10,*)'xsize:',xsize
-    write(10,*)'ysize:',ysize
-    write(10,*)'zsize:',zsize
-    write(10,*)' '
-    write(10,*)'How pensils are to be divided and sent in alltoallv:'
-    write(10,*)'x1dist:',decomp_main%x1dist
-    write(10,*)'y1dist:',decomp_main%y1dist
-    write(10,*)'y2dist:',decomp_main%y2dist
-    write(10,*)'z2dist:',decomp_main%z2dist
-    write(10,*)' '
-    write(10,*)'######Shared buffer set up after this point######'
-    write(10,*)' '
-    write(10,*) 'col communicator detais:'
-    call print_smp_info(decomp_main%COL_INFO)
-    write(10,*)' '
-    write(10,*) 'row communicator detais:'
-    call print_smp_info(decomp_main%ROW_INFO)
-    write(10,*)' '
-    write(10,*)'Buffer count and displacement of per-core buffers'
-    write(10,*)'x1cnts:',decomp_main%x1cnts
-    write(10,*)'y1cnts:',decomp_main%y1cnts
-    write(10,*)'y2cnts:',decomp_main%y2cnts
-    write(10,*)'z2cnts:',decomp_main%z2cnts
-    write(10,*)'x1disp:',decomp_main%x1disp
-    write(10,*)'y1disp:',decomp_main%y1disp
-    write(10,*)'y2disp:',decomp_main%y2disp
-    write(10,*)'z2disp:',decomp_main%z2disp
-    write(10,*)' '
-    write(10,*)'Buffer count and displacement of shared buffers'
-    write(10,*)'x1cnts:',decomp_main%x1cnts_s
-    write(10,*)'y1cnts:',decomp_main%y1cnts_s
-    write(10,*)'y2cnts:',decomp_main%y2cnts_s
-    write(10,*)'z2cnts:',decomp_main%z2cnts_s
-    write(10,*)'x1disp:',decomp_main%x1disp_s
-    write(10,*)'y1disp:',decomp_main%y1disp_s
-    write(10,*)'y2disp:',decomp_main%y2disp_s
-    write(10,*)'z2disp:',decomp_main%z2disp_s
-    write(10,*)' '
-    close(10)
-#endif
-
-    ! determine the number of bytes per float number
-    ! do not use 'mytype' which is compiler dependent
-    ! also possible to use inquire(iolength=...) 
-    call MPI_TYPE_SIZE(real_type,mytype_bytes,ierror)
-
-#ifdef EVEN
-    if (nrank==0) write(*,*) 'Padded ALLTOALL optimisation on'
-#endif 
-
-    return
-  end subroutine decomp_2d_init
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Routine to be called by applications to clean things up
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine decomp_2d_finalize
-    
-    implicit none
-    
-    call decomp_info_finalize(decomp_main)
-
-    decomp_buf_size = 0
-    deallocate(work1_r, work2_r, work1_c, work2_c)
-    
-    return
-  end subroutine decomp_2d_finalize
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Return the default decomposition object
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine get_decomp_info(decomp)
-
-    implicit none
-
-    TYPE(DECOMP_INFO), intent(OUT) :: decomp
-
-    decomp = decomp_main
-
-    return
-  end subroutine get_decomp_info
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Advanced Interface allowing applications to define globle domain of
-  ! any size, distribute it, and then transpose data among pencils.
-  !  - generate 2D decomposition details as defined in DECOMP_INFO
-  !  - the default global data size is nx*ny*nz
-  !  - a different global size nx/2+1,ny,nz is used in FFT r2c/c2r
-  !  - multiple global sizes can co-exist in one application, each
-  !    using its own DECOMP_INFO object
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine decomp_info_init(nx,ny,nz,decomp)
-
-    implicit none
-
-    integer, intent(IN) :: nx,ny,nz
-    TYPE(DECOMP_INFO), intent(INOUT) :: decomp
-
-    integer :: buf_size, status, errorcode
-
-    ! verify the global size can actually be distributed as pencils
-    if (nx_global<dims(1) .or. ny_global<dims(1) .or. ny_global<dims(2) .or. nz_global<dims(2)) then
-       errorcode = 6
-       call decomp_2d_abort(errorcode, &
-            'Invalid 2D processor grid. ' // &
-            'Make sure that min(nx,ny) >= p_row and ' // &
-            'min(ny,nz) >= p_col')
-    end if
-
-    if (mod(nx,dims(1))==0 .and. mod(ny,dims(1))==0 .and. &
-         mod(ny,dims(2))==0 .and. mod(nz,dims(2))==0) then
-       decomp%even = .true.
-    else
-       decomp%even = .false.
-    end if
-
-    ! distribute mesh points
-    allocate(decomp%x1dist(0:dims(1)-1),decomp%y1dist(0:dims(1)-1), &
-         decomp%y2dist(0:dims(2)-1),decomp%z2dist(0:dims(2)-1))
-    call get_dist(nx,ny,nz,decomp)
-
-    ! generate partition information - starting/ending index etc.
-    call partition(nx, ny, nz, (/ 1,2,3 /), &
-         decomp%xst, decomp%xen, decomp%xsz)
-    call partition(nx, ny, nz, (/ 2,1,3 /), &
-         decomp%yst, decomp%yen, decomp%ysz)
-    call partition(nx, ny, nz, (/ 2,3,1 /), &
-         decomp%zst, decomp%zen, decomp%zsz)
-
-    ! prepare send/receive buffer displacement and count for ALLTOALL(V)
-    allocate(decomp%x1cnts(0:dims(1)-1),decomp%y1cnts(0:dims(1)-1), &
-         decomp%y2cnts(0:dims(2)-1),decomp%z2cnts(0:dims(2)-1))
-    allocate(decomp%x1disp(0:dims(1)-1),decomp%y1disp(0:dims(1)-1), &
-         decomp%y2disp(0:dims(2)-1),decomp%z2disp(0:dims(2)-1))
-    call prepare_buffer(decomp)
-
-#ifdef SHM
-    ! prepare shared-memory information if required
-    call decomp_info_init_shm(decomp)
-#endif
-
-    ! allocate memory for the MPI_ALLTOALL(V) buffers
-    ! define the buffers globally for performance reason
-
-    buf_size = max(decomp%xsz(1)*decomp%xsz(2)*decomp%xsz(3), &
-         max(decomp%ysz(1)*decomp%ysz(2)*decomp%ysz(3), &
-         decomp%zsz(1)*decomp%zsz(2)*decomp%zsz(3)) )
-#ifdef EVEN
-    ! padded alltoall optimisation may need larger buffer space
-    buf_size = max(buf_size, &
-         max(decomp%x1count*dims(1),decomp%y2count*dims(2)) ) 
-#endif
-
-    ! check if additional memory is required
-    ! *** TODO: consider how to share the real/complex buffers 
-    if (buf_size > decomp_buf_size) then
-       decomp_buf_size = buf_size
-       if (allocated(work1_r)) deallocate(work1_r)
-       if (allocated(work2_r)) deallocate(work2_r)
-       if (allocated(work1_c)) deallocate(work1_c)
-       if (allocated(work2_c)) deallocate(work2_c)
-       allocate(work1_r(buf_size), STAT=status)
-       allocate(work2_r(buf_size), STAT=status)
-       allocate(work1_c(buf_size), STAT=status)
-       allocate(work2_c(buf_size), STAT=status)
-       if (status /= 0) then
-          errorcode = 2
-          call decomp_2d_abort(errorcode, &
-               'Out of memory when allocating 2DECOMP workspace')
-       end if
-    end if
-
-    return
-  end subroutine decomp_info_init
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Release memory associated with a DECOMP_INFO object
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine decomp_info_finalize(decomp)
-
-    implicit none
-
-    TYPE(DECOMP_INFO), intent(INOUT) :: decomp
-
-    deallocate(decomp%x1dist,decomp%y1dist,decomp%y2dist,decomp%z2dist)
-    deallocate(decomp%x1cnts,decomp%y1cnts,decomp%y2cnts,decomp%z2cnts)
-    deallocate(decomp%x1disp,decomp%y1disp,decomp%y2disp,decomp%z2disp)
-
-#ifdef SHM
-    deallocate(decomp%x1disp_o,decomp%y1disp_o,decomp%y2disp_o, &
-         decomp%z2disp_o)
-    deallocate(decomp%x1cnts_s,decomp%y1cnts_s,decomp%y2cnts_s, &
-         decomp%z2cnts_s)
-    deallocate(decomp%x1disp_s,decomp%y1disp_s,decomp%y2disp_s, &
-         decomp%z2disp_s)
-#endif
-
-    return
-  end subroutine decomp_info_finalize
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Coarser mesh support for statistic
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine init_coarser_mesh_statS(i_skip,j_skip,k_skip,from1)
-
-    implicit none
-
-    integer, intent(IN) :: i_skip,j_skip,k_skip
-    logical, intent(IN) :: from1  ! .true.  - save 1,n+1,2n+1...
-    ! .false. - save n,2n,3n...
-
-    integer, dimension(3) :: skip
-    integer :: i
-
-    coarse_mesh_starts_from_1 = from1
-    iskipS = i_skip
-    jskipS = j_skip
-    kskipS = k_skip
-
-    skip(1)=iskipS
-    skip(2)=jskipS
-    skip(3)=kskipS
-
-    do i=1,3
-       if (from1) then
-          xstS(i) = (xstart(i)+skip(i)-1)/skip(i)
-          if (mod(xstart(i)+skip(i)-1,skip(i))/=0) xstS(i)=xstS(i)+1
-          xenS(i) = (xend(i)+skip(i)-1)/skip(i)
-       else
-          xstS(i) = xstart(i)/skip(i)
-          if (mod(xstart(i),skip(i))/=0) xstS(i)=xstS(i)+1
-          xenS(i) = xend(i)/skip(i)
-       end if
-       xszS(i) = xenS(i)-xstS(i)+1
-    end do
-
-    do i=1,3
-       if (from1) then
-          ystS(i) = (ystart(i)+skip(i)-1)/skip(i)
-          if (mod(ystart(i)+skip(i)-1,skip(i))/=0) ystS(i)=ystS(i)+1
-          yenS(i) = (yend(i)+skip(i)-1)/skip(i)
-       else
-          ystS(i) = ystart(i)/skip(i)
-          if (mod(ystart(i),skip(i))/=0) ystS(i)=ystS(i)+1
-          yenS(i) = yend(i)/skip(i)
-       end if
-       yszS(i) = yenS(i)-ystS(i)+1
-    end do
-
-    do i=1,3
-       if (from1) then
-          zstS(i) = (zstart(i)+skip(i)-1)/skip(i)
-          if (mod(zstart(i)+skip(i)-1,skip(i))/=0) zstS(i)=zstS(i)+1
-          zenS(i) = (zend(i)+skip(i)-1)/skip(i)
-       else
-          zstS(i) = zstart(i)/skip(i)
-          if (mod(zstart(i),skip(i))/=0) zstS(i)=zstS(i)+1
-          zenS(i) = zend(i)/skip(i)
-       end if
-       zszS(i) = zenS(i)-zstS(i)+1
-    end do
-
-    return
-  end subroutine init_coarser_mesh_statS
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Coarser mesh support for visualization
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine init_coarser_mesh_statV(i_skip,j_skip,k_skip,from1)
-
-    implicit none
-
-    integer, intent(IN) :: i_skip,j_skip,k_skip
-    logical, intent(IN) :: from1  ! .true.  - save 1,n+1,2n+1...
-    ! .false. - save n,2n,3n...
-
-    integer, dimension(3) :: skip
-    integer :: i
-
-    coarse_mesh_starts_from_1 = from1
-    iskipV = i_skip
-    jskipV = j_skip
-    kskipV = k_skip
-
-    skip(1)=iskipV
-    skip(2)=jskipV
-    skip(3)=kskipV
-
-    do i=1,3
-       if (from1) then
-          xstV(i) = (xstart(i)+skip(i)-1)/skip(i)
-          if (mod(xstart(i)+skip(i)-1,skip(i))/=0) xstV(i)=xstV(i)+1
-          xenV(i) = (xend(i)+skip(i)-1)/skip(i)
-       else
-          xstV(i) = xstart(i)/skip(i)
-          if (mod(xstart(i),skip(i))/=0) xstV(i)=xstV(i)+1
-          xenV(i) = xend(i)/skip(i)
-       end if
-       xszV(i) = xenV(i)-xstV(i)+1
-    end do
-
-    do i=1,3
-       if (from1) then
-          ystV(i) = (ystart(i)+skip(i)-1)/skip(i)
-          if (mod(ystart(i)+skip(i)-1,skip(i))/=0) ystV(i)=ystV(i)+1
-          yenV(i) = (yend(i)+skip(i)-1)/skip(i)
-       else
-          ystV(i) = ystart(i)/skip(i)
-          if (mod(ystart(i),skip(i))/=0) ystV(i)=ystV(i)+1
-          yenV(i) = yend(i)/skip(i)
-       end if
-       yszV(i) = yenV(i)-ystV(i)+1
-    end do
-
-    do i=1,3
-       if (from1) then
-          zstV(i) = (zstart(i)+skip(i)-1)/skip(i)
-          if (mod(zstart(i)+skip(i)-1,skip(i))/=0) zstV(i)=zstV(i)+1
-          zenV(i) = (zend(i)+skip(i)-1)/skip(i)
-       else
-          zstV(i) = zstart(i)/skip(i)
-          if (mod(zstart(i),skip(i))/=0) zstV(i)=zstV(i)+1
-          zenV(i) = zend(i)/skip(i)
-       end if
-       zszV(i) = zenV(i)-zstV(i)+1
-    end do
-
-    return
-  end subroutine init_coarser_mesh_statV
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Coarser mesh support for probe
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine init_coarser_mesh_statP(i_skip,j_skip,k_skip,from1)
-
-    implicit none
-
-    integer, intent(IN) :: i_skip,j_skip,k_skip
-    logical, intent(IN) :: from1  ! .true.  - save 1,n+1,2n+1...
-    ! .false. - save n,2n,3n...
-
-    integer, dimension(3) :: skip
-    integer :: i
-
-    coarse_mesh_starts_from_1 = from1
-    iskipP = i_skip
-    jskipP = j_skip
-    kskipP = k_skip
-
-    skip(1)=iskipP
-    skip(2)=jskipP
-    skip(3)=kskipP
-
-    do i=1,3
-       if (from1) then
-          xstP(i) = (xstart(i)+skip(i)-1)/skip(i)
-          if (mod(xstart(i)+skip(i)-1,skip(i))/=0) xstP(i)=xstP(i)+1
-          xenP(i) = (xend(i)+skip(i)-1)/skip(i)
-       else
-          xstP(i) = xstart(i)/skip(i)
-          if (mod(xstart(i),skip(i))/=0) xstP(i)=xstP(i)+1
-          xenP(i) = xend(i)/skip(i)
-       end if
-       xszP(i) = xenP(i)-xstP(i)+1
-    end do
-
-    do i=1,3
-       if (from1) then
-          ystP(i) = (ystart(i)+skip(i)-1)/skip(i)
-          if (mod(ystart(i)+skip(i)-1,skip(i))/=0) ystP(i)=ystP(i)+1
-          yenP(i) = (yend(i)+skip(i)-1)/skip(i)
-       else
-          ystP(i) = ystart(i)/skip(i)
-          if (mod(ystart(i),skip(i))/=0) ystP(i)=ystP(i)+1
-          yenP(i) = yend(i)/skip(i)
-       end if
-       yszP(i) = yenP(i)-ystP(i)+1
-    end do
-
-    do i=1,3
-       if (from1) then
-          zstP(i) = (zstart(i)+skip(i)-1)/skip(i)
-          if (mod(zstart(i)+skip(i)-1,skip(i))/=0) zstP(i)=zstP(i)+1
-          zenP(i) = (zend(i)+skip(i)-1)/skip(i)
-       else
-          zstP(i) = zstart(i)/skip(i)
-          if (mod(zstart(i),skip(i))/=0) zstP(i)=zstP(i)+1
-          zenP(i) = zend(i)/skip(i)
-       end if
-       zszP(i) = zenP(i)-zstP(i)+1
-    end do
-
-    return
-  end subroutine init_coarser_mesh_statP
-
-  ! Copy data from a fine-resolution array to a coarse one for statistic
-  subroutine fine_to_coarseS(ipencil,var_fine,var_coarse)
-
-    implicit none
-
-    real(mytype), dimension(:,:,:) :: var_fine
-    real(mytype), dimension(:,:,:) :: var_coarse
-    integer, intent(IN) :: ipencil
-
-    real(mytype), allocatable, dimension(:,:,:) :: wk, wk2
-    integer :: i,j,k
-
-    if (ipencil==1) then
-       allocate(wk(xstS(1):xenS(1),xstS(2):xenS(2),xstS(3):xenS(3)))
-       allocate(wk2(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=xstS(3),xenS(3)
-             do j=xstS(2),xenS(2)
-                do i=xstS(1),xenS(1)
-                   wk(i,j,k) = wk2((i-1)*iskipS+1,(j-1)*jskipS+1,(k-1)*kskipS+1)
-                end do
-             end do
-          end do
-       else
-          do k=xstS(3),xenS(3)
-             do j=xstS(2),xenS(2)
-                do i=xstS(1),xenS(1)
-                   wk(i,j,k) = wk2(i*iskipS,j*jskipS,k*kskipS)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    else if (ipencil==2) then
-       allocate(wk(ystS(1):yenS(1),ystS(2):yenS(2),ystS(3):yenS(3)))
-       allocate(wk2(ystart(1):yend(1),ystart(2):yend(2),ystart(3):yend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=ystS(3),yenS(3)
-             do j=ystS(2),yenS(2)
-                do i=ystS(1),yenS(1)
-                   wk(i,j,k) = wk2((i-1)*iskipS+1,(j-1)*jskipS+1,(k-1)*kskipS+1)
-                end do
-             end do
-          end do
-       else
-          do k=ystS(3),yenS(3)
-             do j=ystS(2),yenS(2)
-                do i=ystS(1),yenS(1)
-                   wk(i,j,k) = wk2(i*iskipS,j*jskipS,k*kskipS)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    else if (ipencil==3) then
-       allocate(wk(zstS(1):zenS(1),zstS(2):zenS(2),zstS(3):zenS(3)))
-       allocate(wk2(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=zstS(3),zenS(3)
-             do j=zstS(2),zenS(2)
-                do i=zstS(1),zenS(1)
-                   wk(i,j,k) = wk2((i-1)*iskipS+1,(j-1)*jskipS+1,(k-1)*kskipS+1)
-                end do
-             end do
-          end do
-       else
-          do k=zstS(3),zenS(3)
-             do j=zstS(2),zenS(2)
-                do i=zstS(1),zenS(1)
-                   wk(i,j,k) = wk2(i*iskipS,j*jskipS,k*kskipS)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    end if
-
-    deallocate(wk,wk2)
-
-    return
-  end subroutine fine_to_coarseS
-
-  ! Copy data from a fine-resolution array to a coarse one for visualization
-  subroutine fine_to_coarseV(ipencil,var_fine,var_coarse)
-
-    implicit none
-
-    real(mytype), dimension(:,:,:) :: var_fine
-    real(mytype), dimension(:,:,:) :: var_coarse
-    integer, intent(IN) :: ipencil
-
-    real(mytype), allocatable, dimension(:,:,:) :: wk, wk2
-    integer :: i,j,k
-
-    if (ipencil==1) then
-       allocate(wk(xstV(1):xenV(1),xstV(2):xenV(2),xstV(3):xenV(3)))
-       allocate(wk2(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=xstV(3),xenV(3)
-             do j=xstV(2),xenV(2)
-                do i=xstV(1),xenV(1)
-                   wk(i,j,k) = wk2((i-1)*iskipV+1,(j-1)*jskipV+1,(k-1)*kskipV+1)
-                end do
-             end do
-          end do
-       else
-          do k=xstV(3),xenV(3)
-             do j=xstV(2),xenV(2)
-                do i=xstV(1),xenV(1)
-                   wk(i,j,k) = wk2(i*iskipV,j*jskipV,k*kskipV)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    else if (ipencil==2) then
-       allocate(wk(ystV(1):yenV(1),ystV(2):yenV(2),ystV(3):yenV(3)))
-       allocate(wk2(ystart(1):yend(1),ystart(2):yend(2),ystart(3):yend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=ystV(3),yenV(3)
-             do j=ystV(2),yenV(2)
-                do i=ystV(1),yenV(1)
-                   wk(i,j,k) = wk2((i-1)*iskipV+1,(j-1)*jskipV+1,(k-1)*kskipV+1)
-                end do
-             end do
-          end do
-       else
-          do k=ystV(3),yenV(3)
-             do j=ystV(2),yenV(2)
-                do i=ystV(1),yenV(1)
-                   wk(i,j,k) = wk2(i*iskipV,j*jskipV,k*kskipV)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    else if (ipencil==3) then
-       allocate(wk(zstV(1):zenV(1),zstV(2):zenV(2),zstV(3):zenV(3)))
-       allocate(wk2(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=zstV(3),zenV(3)
-             do j=zstV(2),zenV(2)
-                do i=zstV(1),zenV(1)
-                   wk(i,j,k) = wk2((i-1)*iskipV+1,(j-1)*jskipV+1,(k-1)*kskipV+1)
-                end do
-             end do
-          end do
-       else
-          do k=zstV(3),zenV(3)
-             do j=zstV(2),zenV(2)
-                do i=zstV(1),zenV(1)
-                   wk(i,j,k) = wk2(i*iskipV,j*jskipV,k*kskipV)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    end if
-
-    deallocate(wk,wk2)
-
-    return
-  end subroutine fine_to_coarseV
-
-  ! Copy data from a fine-resolution array to a coarse one for probe
-  subroutine fine_to_coarseP(ipencil,var_fine,var_coarse)
-
-    implicit none
-
-    real(mytype), dimension(:,:,:) :: var_fine
-    real(mytype), dimension(:,:,:) :: var_coarse
-    integer, intent(IN) :: ipencil
-
-    real(mytype), allocatable, dimension(:,:,:) :: wk, wk2
-    integer :: i,j,k
-
-    if (ipencil==1) then
-       allocate(wk(xstP(1):xenP(1),xstP(2):xenP(2),xstP(3):xenP(3)))
-       allocate(wk2(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=xstP(3),xenP(3)
-             do j=xstP(2),xenP(2)
-                do i=xstP(1),xenP(1)
-                   wk(i,j,k) = wk2((i-1)*iskipP+1,(j-1)*jskipP+1,(k-1)*kskipP+1)
-                end do
-             end do
-          end do
-       else
-          do k=xstP(3),xenP(3)
-             do j=xstP(2),xenP(2)
-                do i=xstP(1),xenP(1)
-                   wk(i,j,k) = wk2(i*iskipP,j*jskipP,k*kskipP)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    else if (ipencil==2) then
-       allocate(wk(ystP(1):yenP(1),ystP(2):yenP(2),ystP(3):yenP(3)))
-       allocate(wk2(ystart(1):yend(1),ystart(2):yend(2),ystart(3):yend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=ystP(3),yenP(3)
-             do j=ystP(2),yenP(2)
-                do i=ystP(1),yenP(1)
-                   wk(i,j,k) = wk2((i-1)*iskipP+1,(j-1)*jskipP+1,(k-1)*kskipP+1)
-                end do
-             end do
-          end do
-       else
-          do k=ystP(3),yenP(3)
-             do j=ystP(2),yenP(2)
-                do i=ystP(1),yenP(1)
-                   wk(i,j,k) = wk2(i*iskipP,j*jskipP,k*kskipP)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    else if (ipencil==3) then
-       allocate(wk(zstP(1):zenP(1),zstP(2):zenP(2),zstP(3):zenP(3)))
-       allocate(wk2(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=zstP(3),zenP(3)
-             do j=zstP(2),zenP(2)
-                do i=zstP(1),zenP(1)
-                   wk(i,j,k) = wk2((i-1)*iskipP+1,(j-1)*jskipP+1,(k-1)*kskipP+1)
-                end do
-             end do
-          end do
-       else
-          do k=zstP(3),zenP(3)
-             do j=zstP(2),zenP(2)
-                do i=zstP(1),zenP(1)
-                   wk(i,j,k) = wk2(i*iskipP,j*jskipP,k*kskipP)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    end if
-
-    deallocate(wk,wk2)
-
-    return
-  end subroutine fine_to_coarseP
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Find sub-domain information held by current processor
-  !   INPUT: 
-  !     nx, ny, nz - global data dimension
-  !     pdim(3)    - number of processor grid in each dimension, 
-  !                  valid values: 1 - distibute locally; 
-  !                                2 - distribute across p_row; 
-  !                                3 - distribute across p_col
-  !   OUTPUT:
-  !     lstart(3)  - starting index
-  !     lend(3)    - ending index
-  !     lsize(3)   - size of the sub-block (redundant) 
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine partition(nx, ny, nz, pdim, lstart, lend, lsize)
-
-    implicit none
-
-    integer, intent(IN) :: nx, ny, nz
-    integer, dimension(3), intent(IN) :: pdim
-    integer, dimension(3), intent(OUT) :: lstart, lend, lsize
-
-    integer, allocatable, dimension(:) :: st,en,sz
-    integer :: i, gsize
-
-    do i = 1, 3
-
-       if (i==1) then
-          gsize = nx
-       else if (i==2) then
-          gsize = ny
-       else if (i==3) then
-          gsize = nz
-       end if
-
-       if (pdim(i) == 1) then        ! all local
-          lstart(i) = 1
-          lend(i)   = gsize
-          lsize(i)  = gsize
-       elseif (pdim(i) == 2) then    ! distribute across dims(1)
-          allocate(st(0:dims(1)-1))
-          allocate(en(0:dims(1)-1))
-          allocate(sz(0:dims(1)-1))
-          call distribute(gsize,dims(1),st,en,sz)
-          lstart(i) = st(coord(1))
-          lend(i)   = en(coord(1))
-          lsize(i)  = sz(coord(1))
-          deallocate(st,en,sz)
-       elseif (pdim(i) == 3) then    ! distribute across dims(2)
-          allocate(st(0:dims(2)-1))
-          allocate(en(0:dims(2)-1))
-          allocate(sz(0:dims(2)-1))
-          call distribute(gsize,dims(2),st,en,sz)
-          lstart(i) = st(coord(2))
-          lend(i)   = en(coord(2))
-          lsize(i)  = sz(coord(2))
-          deallocate(st,en,sz)
-       end if
-
-    end do
-    return   
-
-  end subroutine partition
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !   - distibutes grid points in one dimension
-  !   - handles uneven distribution properly 
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
-  subroutine distribute(data1,proc,st,en,sz)
-
-    implicit none
-    ! data1 -- data size in any dimension to be partitioned
-    ! proc  -- number of processors in that dimension
-    ! st    -- array of starting index
-    ! en    -- array of ending index
-    ! sz    -- array of local size  (redundent)
-    integer data1,proc,st(0:proc-1),en(0:proc-1),sz(0:proc-1)
-    integer i,size1,nl,nu
-
-    size1=data1/proc
-    nu = data1 - size1 * proc
-    nl = proc - nu
-    st(0) = 1
-    sz(0) = size1
-    en(0) = size1
-    do i=1,nl-1
-       st(i) = st(i-1) + size1
-       sz(i) = size1
-       en(i) = en(i-1) + size1
-    end do
-    size1 = size1 + 1
-    do i=nl,proc-1
-       st(i) = en(i-1) + 1
-       sz(i) = size1
-       en(i) = en(i-1) + size1
-    end do
-    en(proc-1)= data1 
-    sz(proc-1)= data1-st(proc-1)+1
-
-    return
-  end subroutine distribute
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  Define how each dimension is distributed across processors
-  !    e.g. 17 meshes across 4 processor would be distibuted as (4,4,4,5)
-  !    such global information is required locally at MPI_ALLTOALLV time
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine get_dist(nx,ny,nz,decomp)
-
-    integer, intent(IN) :: nx, ny, nz
-    TYPE(DECOMP_INFO), intent(INOUT) :: decomp
-    integer, allocatable, dimension(:) :: st,en
-
-    allocate(st(0:dims(1)-1))
-    allocate(en(0:dims(1)-1))
-    call distribute(nx,dims(1),st,en,decomp%x1dist)
-    call distribute(ny,dims(1),st,en,decomp%y1dist)
-    deallocate(st,en)
-
-    allocate(st(0:dims(2)-1))
-    allocate(en(0:dims(2)-1))
-    call distribute(ny,dims(2),st,en,decomp%y2dist)
-    call distribute(nz,dims(2),st,en,decomp%z2dist)
-    deallocate(st,en)
-
-    return
-  end subroutine get_dist
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Prepare the send / receive buffers for MPI_ALLTOALLV communications
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine prepare_buffer(decomp)
-
-    implicit none
-
-    TYPE(DECOMP_INFO), intent(INOUT) :: decomp
-
-    integer :: i
-
-    !LG : AJOUTS "bidons" pour eviter un plantage en -O3 avec gcc9.3
-    !       * la fonction sortait des valeurs 'aleatoires'
-    !         et le calcul plantait dans MPI_ALLTOALLV
-    !       * pas de plantage en O2
-    
-    character(len=100) :: tmp_char
-    if (nrank==0) then
-     open(newunit=i,file='temp.dat', form='unformatted')
-         write(i) decomp%x1dist,decomp%y1dist,decomp%y2dist,decomp%z2dist, &
-              decomp%xsz,decomp%ysz,decomp%zsz
-     close(i)
-     call system("rm temp.dat")
-    endif
-
-    ! MPI_ALLTOALLV buffer information
-
-    do i=0, dims(1)-1
-       decomp%x1cnts(i) = decomp%x1dist(i)*decomp%xsz(2)*decomp%xsz(3)
-       decomp%y1cnts(i) = decomp%ysz(1)*decomp%y1dist(i)*decomp%ysz(3)
-       if (i==0) then
-          decomp%x1disp(i) = 0  ! displacement is 0-based index
-          decomp%y1disp(i) = 0
-       else
-          decomp%x1disp(i) = decomp%x1disp(i-1) + decomp%x1cnts(i-1)
-          decomp%y1disp(i) = decomp%y1disp(i-1) + decomp%y1cnts(i-1)
-       end if
-    end do
-
-    do i=0, dims(2)-1
-       decomp%y2cnts(i) = decomp%ysz(1)*decomp%y2dist(i)*decomp%ysz(3)
-       decomp%z2cnts(i) = decomp%zsz(1)*decomp%zsz(2)*decomp%z2dist(i)
-       if (i==0) then
-          decomp%y2disp(i) = 0  ! displacement is 0-based index
-          decomp%z2disp(i) = 0
-       else
-          decomp%y2disp(i) = decomp%y2disp(i-1) + decomp%y2cnts(i-1)
-          decomp%z2disp(i) = decomp%z2disp(i-1) + decomp%z2cnts(i-1)
-       end if
-    end do
-
-    ! MPI_ALLTOALL buffer information
-
-    ! For evenly distributed data, following is an easier implementation.
-    ! But it should be covered by the more general formulation below.
-    !decomp%x1count = decomp%xsz(1)*decomp%xsz(2)*decomp%xsz(3)/dims(1)
-    !decomp%y1count = decomp%ysz(1)*decomp%ysz(2)*decomp%ysz(3)/dims(1) 
-    !decomp%y2count = decomp%ysz(1)*decomp%ysz(2)*decomp%ysz(3)/dims(2)
-    !decomp%z2count = decomp%zsz(1)*decomp%zsz(2)*decomp%zsz(3)/dims(2)
-
-    ! For unevenly distributed data, pad smaller messages. Note the 
-    ! last blocks along pencils always get assigned more mesh points
-    ! for X <=> Y transposes
-    decomp%x1count = decomp%x1dist(dims(1)-1) * &
-         decomp%y1dist(dims(1)-1) * decomp%xsz(3)
-    decomp%y1count = decomp%x1count
-    ! for Y <=> Z transposes
-    decomp%y2count = decomp%y2dist(dims(2)-1) * &
-         decomp%z2dist(dims(2)-1) * decomp%zsz(1)
-    decomp%z2count = decomp%y2count
-
-    return
-  end subroutine prepare_buffer
-
-#ifdef SHM
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  Generate shared-memory information 
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine decomp_info_init_shm(decomp)
-
-    implicit none
-
-    TYPE(DECOMP_INFO), intent(INOUT) :: decomp
-
-    ! a copy of old displacement array (will be overwritten by shm code)
-    allocate(decomp%x1disp_o(0:dims(1)-1),decomp%y1disp_o(0:dims(1)-1), &
-         decomp%y2disp_o(0:dims(2)-1),decomp%z2disp_o(0:dims(2)-1))
-    decomp%x1disp_o = decomp%x1disp
-    decomp%y1disp_o = decomp%y1disp
-    decomp%y2disp_o = decomp%y2disp
-    decomp%z2disp_o = decomp%z2disp
-
-    call prepare_shared_buffer(decomp%ROW_INFO,DECOMP_2D_COMM_ROW,decomp)
-    call prepare_shared_buffer(decomp%COL_INFO,DECOMP_2D_COMM_COL,decomp)
-
-    return
-  end subroutine decomp_info_init_shm
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! For shared-memory implementation, prepare send/recv shared buffer
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine prepare_shared_buffer(C,MPI_COMM,decomp)
-
-    implicit none
-
-    TYPE(SMP_INFO) :: C
-    INTEGER :: MPI_COMM
-    TYPE(DECOMP_INFO) :: decomp
-
-    INTEGER, ALLOCATABLE :: KTBL(:,:),NARY(:,:),KTBLALL(:,:)
-    INTEGER MYSMP, MYCORE, COLOR
-
-    integer :: ierror
-
-    C%MPI_COMM = MPI_COMM
-    CALL MPI_COMM_SIZE(MPI_COMM,C%NCPU,ierror)
-    CALL MPI_COMM_RANK(MPI_COMM,C%NODE_ME,ierror)
-    C%SMP_COMM  = MPI_COMM_NULL
-    C%CORE_COMM = MPI_COMM_NULL
-    C%SMP_ME= 0
-    C%NCORE = 0
-    C%CORE_ME = 0
-    C%MAXCORE = 0
-    C%NSMP  = 0
-    C%N_SND = 0
-    C%N_RCV = 0
-    C%SND_P = 0
-    C%RCV_P = 0
-    C%SND_P_c = 0
-    C%RCV_P_c = 0
-
-    ! get smp-node map for this communicator and set up smp communicators
-    CALL GET_SMP_MAP(C%MPI_COMM, C%NSMP, MYSMP, &
-         C%NCORE, MYCORE, C%MAXCORE)
-    C%SMP_ME = MYSMP + 1
-    C%CORE_ME = MYCORE + 1
-    ! - set up inter/intra smp-node communicators
-    COLOR = MYCORE
-    IF (COLOR.GT.0) COLOR = MPI_UNDEFINED
-    CALL MPI_Comm_split(C%MPI_COMM, COLOR, MYSMP, C%SMP_COMM, ierror)
-    CALL MPI_Comm_split(C%MPI_COMM, MYSMP, MYCORE, C%CORE_COMM, ierror)
-    ! - allocate work space
-    ALLOCATE(KTBL(C%MAXCORE,C%NSMP),NARY(C%NCPU,C%NCORE))
-    ALLOCATE(KTBLALL(C%MAXCORE,C%NSMP))
-    ! - set up smp-node/core to node_me lookup table
-    KTBL = 0
-    KTBL(C%CORE_ME,C%SMP_ME) = C%NODE_ME + 1
-    CALL MPI_ALLREDUCE(KTBL,KTBLALL,C%NSMP*C%MAXCORE,MPI_INTEGER, &
-         MPI_SUM,MPI_COMM,ierror)
-    KTBL=KTBLALL
-    !  IF (SUM(KTBL) /= C%NCPU*(C%NCPU+1)/2) &
-    !       CALL MPI_ABORT(...
-
-    ! compute offsets in shared SNDBUF and RCVBUF
-    CALL MAPSET_SMPSHM(C, KTBL, NARY, decomp)
-
-    DEALLOCATE(KTBL,NARY)
-
-    return
-  end subroutine prepare_shared_buffer
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Use Ian Bush's FreeIPC to generate shared-memory information
-  !  - system independent solution
-  !  - replacing David Tanqueray's implementation in alloc_shm.c
-  !    (old C code renamed to get_smp_map2)
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine get_smp_map(comm, nnodes, my_node, ncores, my_core, maxcor)
-
-    use FIPC_module
-
-    implicit none
-
-    integer, intent(IN) :: comm
-    integer, intent(OUT) :: nnodes, my_node, ncores, my_core, maxcor
-
-    integer :: intra_comm, extra_comm
-    integer :: ierror
-
-    call FIPC_init(comm, ierror)
-
-    ! intra_comm: communicator for processes on this shared memory node
-    ! extra_comm: communicator for all rank 0 on each shared memory node
-    call FIPC_ctxt_intra_comm(FIPC_ctxt_world, intra_comm, ierror)
-    call FIPC_ctxt_extra_comm(FIPC_ctxt_world, extra_comm, ierror)
-
-    call MPI_COMM_SIZE(intra_comm,  ncores, ierror)
-    call MPI_COMM_RANK(intra_comm, my_core, ierror)
-
-    ! only rank 0 on each shared memory node member of extra_comm
-    ! for others extra_comm = MPI_COMM_NULL
-    if (extra_comm /= MPI_COMM_NULL) then
-       call MPI_COMM_SIZE(extra_comm,  nnodes, ierror)
-       call MPI_COMM_RANK(extra_comm, my_node, ierror)
-    end if
-
-    ! other ranks share the same information as their leaders
-    call MPI_BCAST( nnodes, 1, MPI_INTEGER, 0, intra_comm, ierror)
-    call MPI_BCAST(my_node, 1, MPI_INTEGER, 0, intra_comm, ierror)
-
-    ! maxcor
-    call MPI_ALLREDUCE(ncores, maxcor, 1, MPI_INTEGER, MPI_MAX, &
-         MPI_COMM_WORLD, ierror)
-
-    call FIPC_finalize(ierror)
-
-    return
-
-  end subroutine get_smp_map
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Set up smp-node based shared memory maps
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  SUBROUTINE MAPSET_SMPSHM(C, KTBL, NARY, decomp)
-
-    IMPLICIT NONE
-
-    TYPE (SMP_INFO) C
-    INTEGER KTBL(C%MAXCORE,C%NSMP)
-    INTEGER NARY(C%NCPU,C%NCORE)
-    TYPE (DECOMP_INFO) :: decomp
-
-    INTEGER i, j, k, l, N, PTR, BSIZ, ierror, status, seed
-    character*16 s
-
-    BSIZ = C%N_SND
-
-    ! a - SNDBUF
-    IF (C%MPI_COMM==DECOMP_2D_COMM_COL) THEN
-       ALLOCATE(decomp%x1cnts_s(C%NSMP),decomp%x1disp_s(C%NSMP+1), &
-            stat=status)
-       CALL MPI_Allgather(decomp%x1cnts, C%NCPU, MPI_INTEGER, &
-            NARY, C%NCPU, MPI_INTEGER, C%CORE_COMM, ierror)
-       PTR = 0
-       DO i=1,C%NSMP
-          decomp%x1disp_s(i) = PTR
-          N = 0
-          DO j=1,C%MAXCORE
-             k = KTBL(j,i)
-             IF (k > 0) then
-                DO l=1,C%NCORE
-                   IF (l == C%CORE_ME) decomp%x1disp_o(k-1) = PTR
-                   N = N + NARY(k,l)
-                   PTR = PTR + NARY(k,l)
-                END DO
-             END IF
-          END DO
-          decomp%x1cnts_s(i) = N
-       END DO
-       decomp%x1disp_s(C%NSMP+1) = PTR
-       IF (PTR > BSIZ) BSIZ = PTR
-
-    ELSE IF (C%MPI_COMM==DECOMP_2D_COMM_ROW) THEN
-       ALLOCATE(decomp%y2cnts_s(C%NSMP),decomp%y2disp_s(C%NSMP+1), &
-            stat=status)
-       CALL MPI_Allgather(decomp%y2cnts, C%NCPU, MPI_INTEGER, &
-            NARY, C%NCPU, MPI_INTEGER, C%CORE_COMM, ierror)
-       PTR = 0
-       DO i=1,C%NSMP
-          decomp%y2disp_s(i) = PTR
-          N = 0
-          DO j=1,C%MAXCORE
-             k = KTBL(j,i)
-             IF (k > 0) then
-                DO l=1,C%NCORE
-                   IF (l == C%CORE_ME) decomp%y2disp_o(k-1) = PTR
-                   N = N + NARY(k,l)
-                   PTR = PTR + NARY(k,l)
-                END DO
-             END IF
-          END DO
-          decomp%y2cnts_s(i) = N
-       END DO
-       decomp%y2disp_s(C%NSMP+1) = PTR
-       IF (PTR > BSIZ) BSIZ = PTR
-    END IF
-
-    ! b - RCVBUF
-
-    IF (C%MPI_COMM==DECOMP_2D_COMM_COL) THEN
-       ALLOCATE(decomp%y1cnts_s(C%NSMP),decomp%y1disp_s(C%NSMP+1), &
-            stat=status)
-       CALL MPI_Allgather(decomp%y1cnts, C%NCPU, MPI_INTEGER, &
-            NARY, C%NCPU, MPI_INTEGER, C%CORE_COMM, ierror)
-       PTR = 0
-       DO i=1,C%NSMP
-          decomp%y1disp_s(i) = PTR
-          N=0
-          DO j=1,C%NCORE
-             DO l=1,C%MAXCORE
-                k = KTBL(l,i)
-                IF (k > 0) then
-                   IF (j == C%CORE_ME) decomp%y1disp_o(k-1) = PTR
-                   N = N + NARY(k,j)
-                   PTR = PTR + NARY(k,j)
-                END IF
-             END DO
-          END DO
-          decomp%y1cnts_s(i) = N
-       END DO
-       decomp%y1disp_s(C%NSMP+1) = PTR
-       IF (PTR > BSIZ) BSIZ = PTR
-
-    ELSE IF (C%MPI_COMM==DECOMP_2D_COMM_ROW) THEN
-       ALLOCATE(decomp%z2cnts_s(C%NSMP),decomp%z2disp_s(C%NSMP+1), &
-            stat=status)
-       CALL MPI_Allgather(decomp%z2cnts, C%NCPU, MPI_INTEGER, &
-            NARY, C%NCPU, MPI_INTEGER, C%CORE_COMM, ierror)
-       PTR = 0
-       DO i=1,C%NSMP
-          decomp%z2disp_s(i) = PTR
-          N=0
-          DO j=1,C%NCORE
-             DO l=1,C%MAXCORE
-                k = KTBL(l,i)
-                IF (k > 0) then
-                   IF (j == C%CORE_ME) decomp%z2disp_o(k-1) = PTR
-                   N = N + NARY(k,j)
-                   PTR = PTR + NARY(k,j)
-                END IF
-             END DO
-          END DO
-          decomp%z2cnts_s(i) = N
-       END DO
-       decomp%z2disp_s(C%NSMP+1) = PTR
-       IF (PTR > BSIZ) BSIZ = PTR
-
-    END IF
-
-    ! check buffer size and (re)-allocate buffer space if necessary
-    IF (BSIZ > C%N_SND) then
-       IF (C%SND_P /= 0) CALL DEALLOC_SHM(C%SND_P, C%CORE_COMM)
-       ! make sure each rank has unique keys to get shared memory
-       !IF (C%MPI_COMM==DECOMP_2D_COMM_COL) THEN
-       !   seed = nrank+nproc*0+1 ! has to be non-zero
-       !ELSE IF (C%MPI_COMM==DECOMP_2D_COMM_ROW) THEN
-       !   seed = nrank+nproc*1+1
-       !END IF
-       status = 1
-       !CALL ALLOC_SHM(C%SND_P, BSIZ, real_type, C%CORE_COMM, status, &
-       !     seed)
-       CALL ALLOC_SHM(C%SND_P, BSIZ, real_type, C%CORE_COMM, status)
-       C%N_SND = BSIZ
-
-       IF (C%RCV_P /= 0) CALL DEALLOC_SHM(C%RCV_P, C%CORE_COMM)
-       status = 1
-       CALL ALLOC_SHM(C%RCV_P, BSIZ, real_type, C%CORE_COMM, status)
-       C%N_RCV = BSIZ
-
-       IF (C%SND_P_c /= 0) CALL DEALLOC_SHM(C%SND_P_c, C%CORE_COMM)
-       status = 1
-       CALL ALLOC_SHM(C%SND_P_c, BSIZ, complex_type, C%CORE_COMM, status)
-       C%N_SND = BSIZ
-
-       IF (C%RCV_P_c /= 0) CALL DEALLOC_SHM(C%RCV_P_c, C%CORE_COMM)
-       status = 1
-       CALL ALLOC_SHM(C%RCV_P_c, BSIZ, complex_type, C%CORE_COMM, status)
-       C%N_RCV = BSIZ
-
-
-    END IF
-
-    RETURN
-  END SUBROUTINE MAPSET_SMPSHM
-
-#endif
-
-
-#ifdef OCC
-  ! For non-blocking communication code, progress the comminication stack
-  subroutine transpose_test(handle)
-
-    implicit none
-
-    integer :: handle, ierror
-
-    call NBC_TEST(handle,ierror)
-
-    return
-  end subroutine transpose_test
-#endif
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Transposition routines 
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-#include "transpose_x_to_y.inc"
-#include "transpose_y_to_z.inc"
-#include "transpose_z_to_y.inc"
-#include "transpose_y_to_x.inc"
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Auto-tuning algorithm to select the best 2D processor grid
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine best_2d_grid(iproc, best_p_row, best_p_col)
-    implicit none
-
-    integer, intent(IN) :: iproc
-    integer, intent(OUT) :: best_p_row, best_p_col
-
-    integer, allocatable, dimension(:) :: factors
-    integer :: nfact, i, row, col, i_best
-
-    if (nrank==0) write(*,*) 'In auto-tuning mode......'
-
-    i = int(sqrt(real(iproc))) + 10  ! enough space to save all factors 
-    allocate(factors(i))
-    call findfactor(iproc, factors, nfact)
-    if (nrank==0) write(*,*) 'factors: ', (factors(i), i=1,nfact)
-
-    i_best=nfact/2+1
-    col=factors(i_best)
-
-    best_p_col = col
-    best_p_row=iproc/col
-    if (nrank==0) print *,'p_row x p_col', best_p_row, best_p_col
-    if ((best_p_col==1).and.(nrank==0)) then
-       print *,'WARNING: current 2D DECOMP set-up might not work'
-    endif
-    
-    deallocate(factors)
-
-    return
-  end subroutine best_2d_grid
-
-#include "factor.inc"
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Halo cell support
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-#include "halo.inc"
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Error handling
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine decomp_2d_abort(errorcode, msg)
-
-    implicit none
-
-    integer, intent(IN) :: errorcode
-    character(len=*), intent(IN) :: msg
-
-    integer :: ierror
-
-    if (nrank==0) then
-       write(*,*) '2DECOMP&FFT ERROR - errorcode: ', errorcode
-       write(*,*) 'ERROR MESSAGE: ' // msg
-    end if
-    call MPI_ABORT(MPI_COMM_WORLD,errorcode,ierror)
-
-    return
-  end subroutine decomp_2d_abort
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Utility routines to help allocate 3D arrays
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-#include "alloc.inc"
-
-
-end module decomp_2d
-
diff --git a/decomp2d/factor.inc b/decomp2d/factor.inc
deleted file mode 100644
index 1ea2988c0..000000000
--- a/decomp2d/factor.inc
+++ /dev/null
@@ -1,82 +0,0 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! A few utility routines to find factors of integer numbers
-
-subroutine findfactor(num, factors, nfact)
-
-implicit none
-
-integer, intent(IN) :: num
-integer, intent(OUT), dimension(*) :: factors
-integer, intent(OUT) :: nfact
-integer :: i, m
-
-! find the factors <= sqrt(num)
-m = int(sqrt(real(num)))
-nfact = 1
-do i=1,m
-if (num/i*i == num) then
-factors(nfact) = i
-nfact = nfact + 1
-end if
-end do
-nfact = nfact - 1
-
-! derive those > sqrt(num)
-if (factors(nfact)**2/=num) then
-do i=nfact+1, 2*nfact
-factors(i) = num / factors(2*nfact-i+1)
-end do
-nfact = nfact * 2
-else
-do i=nfact+1, 2*nfact-1
-factors(i) = num / factors(2*nfact-i)
-end do
-nfact = nfact * 2 - 1
-endif
-
-return
-
-end subroutine findfactor
-
-
-subroutine primefactors(num, factors, nfact)
-
-implicit none
-
-integer, intent(IN) :: num
-integer, intent(OUT), dimension(*) :: factors
-integer, intent(INOUT) :: nfact
-
-integer :: i, n
-
-i = 2  
-nfact = 1
-n = num 
-do
-if (mod(n,i) == 0) then
-factors(nfact) = i
-nfact = nfact + 1
-n = n / i
-else
-i = i + 1
-end if
-if (n == 1) then
-nfact = nfact - 1
-exit
-end if
-end do
-
-return
-
-end subroutine primefactors
-
diff --git a/decomp2d/fft_common.inc b/decomp2d/fft_common.inc
deleted file mode 100644
index 2e7b45c32..000000000
--- a/decomp2d/fft_common.inc
+++ /dev/null
@@ -1,187 +0,0 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This file contains common code shared by all FFT engines
-
-integer, parameter, public :: DECOMP_2D_FFT_FORWARD = -1
-integer, parameter, public :: DECOMP_2D_FFT_BACKWARD = 1
-
-! Physical space data can be stored in either X-pencil or Z-pencil
-integer, parameter, public :: PHYSICAL_IN_X = 1
-integer, parameter, public :: PHYSICAL_IN_Z = 3 
-
-integer, save :: format                 ! input X-pencil or Z-pencil
-
-! The libary can only be initialised once
-logical, save :: initialised = .false. 
-
-! Global size of the FFT
-integer, save :: nx_fft, ny_fft, nz_fft
-
-! 2D processor grid
-integer, save, dimension(2) :: dims
-
-! Decomposition objects
-TYPE(DECOMP_INFO), save :: ph  ! physical space
-TYPE(DECOMP_INFO), save :: sp  ! spectral space
-
-! Workspace to store the intermediate Y-pencil data
-! *** TODO: investigate how to use only one workspace array
-complex(mytype), allocatable, dimension(:,:,:) :: wk2_c2c, wk2_r2c
-complex(mytype), allocatable, dimension(:,:,:) :: wk13
-
-public :: decomp_2d_fft_init, decomp_2d_fft_3d, &
-decomp_2d_fft_finalize, decomp_2d_fft_get_size
-
-! Declare generic interfaces to handle different inputs
-
-interface decomp_2d_fft_init
-module procedure fft_init_noarg
-module procedure fft_init_arg
-module procedure fft_init_general
-end interface
-
-interface decomp_2d_fft_3d
-module procedure fft_3d_c2c
-module procedure fft_3d_r2c
-module procedure fft_3d_c2r
-end interface
-
-
-contains
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-! Initialise the FFT module
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-subroutine fft_init_noarg
-
-implicit none
-
-call fft_init_arg(PHYSICAL_IN_X)  ! default input is X-pencil data
-
-return
-end subroutine fft_init_noarg
-
-subroutine fft_init_arg(pencil)     ! allow to handle Z-pencil input
-
-implicit none
-
-integer, intent(IN) :: pencil
-
-call fft_init_general(pencil, nx_global, ny_global, nz_global)
-
-return
-end subroutine fft_init_arg
-
-! Initialise the FFT library to perform arbitrary size transforms
-subroutine fft_init_general(pencil, nx, ny, nz)
-
-implicit none
-
-integer, intent(IN) :: pencil
-integer, intent(IN) :: nx, ny, nz
-
-logical, dimension(2) :: dummy_periods
-integer, dimension(2) :: dummy_coords
-integer :: status, errorcode, ierror
-
-if (initialised) then
-errorcode = 4
-call decomp_2d_abort(errorcode, &
-'FFT library should only be initialised once')
-end if
-
-format = pencil
-nx_fft = nx
-ny_fft = ny
-nz_fft = nz
-
-! determine the processor grid in use
-call MPI_CART_GET(DECOMP_2D_COMM_CART_X, 2, &
-dims, dummy_periods, dummy_coords, ierror)
-
-! for c2r/r2c interface:
-! if in physical space, a real array is of size: nx*ny*nz
-! in spectral space, the complex array is of size:
-!         (nx/2+1)*ny*nz, if PHYSICAL_IN_X
-!      or nx*ny*(nz/2+1), if PHYSICAL_IN_Z
-
-call decomp_info_init(nx, ny, nz, ph)
-if (format==PHYSICAL_IN_X) then
-call decomp_info_init(nx/2+1, ny, nz, sp)
-else if (format==PHYSICAL_IN_Z) then
-call decomp_info_init(nx, ny, nz/2+1, sp)
-end if
-
-allocate(wk2_c2c(ph%ysz(1),ph%ysz(2),ph%ysz(3)), STAT=status)
-allocate(wk2_r2c(sp%ysz(1),sp%ysz(2),sp%ysz(3)), STAT=status)
-if (format==PHYSICAL_IN_X) then
-allocate(wk13(sp%xsz(1),sp%xsz(2),sp%xsz(3)), STAT=status)
-else if (format==PHYSICAL_IN_Z) then
-allocate(wk13(sp%zsz(1),sp%zsz(2),sp%zsz(3)), STAT=status)
-end if
-if (status /= 0) then
-errorcode = 3
-call decomp_2d_abort(errorcode, &
-'Out of memory when initialising FFT')
-end if
-
-call init_fft_engine
-
-initialised = .true.
-
-return
-end subroutine fft_init_general
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-! Final clean up
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-subroutine decomp_2d_fft_finalize
-
-implicit none
-
-call decomp_info_finalize(ph)
-call decomp_info_finalize(sp)
-
-deallocate(wk2_c2c, wk2_r2c, wk13)
-
-call finalize_fft_engine
-
-initialised = .false.
-
-return
-end subroutine decomp_2d_fft_finalize
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-! Return the size, starting/ending index of the distributed array 
-!  whose global size is (nx/2+1)*ny*nz, for defining data structures
-!  in r2c and c2r interfaces
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-subroutine decomp_2d_fft_get_size(istart, iend, isize)
-
-implicit none
-integer, dimension(3), intent(OUT) :: istart, iend, isize
-
-if (format==PHYSICAL_IN_X) then
-istart = sp%zst
-iend   = sp%zen
-isize  = sp%zsz
-else if (format==PHYSICAL_IN_Z) then
-istart = sp%xst
-iend   = sp%xen
-isize  = sp%xsz
-end if
-
-return
-end subroutine decomp_2d_fft_get_size
diff --git a/decomp2d/fft_common_3d.inc b/decomp2d/fft_common_3d.inc
deleted file mode 100644
index b3ba1df8c..000000000
--- a/decomp2d/fft_common_3d.inc
+++ /dev/null
@@ -1,263 +0,0 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This file contains 3D c2c/r2c/c2r transform subroutines which are
-! identical for several FFT engines 
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-! 3D FFT - complex to complex
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-subroutine fft_3d_c2c(in, out, isign)
-
-implicit none
-
-complex(mytype), dimension(:,:,:), intent(INOUT) :: in
-complex(mytype), dimension(:,:,:), intent(OUT) :: out
-integer, intent(IN) :: isign
-
-#ifndef OVERWRITE
-complex(mytype), allocatable, dimension(:,:,:) :: wk1
-#endif
-
-if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_FORWARD .OR.  &
-format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_BACKWARD) then
-
-! ===== 1D FFTs in X =====
-#ifdef OVERWRITE
-call c2c_1m_x(in,isign,ph)
-#else
-allocate (wk1(ph%xsz(1),ph%xsz(2),ph%xsz(3)))
-wk1 = in
-call c2c_1m_x(wk1,isign,ph)
-#endif
-
-! ===== Swap X --> Y; 1D FFTs in Y =====
-
-if (dims(1)>1) then
-#ifdef OVERWRITE
-call transpose_x_to_y(in,wk2_c2c,ph)
-#else
-call transpose_x_to_y(wk1,wk2_c2c,ph)
-#endif
-call c2c_1m_y(wk2_c2c,isign,ph)
-else
-#ifdef OVERWRITE
-call c2c_1m_y(in,isign,ph)
-#else
-call c2c_1m_y(wk1,isign,ph)
-#endif
-end if
-
-! ===== Swap Y --> Z; 1D FFTs in Z =====
-if (dims(1)>1) then
-call transpose_y_to_z(wk2_c2c,out,ph)
-else
-#ifdef OVERWRITE
-call transpose_y_to_z(in,out,ph)
-#else
-call transpose_y_to_z(wk1,out,ph)
-#endif
-end if
-call c2c_1m_z(out,isign,ph)
-
-else if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_BACKWARD &
-.OR. & 
-format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_FORWARD) then
-
-! ===== 1D FFTs in Z =====
-#ifdef OVERWRITE
-call c2c_1m_z(in,isign,ph)
-#else
-allocate (wk1(ph%zsz(1),ph%zsz(2),ph%zsz(3)))
-wk1 = in
-call c2c_1m_z(wk1,isign,ph)
-#endif
-
-! ===== Swap Z --> Y; 1D FFTs in Y =====
-if (dims(1)>1) then
-#ifdef OVERWRITE
-call transpose_z_to_y(in,wk2_c2c,ph)
-#else
-call transpose_z_to_y(wk1,wk2_c2c,ph)
-#endif
-call c2c_1m_y(wk2_c2c,isign,ph)
-else  ! out==wk2_c2c if 1D decomposition
-#ifdef OVERWRITE
-call transpose_z_to_y(in,out,ph)
-#else
-call transpose_z_to_y(wk1,out,ph)
-#endif
-call c2c_1m_y(out,isign,ph)
-end if
-
-! ===== Swap Y --> X; 1D FFTs in X =====
-if (dims(1)>1) then
-call transpose_y_to_x(wk2_c2c,out,ph)
-end if
-call c2c_1m_x(out,isign,ph)
-
-end if
-
-#ifndef OVERWRITE
-! Free memory
-if (allocated(wk1)) deallocate(wk1)
-#endif
-
-return
-end subroutine fft_3d_c2c
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-! 3D forward FFT - real to complex
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-subroutine fft_3d_r2c(in_r, out_c)
-
-implicit none
-
-real(mytype), dimension(:,:,:), intent(IN) :: in_r
-complex(mytype), dimension(:,:,:), intent(OUT) :: out_c
-
-if (format==PHYSICAL_IN_X) then
-
-! ===== 1D FFTs in X =====
-call r2c_1m_x(in_r,wk13)
-
-! ===== Swap X --> Y; 1D FFTs in Y =====
-if (dims(1)>1) then
-call transpose_x_to_y(wk13,wk2_r2c,sp)
-call c2c_1m_y(wk2_r2c,-1,sp)
-else
-call c2c_1m_y(wk13,-1,sp)
-end if
-
-! ===== Swap Y --> Z; 1D FFTs in Z =====
-if (dims(1)>1) then
-call transpose_y_to_z(wk2_r2c,out_c,sp)
-else
-call transpose_y_to_z(wk13,out_c,sp)
-end if
-call c2c_1m_z(out_c,-1,sp)
-
-else if (format==PHYSICAL_IN_Z) then
-
-! ===== 1D FFTs in Z =====
-call r2c_1m_z(in_r,wk13)
-
-! ===== Swap Z --> Y; 1D FFTs in Y =====
-if (dims(1)>1) then
-call transpose_z_to_y(wk13,wk2_r2c,sp)
-call c2c_1m_y(wk2_r2c,-1,sp)
-else  ! out_c==wk2_r2c if 1D decomposition
-call transpose_z_to_y(wk13,out_c,sp)
-call c2c_1m_y(out_c,-1,sp)
-end if
-
-! ===== Swap Y --> X; 1D FFTs in X =====
-if (dims(1)>1) then
-call transpose_y_to_x(wk2_r2c,out_c,sp)
-end if
-call c2c_1m_x(out_c,-1,sp)
-
-end if
-
-return
-end subroutine fft_3d_r2c
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-! 3D inverse FFT - complex to real
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-subroutine fft_3d_c2r(in_c, out_r)
-
-implicit none
-
-complex(mytype), dimension(:,:,:), intent(INOUT) :: in_c
-real(mytype), dimension(:,:,:), intent(OUT) :: out_r
-
-#ifndef OVERWRITE
-complex(mytype), allocatable, dimension(:,:,:) :: wk1
-#endif
-
-if (format==PHYSICAL_IN_X) then
-
-! ===== 1D FFTs in Z =====
-#ifdef OVERWRITE
-call c2c_1m_z(in_c,1,sp)       
-#else
-allocate(wk1(sp%zsz(1),sp%zsz(2),sp%zsz(3)))
-wk1 = in_c
-call c2c_1m_z(wk1,1,sp)
-#endif
-
-! ===== Swap Z --> Y; 1D FFTs in Y =====
-#ifdef OVERWRITE
-call transpose_z_to_y(in_c,wk2_r2c,sp)
-#else
-call transpose_z_to_y(wk1,wk2_r2c,sp)
-#endif
-call c2c_1m_y(wk2_r2c,1,sp)
-
-! ===== Swap Y --> X; 1D FFTs in X =====
-if (dims(1)>1) then
-call transpose_y_to_x(wk2_r2c,wk13,sp)
-call c2r_1m_x(wk13,out_r)
-else
-call c2r_1m_x(wk2_r2c,out_r)
-end if
-
-else if (format==PHYSICAL_IN_Z) then
-
-! ===== 1D FFTs in X =====
-#ifdef OVERWRITE
-call c2c_1m_x(in_c,1,sp)
-#else
-allocate(wk1(sp%xsz(1),sp%xsz(2),sp%xsz(3)))
-wk1 = in_c
-call c2c_1m_x(wk1,1,sp)
-#endif
-
-! ===== Swap X --> Y; 1D FFTs in Y =====
-if (dims(1)>1) then
-#ifdef OVERWRITE
-call transpose_x_to_y(in_c,wk2_r2c,sp)
-#else
-call transpose_x_to_y(wk1,wk2_r2c,sp)
-#endif
-call c2c_1m_y(wk2_r2c,1,sp)
-else  ! in_c==wk2_r2c if 1D decomposition
-#ifdef OVERWRITE
-call c2c_1m_y(in_c,1,sp)
-#else
-call c2c_1m_y(wk1,1,sp)
-#endif
-end if
-
-! ===== Swap Y --> Z; 1D FFTs in Z =====
-if (dims(1)>1) then
-call transpose_y_to_z(wk2_r2c,wk13,sp)
-else
-#ifdef OVERWRITE
-call transpose_y_to_z(in_c,wk13,sp)
-#else
-call transpose_y_to_z(wk1,wk13,sp)
-#endif
-end if
-call c2r_1m_z(wk13,out_r)
-
-end if
-
-#ifndef OVERWRITE
-! Free memory
-if (allocated(wk1)) deallocate(wk1)
-#endif
-
-return
-end subroutine fft_3d_c2r
diff --git a/decomp2d/fft_ffte.f90 b/decomp2d/fft_ffte.f90
deleted file mode 100644
index 4f5393e97..000000000
--- a/decomp2d/fft_ffte.f90
+++ /dev/null
@@ -1,417 +0,0 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2012 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This is the FFTE implementation of the FFT library
-! Note that FFTE only support transform size with prime factor 2/3/5.
-! Other sizes are computed by the generic engine.
-
-module decomp_2d_fft
-  
-  use decomp_2d  ! 2D decomposition module
-  use glassman
-  
-  implicit none
-  
-  private        ! Make everything private unless declared public
-
-  ! engine-specific global variables
-  complex(mytype), allocatable, dimension(:) :: buf, scratch
-
-  logical, save :: x_goodsize, y_goodsize, z_goodsize
-
-  ! common code used for all engines, including global variables, 
-  ! generic interface definitions and several subroutines
-#include "fft_common.inc"
-
-  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time initialisations for the FFT engine
-  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine init_fft_engine
-
-    implicit none
-
-    integer :: cbuf_size, errorcode
-
-    if (KIND(0.0) == mytype) then
-       errorcode = 5
-       call decomp_2d_abort(errorcode, &
-            'FFTE engine only suppport double-precision calculations')
-    end if
-
-    if (nrank==0) then
-       write(*,*) ' '
-       write(*,*) '***** Using the FFTE engine *****'
-       write(*,*) ' '
-    end if
-
-    cbuf_size = max(ph%xsz(1), ph%ysz(2))
-    cbuf_size = max(cbuf_size, ph%zsz(3))
-    allocate(buf(cbuf_size))
-    allocate(scratch(2*cbuf_size))  ! twice space for vffte library
-
-    ! check the transform sizes
-    x_goodsize = check_size(ph%xsz(1))
-    y_goodsize = check_size(ph%ysz(2))
-    z_goodsize = check_size(ph%zsz(3))
-
-    return
-  end subroutine init_fft_engine
-
-  ! factorisation routine
-#include "factor.inc"
-
-  logical function check_size(nsize)
-
-    integer, intent(IN) :: nsize
-
-    integer, allocatable, dimension(:) :: factors
-    integer :: nfact, i
-
-    i = ceiling(log(real(nsize))*1.442695)  ! log_2_N
-    allocate(factors(i))
-
-    call primefactors(nsize,factors,nfact)
-
-    check_size = .true.
-
-    do i=1,nfact
-       if (factors(i) > 5) then
-          check_size = .false.
-          exit
-       end if
-    end do
-
-    return
-  end function check_size
-
-
-  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time finalisations for the FFT engine
-  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine finalize_fft_engine
-
-    implicit none
-
-    deallocate(buf,scratch)
-
-    return
-  end subroutine finalize_fft_engine
-
-
-  ! Following routines calculate multiple one-dimensional FFTs to form 
-  ! the basis of three-dimensional FFTs.
-
-  ! c2c transform, multiple 1D FFTs in x direction
-  subroutine c2c_1m_x(inout, isign, decomp)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-    integer :: i,j,k
-    
-    if (x_goodsize) then
-       call zfft1d(buf,decomp%xsz(1),0,scratch)
-    end if
-
-    do k=1,decomp%xsz(3)
-       do j=1,decomp%xsz(2)
-          do i=1,decomp%xsz(1)
-             buf(i) = inout(i,j,k)
-          end do
-          if (x_goodsize) then
-             call zfft1d(buf,decomp%xsz(1),isign,scratch)
-          else
-             call spcfft(buf,decomp%xsz(1),isign,scratch)
-          end if
-          do i=1,decomp%xsz(1)
-             inout(i,j,k) = buf(i)
-          end do
-       end do
-    end do
-
-    ! Note FFTE backward transform is scaled, unscale here
-    if (x_goodsize .AND. isign==DECOMP_2D_FFT_BACKWARD) then
-       inout = inout * real(decomp%xsz(1), kind=mytype)
-    end if
-
-    return
-
-  end subroutine c2c_1m_x
-
-  ! c2c transform, multiple 1D FFTs in y direction
-  subroutine c2c_1m_y(inout, isign, decomp)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-    integer :: i,j,k
-
-    if (y_goodsize) then
-       call zfft1d(buf,decomp%ysz(2),0,scratch)
-    end if
-
-    do k=1,decomp%ysz(3)
-       do i=1,decomp%ysz(1)
-          do j=1,decomp%ysz(2)
-             buf(j) = inout(i,j,k)
-          end do
-          if (y_goodsize) then
-             call zfft1d(buf,decomp%ysz(2),isign,scratch)
-          else
-             call spcfft(buf,decomp%ysz(2),isign,scratch)
-          end if
-          do j=1,decomp%ysz(2)
-             inout(i,j,k) = buf(j)
-          end do
-       end do
-    end do
-
-    if (y_goodsize .AND. isign==DECOMP_2D_FFT_BACKWARD) then
-       inout = inout * real(decomp%ysz(2), kind=mytype)
-    end if
-
-    return
-
-  end subroutine c2c_1m_y
-
-  ! c2c transform, multiple 1D FFTs in z direction
-  subroutine c2c_1m_z(inout, isign, decomp)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-    integer :: i,j,k
-
-    if (z_goodsize) then
-       call zfft1d(buf,decomp%zsz(3),0,scratch)
-    end if
-
-    do j=1,decomp%zsz(2)
-       do i=1,decomp%zsz(1)
-          do k=1,decomp%zsz(3)
-             buf(k) = inout(i,j,k)
-          end do
-          if (z_goodsize) then
-             call zfft1d(buf,decomp%zsz(3),isign,scratch)
-          else
-             call spcfft(buf,decomp%zsz(3),isign,scratch)
-          end if
-          do k=1,decomp%zsz(3)
-             inout(i,j,k) = buf(k)
-          end do
-       end do
-    end do
-
-    if (z_goodsize .AND. isign==DECOMP_2D_FFT_BACKWARD) then
-       inout = inout * real(decomp%zsz(3), kind=mytype)
-    end if
-
-    return
-
-  end subroutine c2c_1m_z
-
-  ! r2c transform, multiple 1D FFTs in x direction
-  subroutine r2c_1m_x(input, output)
-
-    implicit none
-
-    real(mytype), dimension(:,:,:), intent(IN)  ::  input
-    complex(mytype), dimension(:,:,:), intent(OUT) :: output
-
-    integer :: i,j,k, s1,s2,s3, d1
-
-    s1 = size(input,1)
-    s2 = size(input,2)
-    s3 = size(input,3)
-    d1 = size(output,1)
-
-    if (x_goodsize) then
-       call zfft1d(buf,s1,0,scratch)
-    end if
-
-    do k=1,s3
-       do j=1,s2
-          ! The base FFT engine is c2c only, 
-          ! needing some pre- and post-processing for r2c
-          ! pack real input in complex storage
-          do i=1,s1
-             buf(i) = cmplx(input(i,j,k),0._mytype, kind=mytype)
-          end do
-          if (x_goodsize) then
-             call zfft1d(buf,s1,-1,scratch)
-          else
-             call spcfft(buf,s1,-1,scratch)
-          end if
-          ! note d1 ~ s1/2+1
-          ! simply drop the redundant part of the complex output
-          do i=1,d1
-             output(i,j,k) = buf(i)
-          end do
-       end do
-    end do
-
-    return
-
-  end subroutine r2c_1m_x
-
-  ! r2c transform, multiple 1D FFTs in z direction
-  subroutine r2c_1m_z(input, output)
-
-    implicit none
-
-    real(mytype), dimension(:,:,:), intent(IN)  ::  input
-    complex(mytype), dimension(:,:,:), intent(OUT) :: output
-
-    integer :: i,j,k, s1,s2,s3, d3
-
-    s1 = size(input,1)
-    s2 = size(input,2)
-    s3 = size(input,3)
-    d3 = size(output,3)
-
-    if (z_goodsize) then
-       call zfft1d(buf,s3,0,scratch)
-    end if
-
-    do j=1,s2
-       do i=1,s1
-          do k=1,s3
-             buf(k) = cmplx(input(i,j,k),0._mytype, kind=mytype)
-          end do
-          if (z_goodsize) then
-             call zfft1d(buf,s3,-1,scratch)
-          else
-             call spcfft(buf,s3,-1,scratch)
-          end if
-          do k=1,d3
-             output(i,j,k) = buf(k)
-          end do
-       end do
-    end do
-
-    return
-
-  end subroutine r2c_1m_z
-
-  ! c2r transform, multiple 1D FFTs in x direction
-  subroutine c2r_1m_x(input, output)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(IN)  ::  input
-    real(mytype), dimension(:,:,:), intent(OUT) :: output
-
-    integer :: i,j,k, d1,d2,d3
-
-    d1 = size(output,1)
-    d2 = size(output,2)
-    d3 = size(output,3)
-
-    if (x_goodsize) then
-       call zfft1d(buf,d1,0,scratch)
-    end if
-
-    do k=1,d3
-       do j=1,d2
-          ! The base FFT engine is c2c only, 
-          ! needing some pre- and post-processing for c2r
-          do i=1,d1/2+1
-             buf(i) = input(i,j,k)
-          end do
-          ! expanding to a full-size complex array
-          ! For odd N, the storage is:
-          !  1, 2, ...... N/2+1   integer division rounded down
-          !     N, ...... N/2+2   => a(i) is conjugate of a(N+2-i)
-          ! For even N, the storage is:
-          !  1, 2, ...... N/2  , N/2+1
-          !     N, ...... N/2+2  again a(i) conjugate of a(N+2-i)
-          do i=d1/2+2,d1
-             buf(i) =  conjg(buf(d1+2-i))
-          end do
-          if (x_goodsize) then
-             call zfft1d(buf,d1,1,scratch)
-          else
-             call spcfft(buf,d1,1,scratch)
-          end if
-          do i=1,d1
-             ! simply drop imaginary part
-             output(i,j,k) = real(buf(i), kind=mytype)
-          end do
-       end do
-    end do
-
-    if (x_goodsize) then
-       output = output * real(d1, kind=mytype)
-    end if
-
-    return
-
-  end subroutine c2r_1m_x
-
-  ! c2r transform, multiple 1D FFTs in z direction
-  subroutine c2r_1m_z(input, output)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(IN)  ::  input
-    real(mytype), dimension(:,:,:), intent(OUT) :: output
-
-    integer :: i,j,k, d1,d2,d3
-
-    d1 = size(output,1)
-    d2 = size(output,2)
-    d3 = size(output,3)
-
-    if (z_goodsize) then
-       call zfft1d(buf,d3,0,scratch)
-    end if
-
-    do j=1,d2
-       do i=1,d1
-          do k=1,d3/2+1
-             buf(k) = input(i,j,k)
-          end do
-          do k=d3/2+2,d3
-             buf(k) =  conjg(buf(d3+2-k))
-          end do
-          if (z_goodsize) then
-             call zfft1d(buf,d3,1,scratch)
-          else
-             call spcfft(buf,d3,1,scratch)
-          end if
-          do k=1,d3
-             output(i,j,k) = real(buf(k), kind=mytype)
-          end do
-       end do
-    end do
-
-    if (z_goodsize) then
-       output = output * real(d3, kind=mytype)
-    end if
-
-    return
-
-  end subroutine c2r_1m_z
-
-
-#include "fft_common_3d.inc"
-
-  
-end module decomp_2d_fft
diff --git a/decomp2d/fft_fftw3.f90 b/decomp2d/fft_fftw3.f90
deleted file mode 100644
index 09edb0d9a..000000000
--- a/decomp2d/fft_fftw3.f90
+++ /dev/null
@@ -1,724 +0,0 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This is the FFTW (version 3.x) implementation of the FFT library
-
-module decomp_2d_fft
-
-  use decomp_2d  ! 2D decomposition module
-
-  implicit none
-
-  include "fftw3.f"
-
-  private        ! Make everything private unless declared public
-
-  ! engine-specific global variables
-  integer, save :: plan_type = FFTW_MEASURE
-
-  ! FFTW plans
-  ! j=1,2,3 corresponds to the 1D FFTs in X,Y,Z direction, respectively
-  ! For c2c transforms: 
-  !     use plan(-1,j) for  forward transform; 
-  !     use plan( 1,j) for backward transform;
-  ! For r2c/c2r transforms:
-  !     use plan(0,j) for r2c transforms;
-  !     use plan(2,j) for c2r transforms;
-  integer*8, save :: plan(-1:2,3)
-
-  ! common code used for all engines, including global variables, 
-  ! generic interface definitions and several subroutines
-#include "fft_common.inc"
-
-  ! Return a FFTW3 plan for multiple 1D c2c FFTs in X direction
-  subroutine c2c_1m_x_plan(plan1, decomp, isign)
-
-    implicit none
-
-    integer*8, intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    integer, intent(IN) :: isign
-
-    complex(mytype), allocatable, dimension(:,:,:) :: a1
-
-    allocate(a1(decomp%xsz(1),decomp%xsz(2),decomp%xsz(3)))
-
-#ifdef DOUBLE_PREC
-    call dfftw_plan_many_dft(plan1, 1, decomp%xsz(1), &
-         decomp%xsz(2)*decomp%xsz(3), a1, decomp%xsz(1), 1, &
-         decomp%xsz(1), a1, decomp%xsz(1), 1, decomp%xsz(1), &
-         isign, plan_type)
-#else
-    call sfftw_plan_many_dft(plan1, 1, decomp%xsz(1), &
-         decomp%xsz(2)*decomp%xsz(3), a1, decomp%xsz(1), 1, &
-         decomp%xsz(1), a1, decomp%xsz(1), 1, decomp%xsz(1), &
-         isign, plan_type)
-#endif
-
-    deallocate(a1)
-
-    return
-  end subroutine c2c_1m_x_plan
-
-  ! Return a FFTW3 plan for multiple 1D c2c FFTs in Y direction
-  subroutine c2c_1m_y_plan(plan1, decomp, isign)
-
-    implicit none
-
-    integer*8, intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    integer, intent(IN) :: isign
-
-    complex(mytype), allocatable, dimension(:,:) :: a1
-
-    ! Due to memory pattern of 3D arrays, 1D FFTs along Y have to be
-    ! done one Z-plane at a time. So plan for 2D data sets here.
-
-    allocate(a1(decomp%ysz(1),decomp%ysz(2)))
-
-#ifdef DOUBLE_PREC
-    call dfftw_plan_many_dft(plan1, 1, decomp%ysz(2), decomp%ysz(1), &
-         a1, decomp%ysz(2), decomp%ysz(1), 1, a1, decomp%ysz(2), &
-         decomp%ysz(1), 1, isign, plan_type)
-#else
-    call sfftw_plan_many_dft(plan1, 1, decomp%ysz(2), decomp%ysz(1), &
-         a1, decomp%ysz(2), decomp%ysz(1), 1, a1, decomp%ysz(2), &
-         decomp%ysz(1), 1, isign, plan_type)
-#endif
-
-    deallocate(a1)
-
-    return
-  end subroutine c2c_1m_y_plan
-
-
-  ! Return a FFTW3 plan for multiple 1D c2c FFTs in Z direction
-  subroutine c2c_1m_z_plan(plan1, decomp, isign)
-
-    implicit none
-
-    integer*8, intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    integer, intent(IN) :: isign
-
-    complex(mytype), allocatable, dimension(:,:,:) :: a1
-
-    allocate(a1(decomp%zsz(1),decomp%zsz(2),decomp%zsz(3)))
-
-#ifdef DOUBLE_PREC
-    call dfftw_plan_many_dft(plan1, 1, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), a1, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), 1, a1, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), 1, isign, plan_type)
-#else
-    call sfftw_plan_many_dft(plan1, 1, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), a1, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), 1, a1, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), 1, isign, plan_type)
-#endif
-
-    deallocate(a1)
-
-    return
-  end subroutine c2c_1m_z_plan
-
-
-  ! Return a FFTW3 plan for multiple 1D r2c FFTs in X direction
-  subroutine r2c_1m_x_plan(plan1, decomp_ph, decomp_sp)
-
-    implicit none
-
-    integer*8, intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
-
-    real(mytype), allocatable, dimension(:,:,:) :: a1
-    complex(mytype), allocatable, dimension(:,:,:) :: a2
-
-    allocate(a1(decomp_ph%xsz(1),decomp_ph%xsz(2),decomp_ph%xsz(3)))
-    allocate(a2(decomp_sp%xsz(1),decomp_sp%xsz(2),decomp_sp%xsz(3)))
-#ifdef DOUBLE_PREC
-    call dfftw_plan_many_dft_r2c(plan1, 1, decomp_ph%xsz(1), &
-         decomp_ph%xsz(2)*decomp_ph%xsz(3), a1, decomp_ph%xsz(1), 1, &
-         decomp_ph%xsz(1), a2, decomp_sp%xsz(1), 1, decomp_sp%xsz(1), &
-         plan_type)
-#else
-    call sfftw_plan_many_dft_r2c(plan1, 1, decomp_ph%xsz(1), &
-         decomp_ph%xsz(2)*decomp_ph%xsz(3), a1, decomp_ph%xsz(1), 1, &
-         decomp_ph%xsz(1), a2, decomp_sp%xsz(1), 1, decomp_sp%xsz(1), &
-         plan_type)
-#endif
-    deallocate(a1,a2)    
-
-    return
-  end subroutine r2c_1m_x_plan
-
-
-  ! Return a FFTW3 plan for multiple 1D c2r FFTs in X direction
-  subroutine c2r_1m_x_plan(plan1, decomp_sp, decomp_ph)
-
-    implicit none
-
-    integer*8, intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
-
-    complex(mytype), allocatable, dimension(:,:,:) :: a1
-    real(mytype), allocatable, dimension(:,:,:) :: a2
-
-    allocate(a1(decomp_sp%xsz(1),decomp_sp%xsz(2),decomp_sp%xsz(3)))
-    allocate(a2(decomp_ph%xsz(1),decomp_ph%xsz(2),decomp_ph%xsz(3)))
-#ifdef DOUBLE_PREC
-    call dfftw_plan_many_dft_c2r(plan1, 1, decomp_ph%xsz(1), &
-         decomp_ph%xsz(2)*decomp_ph%xsz(3), a1, decomp_sp%xsz(1), 1, &
-         decomp_sp%xsz(1), a2, decomp_ph%xsz(1), 1, decomp_ph%xsz(1), &
-         plan_type)
-#else
-    call sfftw_plan_many_dft_c2r(plan1, 1, decomp_ph%xsz(1), &
-         decomp_ph%xsz(2)*decomp_ph%xsz(3), a1, decomp_sp%xsz(1), 1, &
-         decomp_sp%xsz(1), a2, decomp_ph%xsz(1), 1, decomp_ph%xsz(1), &
-         plan_type)
-#endif
-    deallocate(a1,a2)
-
-    return
-  end subroutine c2r_1m_x_plan
-
-
-  ! Return a FFTW3 plan for multiple 1D r2c FFTs in Z direction
-  subroutine r2c_1m_z_plan(plan1, decomp_ph, decomp_sp)
-
-    implicit none
-
-    integer*8, intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
-
-    real(mytype), allocatable, dimension(:,:,:) :: a1
-    complex(mytype), allocatable, dimension(:,:,:) :: a2
-
-    allocate(a1(decomp_ph%zsz(1),decomp_ph%zsz(2),decomp_ph%zsz(3)))
-    allocate(a2(decomp_sp%zsz(1),decomp_sp%zsz(2),decomp_sp%zsz(3)))
-#ifdef DOUBLE_PREC
-    call dfftw_plan_many_dft_r2c(plan1, 1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), a1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), 1, a2, decomp_sp%zsz(3), &
-         decomp_sp%zsz(1)*decomp_sp%zsz(2), 1, plan_type)
-#else
-    call sfftw_plan_many_dft_r2c(plan1, 1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), a1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), 1, a2, decomp_sp%zsz(3), &
-         decomp_sp%zsz(1)*decomp_sp%zsz(2), 1, plan_type)
-#endif
-    deallocate(a1,a2)
-
-    return
-  end subroutine r2c_1m_z_plan
-
-
-  ! Return a FFTW3 plan for multiple 1D c2r FFTs in Z direction
-  subroutine c2r_1m_z_plan(plan1, decomp_sp, decomp_ph)
-
-    implicit none
-
-    integer*8, intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
-
-    complex(mytype), allocatable, dimension(:,:,:) :: a1
-    real(mytype), allocatable, dimension(:,:,:) :: a2
-
-    allocate(a1(decomp_sp%zsz(1),decomp_sp%zsz(2),decomp_sp%zsz(3)))
-    allocate(a2(decomp_ph%zsz(1),decomp_ph%zsz(2),decomp_ph%zsz(3)))
-
-#ifdef DOUBLE_PREC
-    call dfftw_plan_many_dft_c2r(plan1, 1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), a1, decomp_sp%zsz(3), &
-         decomp_sp%zsz(1)*decomp_sp%zsz(2), 1, a2, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), 1, plan_type)
-#else
-    call sfftw_plan_many_dft_c2r(plan1, 1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), a1, decomp_sp%zsz(3), &
-         decomp_sp%zsz(1)*decomp_sp%zsz(2), 1, a2, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), 1, plan_type)
-#endif
-    deallocate(a1,a2)     
-
-    return
-  end subroutine c2r_1m_z_plan
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time initialisations for the FFT engine
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine init_fft_engine
-
-    implicit none
-
-    if (nrank==0) then
-       write(*,*) ' '
-       write(*,*) '***** Using the FFTW (version 3.x) engine *****'
-       write(*,*) ' '
-    end if
-
-    if (format == PHYSICAL_IN_X) then
-
-       ! For C2C transforms
-       call c2c_1m_x_plan(plan(-1,1), ph, FFTW_FORWARD )
-       call c2c_1m_y_plan(plan(-1,2), ph, FFTW_FORWARD )
-       call c2c_1m_z_plan(plan(-1,3), ph, FFTW_FORWARD )
-       call c2c_1m_z_plan(plan( 1,3), ph, FFTW_BACKWARD)
-       call c2c_1m_y_plan(plan( 1,2), ph, FFTW_BACKWARD)
-       call c2c_1m_x_plan(plan( 1,1), ph, FFTW_BACKWARD)
-
-       ! For R2C/C2R tranforms
-       call r2c_1m_x_plan(plan(0,1), ph, sp)
-       call c2c_1m_y_plan(plan(0,2), sp, FFTW_FORWARD )
-       call c2c_1m_z_plan(plan(0,3), sp, FFTW_FORWARD )
-       call c2c_1m_z_plan(plan(2,3), sp, FFTW_BACKWARD)
-       call c2c_1m_y_plan(plan(2,2), sp, FFTW_BACKWARD)
-       call c2r_1m_x_plan(plan(2,1), sp, ph)
-
-    else if (format == PHYSICAL_IN_Z) then
-
-       ! For C2C transforms
-       call c2c_1m_z_plan(plan(-1,3), ph, FFTW_FORWARD )
-       call c2c_1m_y_plan(plan(-1,2), ph, FFTW_FORWARD ) 
-       call c2c_1m_x_plan(plan(-1,1), ph, FFTW_FORWARD )
-       call c2c_1m_x_plan(plan( 1,1), ph, FFTW_BACKWARD)
-       call c2c_1m_y_plan(plan( 1,2), ph, FFTW_BACKWARD)
-       call c2c_1m_z_plan(plan( 1,3), ph, FFTW_BACKWARD)
-
-       ! For R2C/C2R tranforms
-       call r2c_1m_z_plan(plan(0,3), ph, sp)
-       call c2c_1m_y_plan(plan(0,2), sp, FFTW_FORWARD )
-       call c2c_1m_x_plan(plan(0,1), sp, FFTW_FORWARD )
-       call c2c_1m_x_plan(plan(2,1), sp, FFTW_BACKWARD)
-       call c2c_1m_y_plan(plan(2,2), sp, FFTW_BACKWARD)
-       call c2r_1m_z_plan(plan(2,3), sp, ph)
-
-    end if
-
-    return
-  end subroutine init_fft_engine
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time finalisations for the FFT engine
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine finalize_fft_engine
-
-    implicit none
-
-    integer :: i,j
-
-    do j=1,3
-       do i=-1,2
-#ifdef DOUBLE_PREC
-          call dfftw_destroy_plan(plan(i,j))
-#else
-          call sfftw_destroy_plan(plan(i,j))
-#endif
-       end do
-    end do
-
-    return
-  end subroutine finalize_fft_engine
-
-
-  ! Following routines calculate multiple one-dimensional FFTs to form 
-  ! the basis of three-dimensional FFTs.
-
-  ! c2c transform, multiple 1D FFTs in x direction
-  subroutine c2c_1m_x(inout, isign, plan1)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    integer*8, intent(IN) :: plan1
-
-#ifdef DOUBLE_PREC
-    call dfftw_execute_dft(plan1, inout, inout)
-#else
-    call sfftw_execute_dft(plan1, inout, inout)
-#endif
-
-    return
-  end subroutine c2c_1m_x
-
-
-  ! c2c transform, multiple 1D FFTs in y direction
-  subroutine c2c_1m_y(inout, isign, plan1)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    integer*8, intent(IN) :: plan1
-
-    integer :: k, s3
-
-    ! transform on one Z-plane at a time
-    s3 = size(inout,3)
-    do k=1,s3
-#ifdef DOUBLE_PREC
-       call dfftw_execute_dft(plan1, inout(:,:,k), inout(:,:,k))
-#else
-       call sfftw_execute_dft(plan1, inout(:,:,k), inout(:,:,k))
-#endif
-    end do
-
-    return
-  end subroutine c2c_1m_y
-
-  ! c2c transform, multiple 1D FFTs in z direction
-  subroutine c2c_1m_z(inout, isign, plan1)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    integer*8, intent(IN) :: plan1
-
-#ifdef DOUBLE_PREC
-    call dfftw_execute_dft(plan1, inout, inout)
-#else
-    call sfftw_execute_dft(plan1, inout, inout)
-#endif
-
-    return
-  end subroutine c2c_1m_z
-
-  ! r2c transform, multiple 1D FFTs in x direction
-  subroutine r2c_1m_x(input, output)
-
-    implicit none
-
-    real(mytype), dimension(:,:,:), intent(IN)  ::  input
-    complex(mytype), dimension(:,:,:), intent(OUT) :: output
-
-#ifdef DOUBLE_PREC
-    call dfftw_execute_dft_r2c(plan(0,1), input, output)
-#else
-    call sfftw_execute_dft_r2c(plan(0,1), input, output)
-#endif    
-
-    return
-
-  end subroutine r2c_1m_x
-
-  ! r2c transform, multiple 1D FFTs in z direction
-  subroutine r2c_1m_z(input, output)
-
-    implicit none
-
-    real(mytype), dimension(:,:,:), intent(IN)  ::  input
-    complex(mytype), dimension(:,:,:), intent(OUT) :: output
-
-#ifdef DOUBLE_PREC
-    call dfftw_execute_dft_r2c(plan(0,3), input, output)
-#else
-    call sfftw_execute_dft_r2c(plan(0,3), input, output)
-#endif
-
-    return
-
-  end subroutine r2c_1m_z
-
-  ! c2r transform, multiple 1D FFTs in x direction
-  subroutine c2r_1m_x(input, output)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(IN)  ::  input
-    real(mytype), dimension(:,:,:), intent(OUT) :: output
-
-#ifdef DOUBLE_PREC
-    call dfftw_execute_dft_c2r(plan(2,1), input, output)
-#else
-    call sfftw_execute_dft_c2r(plan(2,1), input, output)
-#endif
-
-    return
-
-  end subroutine c2r_1m_x
-
-  ! c2r transform, multiple 1D FFTs in z direction
-  subroutine c2r_1m_z(input, output)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(IN) :: input
-    real(mytype), dimension(:,:,:), intent(OUT) :: output
-
-#ifdef DOUBLE_PREC
-    call dfftw_execute_dft_c2r(plan(2,3), input, output)
-#else
-    call sfftw_execute_dft_c2r(plan(2,3), input, output)
-#endif    
-
-    return
-
-  end subroutine c2r_1m_z
-
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D FFT - complex to complex
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_c2c(in, out, isign)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: in
-    complex(mytype), dimension(:,:,:), intent(OUT) :: out
-    integer, intent(IN) :: isign
-
-#ifndef OVERWRITE
-    complex(mytype), allocatable, dimension(:,:,:) :: wk1
-#endif
-
-    if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_FORWARD .OR.  &
-         format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_BACKWARD) then
-
-       ! ===== 1D FFTs in X =====
-#ifdef OVERWRITE
-       call c2c_1m_x(in,isign,plan(isign,1))
-#else
-       allocate (wk1(ph%xsz(1),ph%xsz(2),ph%xsz(3)))
-       wk1 = in
-       call c2c_1m_x(wk1,isign,plan(isign,1))
-#endif
-
-       ! ===== Swap X --> Y; 1D FFTs in Y =====
-
-       if (dims(1)>1) then
-#ifdef OVERWRITE
-          call transpose_x_to_y(in,wk2_c2c,ph)
-#else
-          call transpose_x_to_y(wk1,wk2_c2c,ph)
-#endif
-          call c2c_1m_y(wk2_c2c,isign,plan(isign,2))
-       else
-#ifdef OVERWRITE
-          call c2c_1m_y(in,isign,plan(isign,2))
-#else
-          call c2c_1m_y(wk1,isign,plan(isign,2))
-#endif
-       end if
-
-       ! ===== Swap Y --> Z; 1D FFTs in Z =====
-       if (dims(1)>1) then
-          call transpose_y_to_z(wk2_c2c,out,ph)
-       else
-#ifdef OVERWRITE
-          call transpose_y_to_z(in,out,ph)
-#else
-          call transpose_y_to_z(wk1,out,ph)
-#endif
-       end if
-       call c2c_1m_z(out,isign,plan(isign,3))
-
-    else if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_BACKWARD &
-         .OR. & 
-         format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_FORWARD) then
-
-       ! ===== 1D FFTs in Z =====
-#ifdef OVERWRITE
-       call c2c_1m_z(in,isign,plan(isign,3))
-#else
-       allocate (wk1(ph%zsz(1),ph%zsz(2),ph%zsz(3)))
-       wk1 = in
-       call c2c_1m_z(wk1,isign,plan(isign,3))
-#endif
-
-       ! ===== Swap Z --> Y; 1D FFTs in Y =====
-       if (dims(1)>1) then
-#ifdef OVERWRITE
-          call transpose_z_to_y(in,wk2_c2c,ph)
-#else
-          call transpose_z_to_y(wk1,wk2_c2c,ph)
-#endif
-          call c2c_1m_y(wk2_c2c,isign,plan(isign,2))
-       else  ! out==wk2_c2c if 1D decomposition
-#ifdef OVERWRITE
-          call transpose_z_to_y(in,out,ph)
-#else
-          call transpose_z_to_y(wk1,out,ph)
-#endif
-          call c2c_1m_y(out,isign,plan(isign,2))
-       end if
-
-       ! ===== Swap Y --> X; 1D FFTs in X =====
-       if (dims(1)>1) then
-          call transpose_y_to_x(wk2_c2c,out,ph)
-       end if
-       call c2c_1m_x(out,isign,plan(isign,1))
-
-    end if
-
-#ifndef OVERWRITE
-    deallocate (wk1)
-#endif
-
-    return
-  end subroutine fft_3d_c2c
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D forward FFT - real to complex
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_r2c(in_r, out_c)
-
-    implicit none
-
-    real(mytype), dimension(:,:,:), intent(IN) :: in_r
-    complex(mytype), dimension(:,:,:), intent(OUT) :: out_c
-
-    if (format==PHYSICAL_IN_X) then
-
-       ! ===== 1D FFTs in X =====
-       call r2c_1m_x(in_r,wk13)
-
-       ! ===== Swap X --> Y; 1D FFTs in Y =====
-       if (dims(1)>1) then
-          call transpose_x_to_y(wk13,wk2_r2c,sp)
-          call c2c_1m_y(wk2_r2c,-1,plan(0,2))
-       else
-          call c2c_1m_y(wk13,-1,plan(0,2))
-       end if
-
-       ! ===== Swap Y --> Z; 1D FFTs in Z =====
-       if (dims(1)>1) then
-          call transpose_y_to_z(wk2_r2c,out_c,sp)
-       else
-          call transpose_y_to_z(wk13,out_c,sp)
-       end if
-       call c2c_1m_z(out_c,-1,plan(0,3))
-
-    else if (format==PHYSICAL_IN_Z) then
-
-       ! ===== 1D FFTs in Z =====
-       call r2c_1m_z(in_r,wk13)
-
-       ! ===== Swap Z --> Y; 1D FFTs in Y =====
-       if (dims(1)>1) then
-          call transpose_z_to_y(wk13,wk2_r2c,sp)
-          call c2c_1m_y(wk2_r2c,-1,plan(0,2))
-       else  ! out_c==wk2_r2c if 1D decomposition
-          call transpose_z_to_y(wk13,out_c,sp)
-          call c2c_1m_y(out_c,-1,plan(0,2))
-       end if
-
-       ! ===== Swap Y --> X; 1D FFTs in X =====
-       if (dims(1)>1) then
-          call transpose_y_to_x(wk2_r2c,out_c,sp)
-       end if
-       call c2c_1m_x(out_c,-1,plan(0,1))
-
-    end if
-
-    return
-  end subroutine fft_3d_r2c
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D inverse FFT - complex to real
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_c2r(in_c, out_r)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: in_c
-    real(mytype), dimension(:,:,:), intent(OUT) :: out_r
-
-#ifndef OVERWRITE
-    complex(mytype), allocatable, dimension(:,:,:) :: wk1
-#endif
-
-    if (format==PHYSICAL_IN_X) then
-
-       ! ===== 1D FFTs in Z =====
-#ifdef OVERWRITE
-       call c2c_1m_z(in_c,1,plan(2,3))       
-#else
-       allocate (wk1(sp%zsz(1),sp%zsz(2),sp%zsz(3)))
-       wk1 = in_c
-       call c2c_1m_z(wk1,1,plan(2,3))
-#endif
-
-       ! ===== Swap Z --> Y; 1D FFTs in Y =====
-#ifdef OVERWRITE
-       call transpose_z_to_y(in_c,wk2_r2c,sp)
-#else
-       call transpose_z_to_y(wk1,wk2_r2c,sp)
-#endif
-       call c2c_1m_y(wk2_r2c,1,plan(2,2))
-
-       ! ===== Swap Y --> X; 1D FFTs in X =====
-       if (dims(1)>1) then
-          call transpose_y_to_x(wk2_r2c,wk13,sp)
-          call c2r_1m_x(wk13,out_r)
-       else
-          call c2r_1m_x(wk2_r2c,out_r)
-       end if
-
-    else if (format==PHYSICAL_IN_Z) then
-
-       ! ===== 1D FFTs in X =====
-#ifdef OVERWRITE
-       call c2c_1m_x(in_c,1,plan(2,1))
-#else
-       allocate(wk1(sp%xsz(1),sp%xsz(2),sp%xsz(3)))
-       wk1 = in_c
-       call c2c_1m_x(wk1,1,plan(2,1))
-#endif
-
-       ! ===== Swap X --> Y; 1D FFTs in Y =====
-       if (dims(1)>1) then
-#ifdef OVERWRITE
-          call transpose_x_to_y(in_c,wk2_r2c,sp)
-#else
-          call transpose_x_to_y(wk1,wk2_r2c,sp)
-#endif
-          call c2c_1m_y(wk2_r2c,1,plan(2,2))
-       else  ! in_c==wk2_r2c if 1D decomposition
-#ifdef OVERWRITE
-          call c2c_1m_y(in_c,1,plan(2,2))
-#else
-          call c2c_1m_y(wk1,1,plan(2,2))
-#endif
-       end if
-
-       ! ===== Swap Y --> Z; 1D FFTs in Z =====
-       if (dims(1)>1) then
-          call transpose_y_to_z(wk2_r2c,wk13,sp)
-       else
-#ifdef OVERWRITE
-          call transpose_y_to_z(in_c,wk13,sp)
-#else
-          call transpose_y_to_z(wk1,wk13,sp)
-#endif
-       end if
-       call c2r_1m_z(wk13,out_r)
-
-    end if
-
-#ifndef OVERWRITE
-    deallocate (wk1)
-#endif
-
-    return
-  end subroutine fft_3d_c2r
-
-
-end module decomp_2d_fft
diff --git a/decomp2d/fft_generic.f90 b/decomp2d/fft_generic.f90
deleted file mode 100644
index 4fb5c41b8..000000000
--- a/decomp2d/fft_generic.f90
+++ /dev/null
@@ -1,303 +0,0 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This is the 'generic' implementation of the FFT library
-
-module decomp_2d_fft
-
-  use decomp_2d  ! 2D decomposition module
-  use glassman
-
-  implicit none
-
-  private        ! Make everything private unless declared public
-
-  ! engine-specific global variables
-  complex(mytype), allocatable, dimension(:) :: buf, scratch
-
-  ! common code used for all engines, including global variables, 
-  ! generic interface definitions and several subroutines
-#include "fft_common.inc"
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time initialisations for the FFT engine
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine init_fft_engine
-
-    implicit none
-
-    integer :: cbuf_size
-
-    if (nrank==0) then
-       write(*,*) ' '
-       write(*,*) '***** Using the generic FFT engine *****'
-       write(*,*) ' '
-    end if
-
-    cbuf_size = max(ph%xsz(1), ph%ysz(2))
-    cbuf_size = max(cbuf_size, ph%zsz(3))
-    allocate(buf(cbuf_size))
-    allocate(scratch(cbuf_size))
-
-    return
-  end subroutine init_fft_engine
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time finalisations for the FFT engine
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine finalize_fft_engine
-
-    implicit none
-
-    deallocate(buf,scratch)
-
-    return
-  end subroutine finalize_fft_engine
-
-
-  ! Following routines calculate multiple one-dimensional FFTs to form 
-  ! the basis of three-dimensional FFTs.
-
-  ! c2c transform, multiple 1D FFTs in x direction
-  subroutine c2c_1m_x(inout, isign, decomp)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-    integer :: i,j,k
-
-    do k=1,decomp%xsz(3)
-       do j=1,decomp%xsz(2)
-          do i=1,decomp%xsz(1)
-             buf(i) = inout(i,j,k)
-          end do
-          call spcfft(buf,decomp%xsz(1),isign,scratch)
-          do i=1,decomp%xsz(1)
-             inout(i,j,k) = buf(i)
-          end do
-       end do
-    end do
-
-    return
-
-  end subroutine c2c_1m_x
-
-  ! c2c transform, multiple 1D FFTs in y direction
-  subroutine c2c_1m_y(inout, isign, decomp)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-    integer :: i,j,k
-
-    do k=1,decomp%ysz(3)
-       do i=1,decomp%ysz(1)
-          do j=1,decomp%ysz(2)
-             buf(j) = inout(i,j,k)
-          end do
-          call spcfft(buf,decomp%ysz(2),isign,scratch)
-          do j=1,decomp%ysz(2)
-             inout(i,j,k) = buf(j)
-          end do
-       end do
-    end do
-
-    return
-
-  end subroutine c2c_1m_y
-
-  ! c2c transform, multiple 1D FFTs in z direction
-  subroutine c2c_1m_z(inout, isign, decomp)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-    integer :: i,j,k
-
-    do j=1,decomp%zsz(2)
-       do i=1,decomp%zsz(1)
-          do k=1,decomp%zsz(3)
-             buf(k) = inout(i,j,k)
-          end do
-          call spcfft(buf,decomp%zsz(3),isign,scratch)
-          do k=1,decomp%zsz(3)
-             inout(i,j,k) = buf(k)
-          end do
-       end do
-    end do
-
-    return
-
-  end subroutine c2c_1m_z
-
-  ! r2c transform, multiple 1D FFTs in x direction
-  subroutine r2c_1m_x(input, output)
-
-    implicit none
-
-    real(mytype), dimension(:,:,:), intent(IN)  ::  input
-    complex(mytype), dimension(:,:,:), intent(OUT) :: output
-
-    integer :: i,j,k, s1,s2,s3, d1
-
-    s1 = size(input,1)
-    s2 = size(input,2)
-    s3 = size(input,3)
-    d1 = size(output,1)
-
-    do k=1,s3
-       do j=1,s2
-          ! Glassman's FFT is c2c only, 
-          ! needing some pre- and post-processing for r2c
-          ! pack real input in complex storage
-          do i=1,s1
-             buf(i) = cmplx(input(i,j,k),0._mytype, kind=mytype)
-          end do
-          call spcfft(buf,s1,-1,scratch)
-          ! note d1 ~ s1/2+1
-          ! simply drop the redundant part of the complex output
-          do i=1,d1
-             output(i,j,k) = buf(i)
-          end do
-       end do
-    end do
-
-    return
-
-  end subroutine r2c_1m_x
-
-  ! r2c transform, multiple 1D FFTs in z direction
-  subroutine r2c_1m_z(input, output)
-
-    implicit none
-
-    real(mytype), dimension(:,:,:), intent(IN)  ::  input
-    complex(mytype), dimension(:,:,:), intent(OUT) :: output
-
-    integer :: i,j,k, s1,s2,s3, d3
-
-    s1 = size(input,1)
-    s2 = size(input,2)
-    s3 = size(input,3)
-    d3 = size(output,3)
-
-    do j=1,s2
-       do i=1,s1
-          ! Glassman's FFT is c2c only, 
-          ! needing some pre- and post-processing for r2c
-          ! pack real input in complex storage
-          do k=1,s3
-             buf(k) = cmplx(input(i,j,k),0._mytype, kind=mytype)
-          end do
-          call spcfft(buf,s3,-1,scratch)
-          ! note d3 ~ s3/2+1
-          ! simply drop the redundant part of the complex output
-          do k=1,d3
-             output(i,j,k) = buf(k)
-          end do
-       end do
-    end do
-
-    return
-
-  end subroutine r2c_1m_z
-
-  ! c2r transform, multiple 1D FFTs in x direction
-  subroutine c2r_1m_x(input, output)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(IN)  ::  input
-    real(mytype), dimension(:,:,:), intent(OUT) :: output
-
-    integer :: i,j,k, d1,d2,d3
-
-    d1 = size(output,1)
-    d2 = size(output,2)
-    d3 = size(output,3)
-
-    do k=1,d3
-       do j=1,d2
-          ! Glassman's FFT is c2c only, 
-          ! needing some pre- and post-processing for c2r
-          do i=1,d1/2+1
-             buf(i) = input(i,j,k)
-          end do
-          ! expanding to a full-size complex array
-          ! For odd N, the storage is:
-          !  1, 2, ...... N/2+1   integer division rounded down
-          !     N, ...... N/2+2   => a(i) is conjugate of a(N+2-i)
-          ! For even N, the storage is:
-          !  1, 2, ...... N/2  , N/2+1
-          !     N, ...... N/2+2  again a(i) conjugate of a(N+2-i)
-          do i=d1/2+2,d1
-             buf(i) =  conjg(buf(d1+2-i))
-          end do
-          call spcfft(buf,d1,1,scratch)
-          do i=1,d1
-             ! simply drop imaginary part
-             output(i,j,k) = real(buf(i), kind=mytype)
-          end do
-       end do
-    end do
-
-    return
-
-  end subroutine c2r_1m_x
-
-  ! c2r transform, multiple 1D FFTs in z direction
-  subroutine c2r_1m_z(input, output)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(IN)  ::  input
-    real(mytype), dimension(:,:,:), intent(OUT) :: output
-
-    integer :: i,j,k, d1,d2,d3
-
-    d1 = size(output,1)
-    d2 = size(output,2)
-    d3 = size(output,3)
-
-    do j=1,d2
-       do i=1,d1
-          do k=1,d3/2+1
-             buf(k) = input(i,j,k)
-          end do
-          do k=d3/2+2,d3
-             buf(k) =  conjg(buf(d3+2-k))
-          end do
-          call spcfft(buf,d3,1,scratch)
-          do k=1,d3
-             output(i,j,k) = real(buf(k), kind=mytype)
-          end do
-       end do
-    end do
-
-    return
-
-  end subroutine c2r_1m_z
-
-
-#include "fft_common_3d.inc"
-
-
-end module decomp_2d_fft
diff --git a/decomp2d/fft_mkl.f90 b/decomp2d/fft_mkl.f90
deleted file mode 100644
index 5a957741f..000000000
--- a/decomp2d/fft_mkl.f90
+++ /dev/null
@@ -1,583 +0,0 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2012 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This is the Intel MKL implementation of the FFT library
-
-module decomp_2d_fft
-
-  use decomp_2d  ! 2D decomposition module
-  use MKL_DFTI   ! MKL FFT module
-
-  implicit none
-
-  private        ! Make everything private unless declared public
-
-  ! engine-specific global variables
-
-  ! Descriptors for MKL FFT, one for each set of 1D FFTs
-  !  for c2c transforms
-  type(DFTI_DESCRIPTOR), pointer :: c2c_x, c2c_y, c2c_z     
-  !  for r2c/c2r transforms, PHYSICAL_IN_X
-  type(DFTI_DESCRIPTOR), pointer :: r2c_x, c2c_y2, c2c_z2, c2r_x
-  !  for r2c/c2r transforms, PHYSICAL_IN_Z
-  type(DFTI_DESCRIPTOR), pointer :: r2c_z, c2c_x2, c2r_z
-
-  ! common code used for all engines, including global variables, 
-  ! generic interface definitions and several subroutines
-#include "fft_common.inc"
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time initialisations for the FFT engine
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine init_fft_engine
-
-    implicit none
-
-    if (nrank==0) then
-       write(*,*) ' '
-       write(*,*) '***** Using the MKL engine *****'
-       write(*,*) ' '
-    end if
-
-    ! For C2C transforms
-    call c2c_1m_x_plan(c2c_x, ph)
-    call c2c_1m_y_plan(c2c_y, ph)
-    call c2c_1m_z_plan(c2c_z, ph)
-
-    ! For R2C/C2R tranfroms with physical space in X-pencil
-    if (format == PHYSICAL_IN_X) then
-       call r2c_1m_x_plan(r2c_x, ph, sp, -1)
-       call c2c_1m_y_plan(c2c_y2, sp)
-       call c2c_1m_z_plan(c2c_z2, sp)
-       call r2c_1m_x_plan(c2r_x, ph, sp,  1)
-
-       ! For R2C/C2R tranfroms with physical space in Z-pencil
-    else if (format == PHYSICAL_IN_Z) then
-       call r2c_1m_z_plan(r2c_z, ph, sp, -1)
-       call c2c_1m_y_plan(c2c_y2, sp)
-       call c2c_1m_x_plan(c2c_x2, sp)
-       call r2c_1m_z_plan(c2r_z, ph, sp,  1)
-    end if
-
-    return
-  end subroutine init_fft_engine
-
-
-  ! Return an MKL plan for multiple 1D c2c FFTs in X direction
-  subroutine c2c_1m_x_plan(desc, decomp)
-
-    implicit none
-
-    type(DFTI_DESCRIPTOR), pointer :: desc
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-    integer :: status
-
-#ifdef DOUBLE_PREC
-    status = DftiCreateDescriptor(desc, DFTI_DOUBLE, &
-         DFTI_COMPLEX, 1, decomp%xsz(1))
-#else       
-    status = DftiCreateDescriptor(desc, DFTI_SINGLE, &
-         DFTI_COMPLEX, 1, decomp%xsz(1))
-#endif
-    status = DftiSetValue(desc, DFTI_NUMBER_OF_TRANSFORMS, &
-         decomp%xsz(2)*decomp%xsz(3))
-    status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
-    status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, decomp%xsz(1))
-    status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, decomp%xsz(1))
-    status = DftiCommitDescriptor(desc)
-
-    return
-  end subroutine c2c_1m_x_plan
-
-  ! Return an MKL plan for multiple 1D c2c FFTs in Y direction
-  subroutine c2c_1m_y_plan(desc, decomp)
-
-    implicit none
-
-    type(DFTI_DESCRIPTOR), pointer :: desc
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-    integer :: status, strides(2)
-
-    ! Due to memory pattern of 3D arrays, 1D FFTs along Y have to be
-    ! done one Z-plane at a time. So plan for 2D data sets here.
-
-#ifdef DOUBLE_PREC
-    status = DftiCreateDescriptor(desc, DFTI_DOUBLE, &
-         DFTI_COMPLEX, 1, decomp%ysz(2))
-#else
-    status = DftiCreateDescriptor(desc, DFTI_SINGLE, &
-         DFTI_COMPLEX, 1, decomp%ysz(2))
-#endif
-    status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
-    status = DftiSetValue(desc, DFTI_NUMBER_OF_TRANSFORMS, decomp%ysz(1))
-    status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, 1)
-    status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, 1)
-    strides(1) = 0
-    strides(2) = decomp%ysz(1)
-    status = DftiSetValue(desc, DFTI_INPUT_STRIDES, strides)
-    status = DftiSetValue(desc, DFTI_OUTPUT_STRIDES, strides)
-    status = DftiCommitDescriptor(desc)
-
-    return
-  end subroutine c2c_1m_y_plan
-
-  ! Return an MKL plan for multiple 1D c2c FFTs in Z direction
-  subroutine c2c_1m_z_plan(desc, decomp)
-
-    implicit none
-
-    type(DFTI_DESCRIPTOR), pointer :: desc
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-    integer :: status, strides(2)
-
-#ifdef DOUBLE_PREC
-    status = DftiCreateDescriptor(desc, DFTI_DOUBLE, &
-         DFTI_COMPLEX, 1, decomp%zsz(3))
-#else
-    status = DftiCreateDescriptor(desc, DFTI_SINGLE, &
-         DFTI_COMPLEX, 1, decomp%zsz(3))
-#endif
-    status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
-    status = DftiSetValue(desc, DFTI_NUMBER_OF_TRANSFORMS, &
-         decomp%zsz(1)*decomp%zsz(2))
-    status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, 1)
-    status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, 1)
-    strides(1) = 0
-    strides(2) = decomp%zsz(1)*decomp%zsz(2)
-    status = DftiSetValue(desc, DFTI_INPUT_STRIDES, strides)
-    status = DftiSetValue(desc, DFTI_OUTPUT_STRIDES, strides)
-    status = DftiCommitDescriptor(desc)    
-
-    return
-  end subroutine c2c_1m_z_plan
-
-  ! Return an MKL plan for multiple 1D r2c FFTs in X direction
-  subroutine r2c_1m_x_plan(desc, decomp_ph, decomp_sp, direction)
-
-    implicit none
-
-    type(DFTI_DESCRIPTOR), pointer :: desc
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph, decomp_sp
-    integer, intent(IN) :: direction ! (-1=r2c; 1=c2r)
-
-    integer :: status
-
-    ! c2r and r2c plans are almost the same, just swap input/output
-
-#ifdef DOUBLE_PREC
-    status = DftiCreateDescriptor(desc, DFTI_DOUBLE, &
-         DFTI_REAL, 1, decomp_ph%xsz(1))
-#else
-    status = DftiCreateDescriptor(desc, DFTI_SINGLE, &
-         DFTI_REAL, 1, decomp_ph%xsz(1))
-#endif
-    status = DftiSetValue(desc, DFTI_NUMBER_OF_TRANSFORMS, &
-         decomp_ph%xsz(2)*decomp_ph%xsz(3))
-    status = DftiSetValue(desc, DFTI_CONJUGATE_EVEN_STORAGE, &
-         DFTI_COMPLEX_COMPLEX)
-    status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
-    if (direction == -1) then  ! r2c
-       status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, &
-            decomp_ph%xsz(1))
-       status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, &
-            decomp_sp%xsz(1))
-    else if (direction == 1) then  ! c2r
-       status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, &
-            decomp_sp%xsz(1))
-       status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, &
-            decomp_ph%xsz(1))
-    end if
-    status = DftiCommitDescriptor(desc)
-
-    return
-  end subroutine r2c_1m_x_plan
-
-  ! Return an MKL plan for multiple 1D r2c FFTs in Z direction
-  subroutine r2c_1m_z_plan(desc, decomp_ph, decomp_sp, direction)
-
-    implicit none
-
-    type(DFTI_DESCRIPTOR), pointer :: desc
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph, decomp_sp
-    integer, intent(IN) :: direction ! (-1=r2c; 1=c2r)
-
-    integer :: status, strides(2)
-
-    ! c2r and r2c plans are almost the same, just swap input/output
-
-#ifdef DOUBLE_PREC
-    status = DftiCreateDescriptor(desc, DFTI_DOUBLE, &
-         DFTI_REAL, 1, decomp_ph%zsz(3))
-#else
-    status = DftiCreateDescriptor(desc, DFTI_SINGLE, &
-         DFTI_REAL, 1, decomp_ph%zsz(3))
-#endif
-    status = DftiSetValue(desc, DFTI_NUMBER_OF_TRANSFORMS, &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2))
-    status = DftiSetValue(desc, DFTI_CONJUGATE_EVEN_STORAGE, &
-         DFTI_COMPLEX_COMPLEX)
-    status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
-    status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, 1)
-    status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, 1)
-    strides(1) = 0
-    strides(2) = decomp_ph%zsz(1)*decomp_ph%zsz(2)
-    if (direction == -1) then
-       status = DftiSetValue(desc, DFTI_INPUT_STRIDES, strides)
-    else if (direction == 1) then
-       status = DftiSetValue(desc, DFTI_OUTPUT_STRIDES, strides)
-    end if
-    strides(2) = decomp_sp%zsz(1)*decomp_sp%zsz(2)
-    if (direction == -1) then
-       status = DftiSetValue(desc, DFTI_OUTPUT_STRIDES, strides)
-    else if (direction == 1) then
-       status = DftiSetValue(desc, DFTI_INPUT_STRIDES, strides)
-    end if
-    status = DftiCommitDescriptor(desc)
-
-    return
-  end subroutine r2c_1m_z_plan
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time finalisations for the FFT engine
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine finalize_fft_engine
-
-    implicit none
-
-    integer :: status
-
-    status = DftiFreeDescriptor(c2c_x)
-    status = DftiFreeDescriptor(c2c_y)
-    status = DftiFreeDescriptor(c2c_z)
-    if (format==PHYSICAL_IN_X) then
-       status = DftiFreeDescriptor(r2c_x)
-       status = DftiFreeDescriptor(c2c_z2)
-       status = DftiFreeDescriptor(c2r_x)
-    else if (format==PHYSICAL_IN_Z) then
-       status = DftiFreeDescriptor(r2c_z)
-       status = DftiFreeDescriptor(c2c_x2)
-       status = DftiFreeDescriptor(c2r_z)
-    end if
-    status = DftiFreeDescriptor(c2c_y2)
-
-    return
-  end subroutine finalize_fft_engine
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D FFT - complex to complex
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_c2c(in, out, isign)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(IN) :: in
-    complex(mytype), dimension(:,:,:), intent(OUT) :: out
-    integer, intent(IN) :: isign   
-
-    complex(mytype), allocatable, dimension(:,:,:) :: wk1,wk2,wk2b,wk3
-    integer :: k, status
-
-    if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_FORWARD .OR.  &
-         format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_BACKWARD) then
-
-       ! ===== 1D FFTs in X =====
-       allocate (wk1(ph%xsz(1),ph%xsz(2),ph%xsz(3)))
-       !       if (isign==DECOMP_2D_FFT_FORWARD) then
-       !          status = DftiComputeForward(c2c_x, in(:,1,1), wk1(:,1,1))
-       !       else if (isign==DECOMP_2D_FFT_BACKWARD) then
-       !          status = DftiComputeBackward(c2c_x, in(:,1,1), wk1(:,1,1))
-       !       end if
-       status = wrapper_c2c(c2c_x, in, wk1, isign)
-
-       ! ===== Swap X --> Y =====
-       allocate (wk2(ph%ysz(1),ph%ysz(2),ph%ysz(3)))
-       call transpose_x_to_y(wk1,wk2,ph)
-
-       ! ===== 1D FFTs in Y =====
-       allocate (wk2b(ph%ysz(1),ph%ysz(2),ph%ysz(3))) 
-       do k=1,ph%xsz(3) ! one Z-plane at a time
-          !          if (isign==DECOMP_2D_FFT_FORWARD) then
-          !             status = DftiComputeForward(c2c_y, wk2(:,1,k), wk2b(:,1,k))
-          !          else if (isign==DECOMP_2D_FFT_BACKWARD) then
-          !             status = DftiComputeBackward(c2c_y, wk2(:,1,k), wk2b(:,1,k))
-          !          end if
-          status = wrapper_c2c(c2c_y, wk2(1,1,k), wk2b(1,1,k), isign)
-       end do
-
-       ! ===== Swap Y --> Z =====
-       allocate (wk3(ph%zsz(1),ph%zsz(2),ph%zsz(3)))
-       call transpose_y_to_z(wk2b,wk3,ph)
-
-       ! ===== 1D FFTs in Z =====
-       !       if (isign==DECOMP_2D_FFT_FORWARD) then
-       !          status = DftiComputeForward(c2c_z, wk3(:,1,1), out(:,1,1))
-       !       else if (isign==DECOMP_2D_FFT_BACKWARD) then
-       !          status = DftiComputeBackward(c2c_z, wk3(:,1,1), out(:,1,1))
-       !       end if
-       status = wrapper_c2c(c2c_z, wk3, out, isign)
-
-    else if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_BACKWARD &
-         .OR. & 
-         format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_FORWARD) then
-
-       ! ===== 1D FFTs in Z =====
-       allocate (wk1(ph%zsz(1),ph%zsz(2),ph%zsz(3)))
-       !       if (isign==DECOMP_2D_FFT_FORWARD) then
-       !          status = DftiComputeForward(c2c_z, in(:,1,1), wk1(:,1,1))
-       !       else if (isign==DECOMP_2D_FFT_BACKWARD) then
-       !          status = DftiComputeBackward(c2c_z, in(:,1,1), wk1(:,1,1))
-       !       end if
-       status = wrapper_c2c(c2c_z, in, wk1, isign)
-
-       ! ===== Swap Z --> Y =====
-       allocate (wk2(ph%ysz(1),ph%ysz(2),ph%ysz(3)))
-       call transpose_z_to_y(wk1,wk2,ph)
-
-       ! ===== 1D FFTs in Y =====
-       allocate (wk2b(ph%ysz(1),ph%ysz(2),ph%ysz(3)))
-       do k=1,ph%xsz(3) ! one Z-plane at a time
-          !          if (isign==DECOMP_2D_FFT_FORWARD) then
-          !             status = DftiComputeForward(c2c_y, wk2(:,1,k), wk2b(:,1,k))
-          !          else if (isign==DECOMP_2D_FFT_BACKWARD) then
-          !             status = DftiComputeBackward(c2c_y, wk2(:,1,k), wk2b(:,1,k))
-          !          end if
-          status = wrapper_c2c(c2c_y, wk2(1,1,k), wk2b(1,1,k), isign)
-       end do
-
-       ! ===== Swap Y --> X =====
-       allocate (wk3(ph%xsz(1),ph%xsz(2),ph%xsz(3)))
-       call transpose_y_to_x(wk2b,wk3,ph)
-
-       ! ===== 1D FFTs in X =====
-       !       if (isign==DECOMP_2D_FFT_FORWARD) then
-       !          status = DftiComputeForward(c2c_x, wk3(:,1,1), out(:,1,1))
-       !       else if (isign==DECOMP_2D_FFT_BACKWARD) then
-       !          status = DftiComputeBackward(c2c_x, wk3(:,1,1), out(:,1,1))
-       !       end if
-       status = wrapper_c2c(c2c_x, wk3, out, isign)
-
-    end if
-
-    return
-  end subroutine fft_3d_c2c
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D forward FFT - real to complex
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_r2c(in_r, out_c)
-
-    implicit none
-
-    real(mytype), dimension(:,:,:), intent(IN) :: in_r
-    complex(mytype), dimension(:,:,:), intent(OUT) :: out_c
-
-    complex(mytype), allocatable, dimension(:,:,:) :: wk1,wk2,wk2b,wk3
-    integer :: k, status, isign
-
-    isign = DECOMP_2D_FFT_FORWARD
-
-    if (format==PHYSICAL_IN_X) then
-
-       ! ===== 1D FFTs in X =====
-       allocate(wk1(sp%xsz(1),sp%xsz(2),sp%xsz(3)))
-       !       status = DftiComputeForward(r2c_x, in_r(:,1,1), wk1(:,1,1))       
-       status = wrapper_r2c(r2c_x, in_r, wk1)
-
-       ! ===== Swap X --> Y =====
-       allocate (wk2(sp%ysz(1),sp%ysz(2),sp%ysz(3)))
-       call transpose_x_to_y(wk1,wk2,sp)
-
-       ! ===== 1D FFTs in Y =====
-       allocate (wk2b(sp%ysz(1),sp%ysz(2),sp%ysz(3)))
-       do k=1,sp%ysz(3)
-          !          status = DftiComputeForward(c2c_y2, wk2(:,1,k), wk2b(:,1,k))
-          status = wrapper_c2c(c2c_y2, wk2(1,1,k), wk2b(1,1,k), isign)
-       end do
-
-       ! ===== Swap Y --> Z =====
-       allocate (wk3(sp%zsz(1),sp%zsz(2),sp%zsz(3)))
-       call transpose_y_to_z(wk2b,wk3,sp)
-
-       ! ===== 1D FFTs in Z =====
-       !       status = DftiComputeForward(c2c_z2, wk3(:,1,1), out_c(:,1,1))
-       status = wrapper_c2c(c2c_z2, wk3, out_c, isign)
-
-    else if (format==PHYSICAL_IN_Z) then
-
-       ! ===== 1D FFTs in Z =====
-       allocate(wk1(sp%zsz(1),sp%zsz(2),sp%zsz(3)))
-       !       status = DftiComputeForward(r2c_z, in_r(:,1,1), wk1(:,1,1))
-       status = wrapper_r2c(r2c_z, in_r, wk1)
-
-       ! ===== Swap Z --> Y =====
-       allocate (wk2(sp%ysz(1),sp%ysz(2),sp%ysz(3)))
-       call transpose_z_to_y(wk1,wk2,sp)
-
-       ! ===== 1D FFTs in Y =====
-       allocate (wk2b(sp%ysz(1),sp%ysz(2),sp%ysz(3)))
-       do k=1,sp%ysz(3)
-          !          status = DftiComputeForward(c2c_y2, wk2(:,1,k), wk2b(:,1,k))
-          status = wrapper_c2c(c2c_y2, wk2(1,1,k), wk2b(1,1,k), isign)
-       end do
-
-       ! ===== Swap Y --> X =====
-       allocate (wk3(sp%xsz(1),sp%xsz(2),sp%xsz(3)))
-       call transpose_y_to_x(wk2b,wk3,sp)
-
-       ! ===== 1D FFTs in X =====
-       !       status = DftiComputeForward(c2c_x2, wk3(:,1,1), out_c(:,1,1))
-       status = wrapper_c2c(c2c_x2, wk3, out_c, isign)
-
-    end if
-
-    return
-  end subroutine fft_3d_r2c
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D inverse FFT - complex to real
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_c2r(in_c, out_r)
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(IN) :: in_c
-    real(mytype), dimension(:,:,:), intent(OUT) :: out_r
-
-    complex(mytype), allocatable, dimension(:,:,:) :: wk1,wk2,wk2b,wk3
-    integer :: k, status, isign
-
-    isign = DECOMP_2D_FFT_BACKWARD
-
-    if (format==PHYSICAL_IN_X) then
-
-       ! ===== 1D FFTs in Z =====
-       allocate (wk1(sp%zsz(1),sp%zsz(2),sp%zsz(3)))
-       !       status = DftiComputeBackward(c2c_z2, in_c(:,1,1), wk1(:,1,1))
-       status = wrapper_c2c(c2c_z2, in_c, wk1, isign)
-
-       ! ===== Swap Z --> Y =====
-       allocate (wk2(sp%ysz(1),sp%ysz(2),sp%ysz(3)))
-       call transpose_z_to_y(wk1,wk2,sp)
-
-       ! ===== 1D FFTs in Y =====
-       allocate (wk2b(sp%ysz(1),sp%ysz(2),sp%ysz(3)))
-       do k=1,sp%ysz(3)
-          !          status = DftiComputeBackward(c2c_y2, wk2(:,1,k), wk2b(:,1,k))
-          status = wrapper_c2c(c2c_y2, wk2(1,1,k), wk2b(1,1,k), isign)
-       end do
-
-       ! ===== Swap Y --> X =====
-       allocate (wk3(sp%xsz(1),sp%xsz(2),sp%xsz(3)))
-       call transpose_y_to_x(wk2b,wk3,sp)
-
-       ! ===== 1D FFTs in X =====
-       !       status = DftiComputeBackward(c2r_x, wk3(:,1,1), out_r(:,1,1))
-       status =  wrapper_c2r(c2r_x, wk3, out_r)
-
-    else if (format==PHYSICAL_IN_Z) then
-
-       ! ===== 1D FFTs in X =====
-       allocate(wk1(sp%xsz(1),sp%xsz(2),sp%xsz(3)))
-       !       status = DftiComputeBackward(c2c_x2, in_c(:,1,1), wk1(:,1,1))
-       status = wrapper_c2c(c2c_x2, in_c, wk1, isign)
-
-       ! ===== Swap X --> Y =====
-       allocate (wk2(sp%ysz(1),sp%ysz(2),sp%ysz(3)))
-       call transpose_x_to_y(wk1,wk2,sp)
-
-       ! ===== 1D FFTs in Y =====
-       allocate (wk2b(sp%ysz(1),sp%ysz(2),sp%ysz(3)))
-       do k=1,sp%ysz(3)
-          !          status = DftiComputeBackward(c2c_y2, wk2(:,1,k), wk2b(:,1,k))
-          status = wrapper_c2c(c2c_y2, wk2(1,1,k), wk2b(1,1,k), isign)
-       end do
-
-       ! ===== Swap Y --> Z =====
-       allocate (wk3(sp%zsz(1),sp%zsz(2),sp%zsz(3)))
-       call transpose_y_to_z(wk2b,wk3,sp)
-
-       ! ===== 1D FFTs in Z =====
-       !       status = DftiComputeBackward(c2r_z, wk3(:,1,1), out_r(:,1,1))
-       status =  wrapper_c2r(c2r_z, wk3, out_r)
-
-    end if
-
-    return
-  end subroutine fft_3d_c2r
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Wrapper functions so that one can pass 3D arrays to DftiCompute
-  !  -- MKL accepts only 1D arrays as input/output for its multi-
-  !     dimensional FFTs.
-  !  -- Using EQUIVALENCE as suggested by MKL documents is impossible 
-  !     for allocated arrays, not to mention bad coding style
-  !  -- All code commented out above may well work but not safe. There 
-  !     is no guarantee that compiler wouldn't make copies of 1D arrays
-  !     (which would contain only one slice of the original 3D data) 
-  !     rather than referring to the same memory address, i.e. 3D array
-  !     A and 1D array A(:,1,1) may refer to different memory location.
-  !  -- Using the following wrappers is safe and standard conforming.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-  integer function wrapper_c2c(desc, in, out, isign)
-
-    implicit none
-
-    type(DFTI_DESCRIPTOR), pointer :: desc
-    complex(mytype), dimension(*) :: in, out
-    integer :: isign, status
-
-    if (isign == DECOMP_2D_FFT_FORWARD) then
-       status = DftiComputeForward(desc, in, out)
-    else if (isign == DECOMP_2D_FFT_BACKWARD) then
-       status = DftiComputeBackward(desc, in, out)
-    end if
-
-    wrapper_c2c = status
-
-    return
-  end function wrapper_c2c
-
-  integer function wrapper_r2c(desc, in, out)
-
-    implicit none
-
-    type(DFTI_DESCRIPTOR), pointer :: desc
-    real(mytype), dimension(*) :: in 
-    complex(mytype), dimension(*) :: out
-
-    wrapper_r2c = DftiComputeForward(desc, in, out)
-
-    return
-  end function wrapper_r2c
-
-  integer function wrapper_c2r(desc, in, out)
-
-    implicit none
-
-    type(DFTI_DESCRIPTOR), pointer :: desc
-    complex(mytype), dimension(*) :: in 
-    real(mytype), dimension(*) :: out
-
-    wrapper_c2r = DftiComputeBackward(desc, in, out)
-
-    return
-  end function wrapper_c2r
-
-end module decomp_2d_fft
diff --git a/decomp2d/glassman.f90 b/decomp2d/glassman.f90
deleted file mode 100644
index 05545e858..000000000
--- a/decomp2d/glassman.f90
+++ /dev/null
@@ -1,184 +0,0 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This module contains a few 'generic' FFT routines, making the 
-! 2DECOMP&FFT library not dependent on any external libraries
-
-module glassman
-
-  use decomp_2d, only : mytype
-
-  implicit none
-
-contains
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Following is a FFT implementation based on algorithm proposed by
-  ! Glassman, a general FFT algorithm supporting arbitrary input length.
-  !
-  ! W. E. Ferguson, Jr., "A simple derivation of Glassman general-n fast
-  ! Fourier transform," Comput. and Math. with Appls., vol. 8, no. 6, pp.
-  ! 401-411, 1982.
-  !
-  ! Original implemtation online at http://www.jjj.de/fft/fftpage.html
-  !
-  ! Updated  
-  !  -  to handle double-precision as well
-  !  -  unnecessary scaling code removed
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-  SUBROUTINE SPCFFT(U,N,ISIGN,WORK)
-
-    IMPLICIT NONE
-
-    LOGICAL :: INU
-    INTEGER :: A,B,C,N,I,ISIGN
-    COMPLEX(mytype) :: U(*),WORK(*)
-
-    A = 1
-    B = N
-    C = 1
-    INU = .TRUE.
-
-    DO WHILE ( B .GT. 1 )
-       A = C * A
-       C = 2
-       DO WHILE ( MOD(B,C) .NE. 0 )
-          C = C + 1
-       END DO
-       B = B / C
-       IF ( INU ) THEN
-          CALL SPCPFT (A,B,C,U,WORK,ISIGN)
-       ELSE
-          CALL SPCPFT (A,B,C,WORK,U,ISIGN)
-       END IF
-       INU = ( .NOT. INU )
-    END DO
-
-    IF ( .NOT. INU ) THEN
-       DO I = 1, N
-          U(I) = WORK(I)
-       END DO
-    END IF
-
-    RETURN
-  END SUBROUTINE SPCFFT
-
-
-  SUBROUTINE SPCPFT( A, B, C, UIN, UOUT, ISIGN )
-
-    IMPLICIT NONE
-
-    INTEGER :: ISIGN,A,B,C,IA,IB,IC,JCR,JC
-
-    DOUBLE PRECISION :: ANGLE
-
-    COMPLEX(mytype) :: UIN(B,C,A),UOUT(B,A,C),DELTA,OMEGA,SUM
-
-    ANGLE = 6.28318530717958_mytype / REAL( A * C, kind=mytype )
-    OMEGA = CMPLX( 1.0, 0.0, kind=mytype )
-
-    IF( ISIGN .EQ. 1 ) THEN
-       DELTA = CMPLX( DCOS(ANGLE), DSIN(ANGLE), kind=mytype )
-    ELSE
-       DELTA = CMPLX( DCOS(ANGLE), -DSIN(ANGLE), kind=mytype )
-    END IF
-
-    DO IC = 1, C
-       DO IA = 1, A
-          DO IB = 1, B
-             SUM = UIN( IB, C, IA )
-             DO JCR = 2, C
-                JC = C + 1 - JCR
-                SUM = UIN( IB, JC, IA ) + OMEGA * SUM
-             END DO
-             UOUT( IB, IA, IC ) = SUM
-          END DO
-          OMEGA = DELTA * OMEGA
-       END DO
-    END DO
-
-    RETURN
-  END SUBROUTINE SPCPFT
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! A 3D real-to-complex routine implemented using the 1D FFT above
-  !   Input:   nx*ny*nz real numbers
-  !   Output:  (nx/2+1)*ny*nz complex numbers
-  ! Just like big FFT libraries (such as FFTW) do
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine glassman_3d_r2c(in_r,nx,ny,nz,out_c)
-
-    implicit none
-
-    integer, intent(IN) :: nx,ny,nz
-    real(mytype), dimension(nx,ny,nz) :: in_r
-    complex(mytype), dimension(nx/2+1,ny,nz) :: out_c
-
-    complex(mytype), allocatable, dimension(:) :: buf, scratch
-    integer :: maxsize, i,j,k
-
-    maxsize = max(nx, max(ny,nz))
-    allocate(buf(maxsize))
-    allocate(scratch(maxsize))
-
-    ! ===== 1D FFTs in X =====
-    do k=1,nz
-       do j=1,ny
-          ! Glassman's 1D FFT is c2c only, 
-          ! needing some pre- and post-processing for r2c
-          ! pack real input in complex storage
-          do i=1,nx
-             buf(i) = cmplx(in_r(i,j,k),0._mytype, kind=mytype)
-          end do
-          call spcfft(buf,nx,-1,scratch)
-          ! simply drop the redundant part of the complex output
-          do i=1,nx/2+1
-             out_c(i,j,k) = buf(i)
-          end do
-       end do
-    end do
-
-    ! ===== 1D FFTs in Y =====
-    do k=1,nz
-       do i=1,nx/2+1
-          do j=1,ny
-             buf(j) = out_c(i,j,k)
-          end do
-          call spcfft(buf,ny,-1,scratch)
-          do j=1,ny
-             out_c(i,j,k) = buf(j)
-          end do
-       end do
-    end do
-
-    ! ===== 1D FFTs in Z =====
-    do j=1,ny
-       do i=1,nx/2+1
-          do k=1,nz
-             buf(k) = out_c(i,j,k)
-          end do
-          call spcfft(buf,nz,-1,scratch)
-          do k=1,nz
-             out_c(i,j,k) = buf(k)
-          end do
-       end do
-    end do
-
-    deallocate(buf,scratch)
-
-    return
-  end subroutine glassman_3d_r2c
-
-
-end module glassman
-
diff --git a/decomp2d/halo.inc b/decomp2d/halo.inc
deleted file mode 100644
index 9fd4b7617..000000000
--- a/decomp2d/halo.inc
+++ /dev/null
@@ -1,115 +0,0 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-! Halo cell support for neighbouring pencils to exchange data
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-subroutine update_halo_real(in, out, level, opt_decomp, opt_global)
-
-implicit none
-
-integer, intent(IN) :: level      ! levels of halo cells required
-real(mytype), dimension(:,:,:), intent(IN) :: in    
-real(mytype), allocatable, dimension(:,:,:), intent(OUT) :: out
-TYPE(DECOMP_INFO), optional :: opt_decomp
-logical, optional :: opt_global
-
-TYPE(DECOMP_INFO) :: decomp
-logical :: global
-
-! starting/ending index of array with halo cells
-integer :: xs, ys, zs, xe, ye, ze
-
-integer :: i, j, k, s1, s2, s3, ierror
-integer :: data_type
-
-integer :: icount, ilength, ijump 
-integer :: halo12, halo21, halo31, halo32                 
-integer, dimension(4) :: requests
-integer, dimension(MPI_STATUS_SIZE,4) :: status
-integer :: tag_e, tag_w, tag_n, tag_s, tag_t, tag_b
-
-data_type = real_type
-
-#include "halo_common.inc"
-
-return
-end subroutine update_halo_real
-
-
-subroutine update_halo_complex(in, out, level, opt_decomp, opt_global)
-
-implicit none
-
-integer, intent(IN) :: level      ! levels of halo cells required
-complex(mytype), dimension(:,:,:), intent(IN) :: in    
-complex(mytype), allocatable, dimension(:,:,:), intent(OUT) :: out
-TYPE(DECOMP_INFO), optional :: opt_decomp
-logical, optional :: opt_global
-
-TYPE(DECOMP_INFO) :: decomp
-logical :: global
-
-! starting/ending index of array with halo cells
-integer :: xs, ys, zs, xe, ye, ze
-
-integer :: i, j, k, s1, s2, s3, ierror
-integer :: data_type
-
-integer :: icount, ilength, ijump 
-integer :: halo12, halo21, halo31, halo32                 
-integer, dimension(4) :: requests
-integer, dimension(MPI_STATUS_SIZE,4) :: status
-integer :: tag_e, tag_w, tag_n, tag_s, tag_t, tag_b
-
-data_type = complex_type
-
-#include "halo_common.inc"
-
-return
-end subroutine update_halo_complex
-
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-! To support halo-cell exchange:
-!   find the MPI ranks of neighbouring pencils
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-subroutine init_neighbour
-
-integer :: ierror
-
-! For X-pencil
-neighbour(1,1) = MPI_PROC_NULL               ! east
-neighbour(1,2) = MPI_PROC_NULL               ! west
-call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_X, 0, 1, &
-neighbour(1,4), neighbour(1,3), ierror) ! north & south
-call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_X, 1, 1, &
-neighbour(1,6), neighbour(1,5), ierror) ! top & bottom
-
-! For Y-pencil
-call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Y, 0, 1, &
-neighbour(2,2), neighbour(2,1), ierror) ! east & west
-neighbour(2,3) = MPI_PROC_NULL               ! north
-neighbour(2,4) = MPI_PROC_NULL               ! south
-call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Y, 1, 1, &
-neighbour(2,6), neighbour(2,5), ierror) ! top & bottom
-
-! For Z-pencil
-call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Z, 0, 1, &
-neighbour(3,2), neighbour(3,1), ierror) ! east & west
-call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Z, 1, 1, &
-neighbour(3,4), neighbour(3,3), ierror) ! north & south
-neighbour(3,5) = MPI_PROC_NULL               ! top
-neighbour(3,6) = MPI_PROC_NULL               ! bottom
-
-return
-end subroutine init_neighbour
diff --git a/decomp2d/halo_common.inc b/decomp2d/halo_common.inc
deleted file mode 100644
index 1064f07a2..000000000
--- a/decomp2d/halo_common.inc
+++ /dev/null
@@ -1,425 +0,0 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This file contain common code to be included by subroutines 
-! 'update_halo_...' in halo.inc
-
-if (present(opt_decomp)) then
-decomp = opt_decomp
-else
-decomp = decomp_main
-end if
-
-if (present(opt_global)) then
-global = opt_global
-else
-global = .false.
-end if
-
-s1 = size(in,1)
-s2 = size(in,2)
-s3 = size(in,3)
-
-! Calculate the starting index and ending index of output
-if (s1==decomp%xsz(1)) then  ! X-pencil input
-if (global) then
-xs = decomp%xst(1) 
-xe = decomp%xen(1)
-ys = decomp%xst(2) - level
-ye = decomp%xen(2) + level
-zs = decomp%xst(3) - level
-ze = decomp%xen(3) + level
-else
-xs = 1 
-xe = s1
-ys = 1 - level
-ye = s2 + level 
-zs = 1 - level
-ze = s3 + level
-end if
-else if (s2==decomp%ysz(2)) then  ! Y-pencil input
-if (global) then
-xs = decomp%yst(1) - level 
-xe = decomp%yen(1) + level
-ys = decomp%yst(2)
-ye = decomp%yen(2)
-zs = decomp%yst(3) - level
-ze = decomp%yen(3) + level
-else
-xs = 1 - level
-xe = s1 + level
-ys = 1
-ye = s2
-zs = 1 - level
-ze = s3 + level
-end if
-else if (s3==decomp%zsz(3)) then  ! Z-pencil input
-if (global) then
-xs = decomp%zst(1) - level 
-xe = decomp%zen(1) + level
-ys = decomp%zst(2) - level
-ye = decomp%zen(2) + level
-zs = decomp%zst(3)
-ze = decomp%zen(3)
-else
-xs = 1 - level
-xe = s1 + level
-ys = 1 - level
-ye = s2 + level
-zs = 1
-ze = s3
-end if
-else
-! invalid input
-call decomp_2d_abort(10, &
-'Invalid data passed to update_halo')
-end if
-
-
-allocate(out(xs:xe, ys:ye, zs:ze))
-!    out = -1.0_mytype ! fill the halo for debugging
-
-! copy input data to output
-if (global) then
-! using global coordinate
-! note the input array passed in always has index starting from 1 
-! need to work out the corresponding global index
-if (s1==decomp%xsz(1)) then
-do k=decomp%xst(3),decomp%xen(3)
-do j=decomp%xst(2),decomp%xen(2)
-do i=1,s1  ! x all local
-out(i,j,k) = in(i,j-decomp%xst(2)+1,k-decomp%xst(3)+1)
-end do
-end do
-end do
-else if (s2==decomp%ysz(2)) then
-do k=decomp%yst(3),decomp%yen(3)
-do j=1,s2  ! y all local
-do i=decomp%yst(1),decomp%yen(1)
-out(i,j,k) = in(i-decomp%yst(1)+1,j,k-decomp%yst(3)+1)
-end do
-end do
-end do
-else if (s3==decomp%zsz(3)) then
-do k=1,s3  ! z all local
-do j=decomp%zst(2),decomp%zen(2)
-do i=decomp%zst(1),decomp%zen(1)
-out(i,j,k) = in(i-decomp%zst(1)+1,j-decomp%zst(2)+1,k)
-end do
-end do
-end do
-end if
-else
-! not using global coordinate
-do k=1,s3
-do j=1,s2
-do i=1,s1
-out(i,j,k) = in(i,j,k)
-end do
-end do
-end do
-end if
-
-! If needed, define MPI derived data type to pack halo data,
-! then call MPI send/receive to exchange halo data
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-! X-pencil
-if (s1==decomp%xsz(1)) then
-
-#ifdef HALO_DEBUG
-if (nrank==4) then
-write(*,*) 'X-pencil input'
-write(*,*) '=============='
-write(*,*) 'Data on a y-z plane is shown'
-write(*,*) 'Before halo exchange'
-do j=ye,ys,-1
-write(*,'(10F4.0)') (out(1,j,k),k=zs,ze)
-end do
-end if
-#endif
-
-! *** east/west ***
-! all data in local memory already, no halo exchange
-
-! *** north/south *** 
-tag_s = coord(1)
-if (coord(1)==dims(1)-1 .AND. periodic_y) then
-tag_n = 0
-else
-tag_n = coord(1) + 1
-end if
-icount = s3 + 2*level
-ilength = level * s1
-ijump = s1*(s2+2*level)
-call MPI_TYPE_VECTOR(icount, ilength, ijump, &
-data_type, halo12, ierror)
-call MPI_TYPE_COMMIT(halo12, ierror)
-! receive from south
-call MPI_IRECV(out(xs,ys,zs), 1, halo12, &
-neighbour(1,4), tag_s, DECOMP_2D_COMM_CART_X, &
-requests(1), ierror)
-! receive from north
-call MPI_IRECV(out(xs,ye-level+1,zs), 1, halo12, &
-neighbour(1,3), tag_n, DECOMP_2D_COMM_CART_X, &
-requests(2), ierror)
-! send to south
-call MPI_ISSEND(out(xs,ys+level,zs), 1, halo12, &
-neighbour(1,4), tag_s, DECOMP_2D_COMM_CART_X, &
-requests(3), ierror)
-! send to north
-call MPI_ISSEND(out(xs,ye-level-level+1,zs), 1, halo12, &
-neighbour(1,3), tag_n, DECOMP_2D_COMM_CART_X, &
-requests(4), ierror)
-call MPI_WAITALL(4, requests, status, ierror)
-call MPI_TYPE_FREE(halo12, ierror)
-#ifdef HALO_DEBUG
-if (nrank==4) then
-write(*,*) 'After exchange in Y'
-do j=ye,ys,-1
-write(*,'(10F4.0)') (out(1,j,k),k=zs,ze)
-end do
-end if
-#endif
-
-! *** top/bottom ***
-! no need to define derived data type as data on xy-planes
-! all contiguous in memory, which can be sent/received using
-! MPI directly
-tag_b = coord(2)
-if (coord(2)==dims(2)-1 .AND. periodic_z) then
-tag_t = 0
-else
-tag_t = coord(2) + 1
-end if
-icount = (s1 * (s2+2*level)) * level
-! receive from bottom
-call MPI_IRECV(out(xs,ys,zs), icount, data_type, &
-neighbour(1,6), tag_b, DECOMP_2D_COMM_CART_X, &
-requests(1), ierror)
-! receive from top
-call MPI_IRECV(out(xs,ys,ze-level+1), icount, data_type, &
-neighbour(1,5), tag_t, DECOMP_2D_COMM_CART_X, &
-requests(2), ierror)
-! send to bottom
-call MPI_ISSEND(out(xs,ys,zs+level), icount, data_type, &
-neighbour(1,6), tag_b, DECOMP_2D_COMM_CART_X, &
-requests(3), ierror)
-! send to top
-call MPI_ISSEND(out(xs,ys,ze-level-level+1), icount, data_type, &
-neighbour(1,5), tag_t, DECOMP_2D_COMM_CART_X, &
-requests(4), ierror)
-call MPI_WAITALL(4, requests, status, ierror)
-#ifdef HALO_DEBUG       
-if (nrank==4) then
-write(*,*) 'After exchange in Z'
-do j=ye,ys,-1
-write(*,'(10F4.0)') (out(1,j,k),k=zs,ze)
-end do
-end if
-#endif       
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!   
-! Y-pencil   
-else if (s2==decomp%ysz(2)) then
-
-#ifdef HALO_DEBUG
-if (nrank==4) then
-write(*,*) 'Y-pencil input'
-write(*,*) '=============='
-write(*,*) 'Data on a x-z plane is shown'
-write(*,*) 'Before halo exchange'
-do i=xe,xs,-1
-write(*,'(10F4.0)') (out(i,1,k),k=zs,ze)
-end do
-end if
-#endif
-
-! *** east/west ***
-tag_w = coord(1)
-if (coord(1)==dims(1)-1 .AND. periodic_x) then
-tag_e = 0
-else
-tag_e = coord(1) + 1
-end if
-icount = s2*(s3+2*level)
-ilength = level
-ijump = s1+2*level
-call MPI_TYPE_VECTOR(icount, ilength, ijump, &
-data_type, halo21, ierror)
-call MPI_TYPE_COMMIT(halo21, ierror)
-! receive from west
-call MPI_IRECV(out(xs,ys,zs), 1, halo21, &
-neighbour(2,2), tag_w, DECOMP_2D_COMM_CART_Y, &
-requests(1), ierror)
-! receive from east
-call MPI_IRECV(out(xe-level+1,ys,zs), 1, halo21, &
-neighbour(2,1), tag_e, DECOMP_2D_COMM_CART_Y, &
-requests(2), ierror)
-! send to west
-call MPI_ISSEND(out(xs+level,ys,zs), 1, halo21, &
-neighbour(2,2), tag_w, DECOMP_2D_COMM_CART_Y, &
-requests(3), ierror)
-! send to east
-call MPI_ISSEND(out(xe-level-level+1,ys,zs), 1, halo21, &
-neighbour(2,1), tag_e, DECOMP_2D_COMM_CART_Y, &
-requests(4), ierror)
-call MPI_WAITALL(4, requests, status, ierror)
-call MPI_TYPE_FREE(halo21, ierror)
-#ifdef HALO_DEBUG       
-if (nrank==4) then
-write(*,*) 'After exchange in X'
-do i=xe,xs,-1
-write(*,'(10F4.0)') (out(i,1,k),k=zs,ze)
-end do
-end if
-#endif
-
-! *** north/south ***
-! all data in local memory already, no halo exchange
-
-! *** top/bottom ***
-! no need to define derived data type as data on xy-planes
-! all contiguous in memory, which can be sent/received using
-! MPI directly
-tag_b = coord(2)
-if (coord(2)==dims(2)-1 .AND. periodic_z) then
-tag_t = 0
-else
-tag_t = coord(2) + 1
-end if
-icount = (s2 * (s1+2*level)) * level
-! receive from bottom
-call MPI_IRECV(out(xs,ys,zs), icount, data_type, &
-neighbour(2,6), tag_b, DECOMP_2D_COMM_CART_Y, &
-requests(1), ierror)
-! receive from top
-call MPI_IRECV(out(xs,ys,ze-level+1), icount, data_type, &
-neighbour(2,5), tag_t, DECOMP_2D_COMM_CART_Y, &
-requests(2), ierror)
-! send to bottom
-call MPI_ISSEND(out(xs,ys,zs+level), icount, data_type, &
-neighbour(2,6), tag_b, DECOMP_2D_COMM_CART_Y, &
-requests(3), ierror)
-! send to top
-call MPI_ISSEND(out(xs,ys,ze-level-level+1), icount, data_type, &
-neighbour(2,5), tag_t, DECOMP_2D_COMM_CART_Y, &
-requests(4), ierror)
-call MPI_WAITALL(4, requests, status, ierror)
-#ifdef HALO_DEBUG
-if (nrank==4) then
-write(*,*) 'After exchange in Z'
-do i=xe,xs,-1
-write(*,'(10F4.0)') (out(i,1,k),k=zs,ze)
-end do
-end if
-#endif
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!   
-! Z-pencil
-else if (s3==decomp%zsz(3)) then   
-
-#ifdef HALO_DEBUG
-if (nrank==4) then
-write(*,*) 'Z-pencil input'
-write(*,*) '=============='
-write(*,*) 'Data on a x-y plane is shown'
-write(*,*) 'Before halo exchange'
-do i=xe,xs,-1
-write(*,'(10F4.0)') (out(i,j,1),j=ys,ye)
-end do
-end if
-#endif
-
-! *** east/west ***
-tag_w = coord(1)
-if (coord(1)==dims(1)-1 .AND. periodic_x) then
-tag_e = 0
-else
-tag_e = coord(1) + 1
-end if
-icount = (s2+2*level)*s3
-ilength = level
-ijump = s1+2*level
-call MPI_TYPE_VECTOR(icount, ilength, ijump, &
-data_type, halo31, ierror)
-call MPI_TYPE_COMMIT(halo31, ierror)
-! receive from west
-call MPI_IRECV(out(xs,ys,zs), 1, halo31, &
-neighbour(3,2), tag_w, DECOMP_2D_COMM_CART_Z, &
-requests(1), ierror)
-! receive from east
-call MPI_IRECV(out(xe-level+1,ys,zs), 1, halo31, &
-neighbour(3,1), tag_e, DECOMP_2D_COMM_CART_Z, &
-requests(2), ierror)
-! send to west
-call MPI_ISSEND(out(xs+level,ys,zs), 1, halo31, &
-neighbour(3,2), tag_w, DECOMP_2D_COMM_CART_Z, &
-requests(3), ierror)
-! send to east
-call MPI_ISSEND(out(xe-level-level+1,ys,zs), 1, halo31, &
-neighbour(3,1), tag_e, DECOMP_2D_COMM_CART_Z, &
-requests(4), ierror)
-call MPI_WAITALL(4, requests, status, ierror)
-call MPI_TYPE_FREE(halo31, ierror)       
-#ifdef HALO_DEBUG
-if (nrank==4) then
-write(*,*) 'After exchange in X'
-do i=xe,xs,-1
-write(*,'(10F4.0)') (out(i,j,1),j=ys,ye)
-end do
-end if
-#endif
-
-! *** north/south *** 
-tag_s = coord(2)
-if (coord(2)==dims(2)-1 .AND. periodic_y) then
-tag_n = 0
-else
-tag_n = coord(2) + 1
-end if
-icount = s3
-ilength = level * (s1+2*level)
-ijump = (s1+2*level) * (s2+2*level)
-call MPI_TYPE_VECTOR(icount, ilength, ijump, &
-data_type, halo32, ierror)
-call MPI_TYPE_COMMIT(halo32, ierror)
-! receive from south
-call MPI_IRECV(out(xs,ys,zs), 1, halo32, &
-neighbour(3,4), tag_s, DECOMP_2D_COMM_CART_Z, &
-requests(1), ierror)
-! receive from north
-call MPI_IRECV(out(xs,ye-level+1,zs), 1, halo32, &
-neighbour(3,3), tag_n, DECOMP_2D_COMM_CART_Z, &
-requests(2), ierror)
-! send to south
-call MPI_ISSEND(out(xs,ys+level,zs), 1, halo32, &
-neighbour(3,4), tag_s, DECOMP_2D_COMM_CART_Z, &
-requests(3), ierror)
-! send to north
-call MPI_ISSEND(out(xs,ye-level-level+1,zs), 1, halo32, &
-neighbour(3,3), tag_n, DECOMP_2D_COMM_CART_Z, &
-requests(4), ierror)
-call MPI_WAITALL(4, requests, status, ierror)
-call MPI_TYPE_FREE(halo32, ierror)
-#ifdef HALO_DEBUG
-if (nrank==4) then
-write(*,*) 'After exchange in Y'
-do i=xe,xs,-1
-write(*,'(10F4.0)') (out(i,j,1),j=ys,ye)
-end do
-end if
-#endif
-
-! *** top/bottom ***
-! all data in local memory already, no halo exchange
-
-end if  ! pencil
diff --git a/decomp2d/io.f90 b/decomp2d/io.f90
deleted file mode 100644
index 4cb7f28d5..000000000
--- a/decomp2d/io.f90
+++ /dev/null
@@ -1,1900 +0,0 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2013 Ning Li, the Numerical Algorithms Group (NAG)
-! Copyright (C) 2021               the University of Edinburgh (UoE)
-!
-!=======================================================================
-
-! This module provides parallel IO facilities for applications based on
-! 2D decomposition.
-
-module decomp_2d_io
-
-  use decomp_2d
-  use MPI
-#ifdef T3PIO
-  use t3pio
-#endif
-
-#ifdef ADIOS2
-  use adios2
-#endif
-
-  implicit none
-
-  integer, parameter, public :: decomp_2d_write_mode = 1, decomp_2d_read_mode = 2, &
-       decomp_2d_append_mode = 3
-  integer, parameter :: MAX_IOH = 10 ! How many live IO things should we handle?
-  character(len=*), parameter :: io_sep = "::"
-  integer, save :: nreg_io = 0
-  integer, dimension(MAX_IOH), save :: io_step
-#ifndef ADIOS2
-  integer, dimension(MAX_IOH), save :: fh_registry
-  logical, dimension(MAX_IOH), target, save :: fh_live
-  character(len=1024), dimension(MAX_IOH), target, save :: fh_names
-  integer(kind=MPI_OFFSET_KIND), dimension(MAX_IOH), save :: fh_disp
-#else
-  type(adios2_adios) :: adios
-  character(len=1024), dimension(MAX_IOH), target, save :: engine_names
-  logical, dimension(MAX_IOH), target, save :: engine_live
-  type(adios2_engine), dimension(MAX_IOH), save :: engine_registry
-#endif
-  
-  private        ! Make everything private unless declared public
-
-  public :: decomp_2d_write_one, decomp_2d_read_one, &
-       decomp_2d_write_var, decomp_2d_read_var, &
-       decomp_2d_write_scalar, decomp_2d_read_scalar, &
-       decomp_2d_write_plane, decomp_2d_write_every, &
-       decomp_2d_write_subdomain, &
-       decomp_2d_write_outflow, decomp_2d_read_inflow, &
-       decomp_2d_io_init, decomp_2d_io_finalise, & ! XXX: initialise/finalise 2decomp&fft IO module
-       decomp_2d_init_io, & ! XXX: initialise an io process - awful naming
-       decomp_2d_register_variable, &
-       decomp_2d_open_io, decomp_2d_close_io, &
-       decomp_2d_start_io, decomp_2d_end_io, &
-       gen_iodir_name, &
-       decomp_2d_set_io_step
-
-  ! Generic interface to handle multiple data types
-
-  interface decomp_2d_write_one
-     module procedure write_one_real
-     module procedure write_one_complex
-     module procedure mpiio_write_real_coarse
-     module procedure mpiio_write_real_probe
-  end interface decomp_2d_write_one
-
-  interface decomp_2d_read_one
-     module procedure read_one_real
-     module procedure read_one_complex
-  end interface decomp_2d_read_one
-
-  interface decomp_2d_write_var
-     module procedure write_var_real
-     module procedure write_var_complex
-  end interface decomp_2d_write_var
-
-  interface decomp_2d_read_var
-     module procedure read_var_real
-     module procedure read_var_complex
-  end interface decomp_2d_read_var
-
-  interface decomp_2d_write_scalar
-     module procedure write_scalar_real
-     module procedure write_scalar_complex
-     module procedure write_scalar_integer
-     module procedure write_scalar_logical
-  end interface decomp_2d_write_scalar
-
-  interface decomp_2d_read_scalar
-     module procedure read_scalar_real
-     module procedure read_scalar_complex
-     module procedure read_scalar_integer
-     module procedure read_scalar_logical
-  end interface decomp_2d_read_scalar
-
-  interface decomp_2d_write_plane
-     module procedure write_plane_3d_real
-     module procedure write_plane_3d_complex
-     !     module procedure write_plane_2d
-  end interface decomp_2d_write_plane
-
-  interface decomp_2d_write_every
-     module procedure write_every_real
-     module procedure write_every_complex
-  end interface decomp_2d_write_every
-
-  interface decomp_2d_write_subdomain
-     module procedure write_subdomain
-  end interface decomp_2d_write_subdomain
-
-  interface decomp_2d_write_outflow
-     module procedure write_outflow
-  end interface decomp_2d_write_outflow
-
-  interface decomp_2d_read_inflow
-     module procedure read_inflow
-  end interface decomp_2d_read_inflow
-
-contains
-
-  subroutine decomp_2d_io_init()
-
-#ifdef ADIOS2
-  integer :: ierror
-  character(len=80) :: config_file="adios2_config.xml"
-#endif
-
-#ifdef ADIOS2
-  call adios2_init(adios, trim(config_file), MPI_COMM_WORLD, ierror)
-  if (ierror.ne.0) then
-     print *, "Error initialising ADIOS2 - is adios2_config.xml present and valid?"
-     call MPI_ABORT(MPI_COMM_WORLD, -1, ierror)
-  endif
-
-  engine_live(:) = .false.
-#endif
-  
-  end subroutine decomp_2d_io_init
-  subroutine decomp_2d_io_finalise()
-
-#ifdef ADIOS2
-    use adios2
-#endif
-
-    implicit none
-
-#ifdef ADIOS2
-    integer :: ierror
-#endif
-    
-#ifdef ADIOS2
-    call adios2_finalize(adios, ierror)
-#endif
-
-  end subroutine decomp_2d_io_finalise
-  
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Using MPI-IO library to write a single 3D array to a file
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine write_one_real(ipencil,var,filename,opt_decomp)
-
-    implicit none
-
-    integer, intent(IN) :: ipencil
-    real(mytype), dimension(:,:,:), intent(IN) :: var
-    character(len=*), intent(IN) :: filename
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-    TYPE(DECOMP_INFO) :: decomp
-    integer(kind=MPI_OFFSET_KIND) :: filesize, disp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, newtype, fh, data_type, info, gs
-
-    data_type = real_type
-
-#include "io_write_one.inc"
-
-    return
-  end subroutine write_one_real
-
-  subroutine write_one_complex(ipencil,var,filename,opt_decomp)
-
-    implicit none
-
-    integer, intent(IN) :: ipencil
-    complex(mytype), dimension(:,:,:), intent(IN) :: var
-    character(len=*), intent(IN) :: filename
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-    TYPE(DECOMP_INFO) :: decomp
-    integer(kind=MPI_OFFSET_KIND) :: filesize, disp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, newtype, fh, data_type, info, gs
-
-    data_type = complex_type
-
-#include "io_write_one.inc"
-
-    return
-  end subroutine write_one_complex
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Using MPI-IO library to read from a file a single 3D array
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine read_one_real(ipencil,var,dirname,varname,io_name,opt_decomp,reduce_prec)
-
-    implicit none
-
-    integer, intent(IN) :: ipencil
-    real(mytype), dimension(:,:,:), intent(INOUT) :: var
-    character(len=*), intent(IN) :: varname, dirname, io_name
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-    logical, intent(in), optional :: reduce_prec
-
-    logical :: read_reduce_prec
-    
-    TYPE(DECOMP_INFO) :: decomp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, newtype, data_type
-    real(mytype_single), allocatable, dimension(:,:,:) :: varsingle
-    integer :: idx
-    integer :: disp_bytes
-    character(len=:), allocatable :: full_io_name
-    logical :: opened_new, dir_exists
- 
-    ! Safety check
-    if (.not. is_contiguous(var)) call decomp_2d_abort(-1, "read_one_real : argument must be contiguous")
-
-    read_reduce_prec = .true.
-    
-    idx = get_io_idx(io_name, dirname)
-#ifndef ADIOS2
-    opened_new = .false.
-    if (idx .lt. 1) then
-       ! Check file exists
-       full_io_name = trim(dirname)//"/"//trim(varname)
-       if (nrank==0) then
-          inquire(file=full_io_name, exist=dir_exists)
-          if (.not.dir_exists) then
-             print *, "ERROR: cannot read from", full_io_name, " directory doesn't exist!"
-             stop
-          end if
-       end if
-       
-       call decomp_2d_open_io(io_name, full_io_name, decomp_2d_read_mode)
-       idx = get_io_idx(io_name, full_io_name)
-       opened_new = .true.
-    end if
-    
-    if (present(reduce_prec)) then
-       if (.not. reduce_prec) then
-          read_reduce_prec = .false.
-       end if
-    end if
-    if (read_reduce_prec) then
-       data_type = real_type_single
-    else
-       data_type = real_type
-    end if
-    call MPI_TYPE_SIZE(data_type,disp_bytes,ierror)
-
-    !! Use MPIIO
-    if (read_reduce_prec) then
-       allocate (varsingle(xstV(1):xenV(1),xstV(2):xenV(2),xstV(3):xenV(3)))
-    end if
-    
-    if (present(opt_decomp)) then
-       decomp = opt_decomp
-    else
-       call get_decomp_info(decomp)
-    end if
-
-    ! determine subarray parameters
-    sizes(1) = decomp%xsz(1)
-    sizes(2) = decomp%ysz(2)
-    sizes(3) = decomp%zsz(3)
-
-    if (ipencil == 1) then
-       subsizes(1) = decomp%xsz(1)
-       subsizes(2) = decomp%xsz(2)
-       subsizes(3) = decomp%xsz(3)
-       starts(1) = decomp%xst(1)-1  ! 0-based index
-       starts(2) = decomp%xst(2)-1
-       starts(3) = decomp%xst(3)-1
-    else if (ipencil == 2) then
-       subsizes(1) = decomp%ysz(1)
-       subsizes(2) = decomp%ysz(2)
-       subsizes(3) = decomp%ysz(3)
-       starts(1) = decomp%yst(1)-1
-       starts(2) = decomp%yst(2)-1
-       starts(3) = decomp%yst(3)-1
-    else if (ipencil == 3) then
-       subsizes(1) = decomp%zsz(1)
-       subsizes(2) = decomp%zsz(2)
-       subsizes(3) = decomp%zsz(3)
-       starts(1) = decomp%zst(1)-1
-       starts(2) = decomp%zst(2)-1
-       starts(3) = decomp%zst(3)-1
-    else
-       call decomp_2d_abort(-1, "IO/read_one_real : Wrong value for ipencil")
-    endif
-
-    if ((subsizes(1) > size(var, 1)) .or. (subsizes(2) > size(var, 2)) &
-         .or. (subsizes(3) > size(var, 3))) then
-       print *, "ERROR: trying to read ", subsizes, "sized array for "//varname//" variable array is ", size(var)
-       stop
-    end if
-
-    associate(fh => fh_registry(idx), &
-         disp => fh_disp(idx))
-      call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-           MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-      call MPI_TYPE_COMMIT(newtype,ierror)
-      call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-           newtype,'native',MPI_INFO_NULL,ierror)
-      if (read_reduce_prec) then
-         call MPI_FILE_READ_ALL(fh, varsingle, &
-              subsizes(1)*subsizes(2)*subsizes(3), &
-              data_type, MPI_STATUS_IGNORE, ierror)
-         var = real(varsingle,mytype)
-         deallocate(varsingle)
-      else
-         call MPI_FILE_READ_ALL(fh, var, &
-              subsizes(1)*subsizes(2)*subsizes(3), &
-              data_type, MPI_STATUS_IGNORE, ierror)
-      endif
-      if (ierror /= 0) then
-         print *, "ERROR in MPI_FILE_READ_ALL"
-         stop
-      end if
-      call MPI_TYPE_FREE(newtype,ierror)
-
-      disp = disp + int(sizes(1), MPI_OFFSET_KIND) * &
-                    int(sizes(2), MPI_OFFSET_KIND) * &
-                    int(sizes(3), MPI_OFFSET_KIND) * &
-                    int(disp_bytes, MPI_OFFSET_KIND)
-    end associate
-
-    if (opened_new) then
-       call decomp_2d_close_io(io_name, full_io_name)
-       deallocate(full_io_name)
-    end if
-#else
-    call adios2_read_one_real(ipencil, var, dirname, varname, io_name)
-#endif
-    return
-  end subroutine read_one_real
-
-
-  subroutine read_one_complex(ipencil,var,filename,opt_decomp)
-
-    implicit none
-
-    integer, intent(IN) :: ipencil
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: var
-    character(len=*), intent(IN) :: filename
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-    TYPE(DECOMP_INFO) :: decomp
-    integer(kind=MPI_OFFSET_KIND) :: disp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, newtype, fh, data_type
-
-    data_type = complex_type
-
-#include "io_read_one.inc"
-
-    return
-
-  end subroutine read_one_complex
-
-#ifdef ADIOS2
-  subroutine adios2_read_one_real(ipencil,var,engine_name,varname,io_name)
-
-    implicit none
-
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    character(len=*), intent(in) :: engine_name
-    character(len=*), intent(in) :: io_name
-    character*(*), intent(in) :: varname
-    real(mytype), dimension(:,:,:), intent(out) :: var
-    
-    integer (kind=MPI_OFFSET_KIND) :: filesize, disp
-    integer :: i,j,k, ierror, newtype, fh
-    type(adios2_io) :: io_handle
-    type(adios2_variable) :: var_handle
-    integer :: idx
-
-    call adios2_at_io(io_handle, adios, io_name, ierror)
-    call adios2_inquire_variable(var_handle, io_handle, varname, ierror)
-    if (.not.var_handle % valid) then
-       print *, "ERROR: trying to read variable without registering first! ", varname
-       stop
-    endif
-
-    idx = get_io_idx(io_name, engine_name)
-    call adios2_get(engine_registry(idx), var_handle, var, adios2_mode_deferred, ierror)
-
-    return
-    
-  end subroutine adios2_read_one_real
-#endif
-  
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Write a 3D array as part of a big MPI-IO file, starting from 
-  !  displacement 'disp'; 'disp' will be updated after the writing
-  !  operation to prepare the writing of next chunk of data.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine write_var_real(fh,disp,ipencil,var,opt_decomp)
-
-    implicit none
-
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: ipencil
-    real(mytype), dimension(:,:,:), intent(IN) :: var
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-    TYPE(DECOMP_INFO) :: decomp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, newtype, data_type
-
-    data_type = real_type
-
-#include "io_write_var.inc"
-
-    return
-  end subroutine write_var_real
-
-  subroutine write_var_complex(fh,disp,ipencil,var,opt_decomp)
-
-    implicit none
-
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: ipencil
-    complex(mytype), dimension(:,:,:), intent(IN) :: var
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-    TYPE(DECOMP_INFO) :: decomp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, newtype, data_type
-
-    data_type = complex_type
-
-#include "io_write_var.inc"
-
-    return
-  end subroutine write_var_complex
-
-
-  subroutine write_outflow(dirname,varname,ntimesteps,var,io_name,opt_decomp)
-
-    implicit none
-
-    character(len=*), intent(in) :: dirname, varname, io_name
-    integer, intent(IN) :: ntimesteps 
-    real(mytype), dimension(:,:,:), intent(IN) :: var
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-    TYPE(DECOMP_INFO) :: decomp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, newtype, data_type
-    integer :: idx
-#ifdef ADIOS2
-    type(adios2_io) :: io_handle
-    type(adios2_variable) :: var_handle
-#endif
-
-    data_type = real_type
-
-#include "io_write_outflow.f90"
-
-    return
-  end subroutine write_outflow
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Read a 3D array as part of a big MPI-IO file, starting from 
-  !  displacement 'disp'; 'disp' will be updated after the reading
-  !  operation to prepare the reading of next chunk of data.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine read_var_real(fh,disp,ipencil,var,opt_decomp)
-
-    implicit none
-
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: ipencil
-    real(mytype), dimension(:,:,:), intent(INOUT) :: var
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-    TYPE(DECOMP_INFO) :: decomp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, newtype, data_type
-
-    data_type = real_type
-
-#include "io_read_var.inc"
-
-    return
-  end subroutine read_var_real
-
-
-  subroutine read_var_complex(fh,disp,ipencil,var,opt_decomp)
-
-    implicit none
-
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: ipencil
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: var
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-    TYPE(DECOMP_INFO) :: decomp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, newtype, data_type
-
-    data_type = complex_type
-
-#include "io_read_var.inc"
-
-    return
-  end subroutine read_var_complex
-
-
-  subroutine read_inflow(dirname,varname,ntimesteps,var,io_name,opt_decomp)
- 
-    implicit none
-
-    character(len=*), intent(in) :: dirname, varname, io_name
-    integer, intent(IN) :: ntimesteps
-    real(mytype), dimension(:,:,:), intent(INOUT) :: var
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-    TYPE(DECOMP_INFO) :: decomp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, newtype, data_type
-    integer :: idx
-#ifdef ADIOS2
-    integer(kind=8) :: steps
-    type(adios2_io) :: io_handle
-    type(adios2_variable) :: var_handle
-#endif
-    
-    data_type = real_type
-
-#include "io_read_inflow.f90"
-
-    return
-  end subroutine read_inflow
-
-  subroutine decomp_2d_set_io_step(io_name, io_dir, step)
-
-    character(len=*), intent(in) :: io_name, io_dir
-    integer, intent(in) :: step
-
-    integer :: idx
-
-    idx = get_io_idx(io_name, io_dir)
-    if (idx < 1) then
-       print *, "ERROR!"
-       stop
-    end if
-
-    io_step(idx) = step
-    
-  end subroutine decomp_2d_set_io_step
-  
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Write scalar variables as part of a big MPI-IO file, starting from 
-  !  displacement 'disp'; 'disp' will be updated after the reading
-  !  operation to prepare the reading of next chunk of data.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine write_scalar_real(fh,disp,n,var)
-
-    implicit none
-
-    integer, intent(IN) :: fh             ! file handle
-    integer(KIND=MPI_OFFSET_KIND), &
-         intent(INOUT) :: disp            ! displacement
-    integer, intent(IN) :: n              ! number of scalars
-    real(mytype), dimension(n), &
-         intent(IN) :: var                ! array of scalars
-
-    integer :: m, ierror
-
-    call MPI_FILE_SET_VIEW(fh,disp,real_type, &
-         real_type,'native',MPI_INFO_NULL,ierror)
-    if (nrank==0) then
-       m = n ! only one rank needs to write
-    else
-       m = 0
-    end if
-    call MPI_FILE_WRITE_ALL(fh, var, m, real_type, &
-         MPI_STATUS_IGNORE, ierror)
-    disp = disp + n*mytype_bytes
-
-    return
-  end subroutine write_scalar_real
-
-
-  subroutine write_scalar_complex(fh,disp,n,var)
-
-    implicit none
-
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: n
-    complex(mytype), dimension(n), intent(IN) :: var
-
-    integer :: m, ierror
-
-    call MPI_FILE_SET_VIEW(fh,disp,complex_type, &
-         complex_type,'native',MPI_INFO_NULL,ierror)
-    if (nrank==0) then
-       m = n
-    else
-       m = 0
-    end if
-    call MPI_FILE_WRITE_ALL(fh, var, m, complex_type, &
-         MPI_STATUS_IGNORE, ierror)
-    disp = disp + n*mytype_bytes*2
-
-    return
-  end subroutine write_scalar_complex
-
-
-  subroutine write_scalar_integer(fh,disp,n,var)
-
-    implicit none
-
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: n
-    integer, dimension(n), intent(IN) :: var
-
-    integer :: m, ierror
-
-    call MPI_FILE_SET_VIEW(fh,disp,MPI_INTEGER, &
-         MPI_INTEGER,'native',MPI_INFO_NULL,ierror)
-    if (nrank==0) then
-       m = n
-    else
-       m = 0
-    end if
-    call MPI_FILE_WRITE_ALL(fh, var, m, MPI_INTEGER, &
-         MPI_STATUS_IGNORE, ierror)
-    call MPI_TYPE_SIZE(MPI_INTEGER,m,ierror)
-    disp = disp + n*m
-
-    return
-  end subroutine write_scalar_integer
-
-
-  subroutine write_scalar_logical(fh,disp,n,var)
-
-    implicit none
-
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: n
-    logical, dimension(n), intent(IN) :: var
-
-    integer :: m, ierror
-
-    call MPI_FILE_SET_VIEW(fh,disp,MPI_LOGICAL, &
-         MPI_LOGICAL,'native',MPI_INFO_NULL,ierror)
-    if (nrank==0) then
-       m = n
-    else
-       m = 0
-    end if
-    call MPI_FILE_WRITE_ALL(fh, var, m, MPI_LOGICAL, &
-         MPI_STATUS_IGNORE, ierror)
-    call MPI_TYPE_SIZE(MPI_LOGICAL,m,ierror)
-    disp = disp + n*m
-
-    return
-  end subroutine write_scalar_logical
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Read scalar variables as part of a big MPI-IO file, starting from 
-  !  displacement 'disp'; 'disp' will be updated after the reading
-  !  operation to prepare the reading of next chunk of data.
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine read_scalar_real(fh,disp,n,var)
-
-    implicit none
-
-    integer, intent(IN) :: fh             ! file handle
-    integer(KIND=MPI_OFFSET_KIND), &
-         intent(INOUT) :: disp            ! displacement
-    integer, intent(IN) :: n              ! number of scalars
-    real(mytype), dimension(n), &
-         intent(INOUT) :: var             ! array of scalars
-
-    integer :: ierror
-
-    call MPI_FILE_SET_VIEW(fh,disp,real_type, &
-         real_type,'native',MPI_INFO_NULL,ierror)
-    call MPI_FILE_READ_ALL(fh, var, n, real_type, &
-         MPI_STATUS_IGNORE, ierror)
-    disp = disp + n*mytype_bytes
-
-    return
-  end subroutine read_scalar_real
-
-
-  subroutine read_scalar_complex(fh,disp,n,var)
-
-    implicit none
-
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: n
-    complex(mytype), dimension(n), intent(INOUT) :: var
-
-    integer :: ierror
-
-    call MPI_FILE_SET_VIEW(fh,disp,complex_type, &
-         complex_type,'native',MPI_INFO_NULL,ierror)
-    call MPI_FILE_READ_ALL(fh, var, n, complex_type, &
-         MPI_STATUS_IGNORE, ierror)
-    disp = disp + n*mytype_bytes*2
-
-    return
-  end subroutine read_scalar_complex
-
-
-  subroutine read_scalar_integer(fh,disp,n,var)
-
-    implicit none
-
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: n
-    integer, dimension(n), intent(INOUT) :: var
-
-    integer :: m, ierror
-
-    call MPI_FILE_SET_VIEW(fh,disp,MPI_INTEGER, &
-         MPI_INTEGER,'native',MPI_INFO_NULL,ierror)
-    call MPI_FILE_READ_ALL(fh, var, n, MPI_INTEGER, &
-         MPI_STATUS_IGNORE, ierror)
-    call MPI_TYPE_SIZE(MPI_INTEGER,m,ierror)
-    disp = disp + n*m
-
-    return
-  end subroutine read_scalar_integer
-
-
-  subroutine read_scalar_logical(fh,disp,n,var)
-
-    implicit none
-
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: n
-    logical, dimension(n), intent(INOUT) :: var
-
-    integer :: m, ierror
-
-    call MPI_FILE_SET_VIEW(fh,disp,MPI_LOGICAL, &
-         MPI_LOGICAL,'native',MPI_INFO_NULL,ierror)
-    call MPI_FILE_READ_ALL(fh, var, n, MPI_LOGICAL, &
-         MPI_STATUS_IGNORE, ierror)
-    call MPI_TYPE_SIZE(MPI_LOGICAL,m,ierror)
-    disp = disp + n*m
-
-    return
-  end subroutine read_scalar_logical
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Write a 2D slice of the 3D data to a file
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine plane_extents (sizes, subsizes, starts, iplane, opt_decomp, opt_nplanes)
-
-    integer, intent(in) :: iplane
-    type(decomp_info), intent(in), optional :: opt_decomp
-    integer, intent(in), optional :: opt_nplanes
-    
-    integer, dimension(3), intent(out) :: sizes, subsizes, starts
-
-    integer :: nplanes
-    type(decomp_info) :: decomp
-
-    if (present(opt_decomp)) then
-       decomp = opt_decomp
-    else
-       call get_decomp_info(decomp)
-    end if
-
-    if (present(opt_nplanes)) then
-       nplanes = opt_nplanes
-    else
-       nplanes = 1
-    end if
-    
-    if (iplane == 1) then
-       sizes(1) = nplanes
-       sizes(2) = decomp%ysz(2)
-       sizes(3) = decomp%zsz(3)
-       subsizes(1) = nplanes
-       subsizes(2) = decomp%xsz(2)
-       subsizes(3) = decomp%xsz(3)
-       starts(1) = 0
-       starts(2) = decomp%xst(2)-1
-       starts(3) = decomp%xst(3)-1
-    else if (iplane == 2) then
-       sizes(1) = decomp%xsz(1)
-       sizes(2) = nplanes
-       sizes(3) = decomp%zsz(3)
-       subsizes(1) = decomp%ysz(1)
-       subsizes(2) = nplanes
-       subsizes(3) = decomp%ysz(3)
-       starts(1) = decomp%yst(1)-1
-       starts(2) = 0
-       starts(3) = decomp%yst(3)-1
-    else if (iplane == 3) then
-       sizes(1) = decomp%xsz(1)
-       sizes(2) = decomp%ysz(2)
-       sizes(3) = nplanes
-       subsizes(1) = decomp%zsz(1)
-       subsizes(2) = decomp%zsz(2)
-       subsizes(3) = nplanes
-       starts(1) = decomp%zst(1)-1
-       starts(2) = decomp%zst(2)-1
-       starts(3) = 0
-    else
-       print *, "Can't work with plane ", iplane
-       stop
-    endif
-    
-  end subroutine plane_extents
-
-  subroutine write_plane_3d_real(ipencil,var,iplane,n,dirname,varname,io_name, &
-       opt_decomp)
-
-    implicit none
-
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    real(mytype), dimension(:,:,:), intent(IN) :: var
-    integer, intent(IN) :: iplane !(x-plane=1; y-plane=2; z-plane=3)
-    integer, intent(IN) :: n ! which plane to write (global coordinate)
-    character(len=*), intent(IN) :: dirname,varname,io_name
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-    real(mytype), allocatable, dimension(:,:,:) :: wk, wk2
-    real(mytype), allocatable, dimension(:,:,:) :: wk2d
-    TYPE(DECOMP_INFO) :: decomp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: i,j,k, ierror, newtype, data_type
-
-    logical :: opened_new, dir_exists
-    character(len=:), allocatable :: full_io_name
-    integer :: idx
-#ifdef ADIOS2
-    type(adios2_io) :: io_handle
-    type(adios2_variable) :: var_handle
-#endif
-    
-    data_type = real_type
-
-#include "io_write_plane.inc"
-
-    return
-  end subroutine write_plane_3d_real
-
-
-  subroutine write_plane_3d_complex(ipencil,var,iplane,n, &
-       dirname,varname,io_name,opt_decomp)
-
-    implicit none
-
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    complex(mytype), dimension(:,:,:), intent(IN) :: var
-    integer, intent(IN) :: iplane !(x-plane=1; y-plane=2; z-plane=3)
-    integer, intent(IN) :: n ! which plane to write (global coordinate)
-    character(len=*), intent(IN) :: dirname,varname,io_name
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-    complex(mytype), allocatable, dimension(:,:,:) :: wk, wk2
-    complex(mytype), allocatable, dimension(:,:,:) :: wk2d
-    TYPE(DECOMP_INFO) :: decomp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: i,j,k, ierror, newtype, data_type
-    logical :: opened_new, dir_exists
-    character(len=:), allocatable :: full_io_name
-    integer :: idx
-#ifdef ADIOS2
-    type(adios2_io) :: io_handle
-    type(adios2_variable) :: var_handle
-#endif
-
-    data_type = complex_type
-
-#include "io_write_plane.inc"
-
-    return
-  end subroutine write_plane_3d_complex
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Write a 2D array to a file
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !************** TO DO ***************
-  !* Consider handling distributed 2D data set
-  !  subroutine write_plane_2d(ipencil,var,filename)
-  !    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-  !    real(mytype), dimension(:,:), intent(IN) :: var ! 2D array
-  !    character(len=*), intent(IN) :: filename
-  !
-  !    if (ipencil==1) then
-  !       ! var should be defined as var(xsize(2)
-  !
-  !    else if (ipencil==2) then
-  !
-  !    else if (ipencil==3) then
-  !
-  !    end if
-  !
-  !    return
-  !  end subroutine write_plane_2d
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Write 3D array data for every specified mesh point
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine write_every_real(ipencil,var,iskip,jskip,kskip, &
-       filename, from1)
-
-    implicit none
-
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    real(mytype), dimension(:,:,:), intent(IN) :: var
-    integer, intent(IN) :: iskip,jskip,kskip 
-    character(len=*), intent(IN) :: filename
-    logical, intent(IN) :: from1  ! .true.  - save 1,n+1,2n+1...
-    ! .false. - save n,2n,3n...
-
-    real(mytype), allocatable, dimension(:,:,:) :: wk, wk2
-    integer(kind=MPI_OFFSET_KIND) :: filesize, disp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: i,j,k, ierror, newtype, fh, key,color,newcomm, data_type
-    integer, dimension(3) :: xsz,ysz,zsz,xst,yst,zst,xen,yen,zen,skip
-
-    data_type = real_type
-
-#include "io_write_every.inc"
-
-    return
-  end subroutine write_every_real
-
-
-  subroutine write_every_complex(ipencil,var,iskip,jskip,kskip, &
-       filename, from1)
-
-    implicit none
-
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    complex(mytype), dimension(:,:,:), intent(IN) :: var
-    integer, intent(IN) :: iskip,jskip,kskip 
-    character(len=*), intent(IN) :: filename
-    logical, intent(IN) :: from1  ! .true.  - save 1,n+1,2n+1...
-    ! .false. - save n,2n,3n...
-
-    complex(mytype), allocatable, dimension(:,:,:) :: wk, wk2
-    integer(kind=MPI_OFFSET_KIND) :: filesize, disp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: i,j,k, ierror, newtype, fh, key,color,newcomm, data_type
-    integer, dimension(3) :: xsz,ysz,zsz,xst,yst,zst,xen,yen,zen,skip
-
-    data_type = complex_type
-
-#include "io_write_every.inc"
-
-    return
-  end subroutine write_every_complex
-
-  subroutine coarse_extents(ipencil, icoarse, sizes, subsizes, starts, opt_decomp)
-
-    implicit none
-    
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    integer, intent(IN) :: icoarse !(nstat=1; nvisu=2)
-    type(decomp_info), intent(in), optional :: opt_decomp
-
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror
-    type(decomp_info) :: decomp
-
-    if ((icoarse.lt.0).or.(icoarse.gt.2)) then
-       print *, "Error invalid value of icoarse: ", icoarse
-       call MPI_ABORT(MPI_COMM_WORLD, -1, ierror)
-    endif
-    if ((ipencil.lt.1).or.(ipencil.gt.3)) then
-       print *, "Error invalid value of ipencil: ", ipencil
-       call MPI_ABORT(MPI_COMM_WORLD, -1, ierror)
-    endif
-
-    if (icoarse==0) then
-       !! Use full fields
-       
-       if (present(opt_decomp)) then
-          decomp = opt_decomp
-       else
-          call get_decomp_info(decomp)
-       endif
-
-       sizes(1) = decomp%xsz(1)
-       sizes(2) = decomp%ysz(2)
-       sizes(3) = decomp%zsz(3)
-
-       if (ipencil == 1) then
-          subsizes(1:3) = decomp%xsz(1:3)
-          starts(1:3) = decomp%xst(1:3) - 1
-       elseif (ipencil == 2) then
-          subsizes(1:3) = decomp%ysz(1:3)
-          starts(1:3) = decomp%yst(1:3) - 1
-       elseif (ipencil == 3) then
-          subsizes(1:3) = decomp%zsz(1:3)
-          starts(1:3) = decomp%zst(1:3) - 1
-       else
-          call decomp_2d_abort(-1, "IO/coarse_extents : Wrong value for ipencil")
-       endif
-    elseif (icoarse==1) then
-       sizes(1) = xszS(1)
-       sizes(2) = yszS(2)
-       sizes(3) = zszS(3)
-
-       if (ipencil == 1) then
-          subsizes(1) = xszS(1)
-          subsizes(2) = xszS(2)
-          subsizes(3) = xszS(3)
-          starts(1) = xstS(1)-1  ! 0-based index
-          starts(2) = xstS(2)-1
-          starts(3) = xstS(3)-1
-       else if (ipencil == 2) then
-          subsizes(1) = yszS(1)
-          subsizes(2) = yszS(2)
-          subsizes(3) = yszS(3)
-          starts(1) = ystS(1)-1
-          starts(2) = ystS(2)-1
-          starts(3) = ystS(3)-1
-       else if (ipencil == 3) then
-          subsizes(1) = zszS(1)
-          subsizes(2) = zszS(2)
-          subsizes(3) = zszS(3)
-          starts(1) = zstS(1)-1
-          starts(2) = zstS(2)-1
-          starts(3) = zstS(3)-1
-       else
-          call decomp_2d_abort(-1, "IO/coarse_extents : Wrong value for ipencil")
-       endif
-    elseif (icoarse==2) then
-       sizes(1) = xszV(1)
-       sizes(2) = yszV(2)
-       sizes(3) = zszV(3)
-
-       if (ipencil == 1) then
-          subsizes(1) = xszV(1)
-          subsizes(2) = xszV(2)
-          subsizes(3) = xszV(3)
-          starts(1) = xstV(1)-1  ! 0-based index
-          starts(2) = xstV(2)-1
-          starts(3) = xstV(3)-1
-       else if (ipencil == 2) then
-          subsizes(1) = yszV(1)
-          subsizes(2) = yszV(2)
-          subsizes(3) = yszV(3)
-          starts(1) = ystV(1)-1
-          starts(2) = ystV(2)-1
-          starts(3) = ystV(3)-1
-       else if (ipencil == 3) then
-          subsizes(1) = zszV(1)
-          subsizes(2) = zszV(2)
-          subsizes(3) = zszV(3)
-          starts(1) = zstV(1)-1
-          starts(2) = zstV(2)-1
-          starts(3) = zstV(3)-1
-       else
-          call decomp_2d_abort(-1, "IO/coarse_extents : Wrong value for ipencil")
-       endif
-    endif
-    
-  end subroutine coarse_extents
-
-  subroutine mpiio_write_real_coarse(ipencil,var,dirname,varname,icoarse,io_name,opt_decomp,reduce_prec,opt_deferred_writes)
-
-    ! USE param
-    ! USE variables
-
-    implicit none
-
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    integer, intent(IN) :: icoarse !(nstat=1; nvisu=2)
-    real(mytype), dimension(:,:,:), intent(IN) :: var
-    character(len=*), intent(in) :: dirname, varname, io_name
-    type(decomp_info), intent(in), optional :: opt_decomp
-    logical, intent(in), optional :: reduce_prec
-    logical, intent(in), optional :: opt_deferred_writes
-
-    real(mytype_single), allocatable, dimension(:,:,:) :: varsingle
-    real(mytype), allocatable, dimension(:,:,:) :: varfull
-    logical :: write_reduce_prec
-    logical :: deferred_writes
-    
-    integer (kind=MPI_OFFSET_KIND) :: filesize
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: i,j,k, ierror, newtype
-    integer :: idx
-    logical :: opened_new
-    integer :: disp_bytes
-    logical :: dir_exists
-    character(len=:), allocatable :: full_io_name
-#ifdef ADIOS2
-    type(adios2_io) :: io_handle
-    type(adios2_variable) :: var_handle
-    integer :: write_mode
-#endif
-
-    ! Safety check
-    if (.not. is_contiguous(var)) call decomp_2d_abort(-1, "mpiio_write_real_coarse : argument must be contiguous")
-
-    !! Set defaults
-    write_reduce_prec = .true.
-    deferred_writes = .true.
-    
-    opened_new = .false.
-    idx = get_io_idx(io_name, dirname)
-#ifndef ADIOS2
-    if (present(reduce_prec)) then
-       if (.not. reduce_prec) then
-          write_reduce_prec = .false.
-       end if
-    end if
-    if (write_reduce_prec) then
-       call MPI_TYPE_SIZE(real_type_single,disp_bytes,ierror)
-    else
-       call MPI_TYPE_SIZE(real_type,disp_bytes,ierror)
-    end if
-    
-    !! Use original MPIIO writers
-    if (present(opt_decomp)) then
-       call coarse_extents(ipencil, icoarse, sizes, subsizes, starts, opt_decomp)
-    else
-       call coarse_extents(ipencil, icoarse, sizes, subsizes, starts)
-    end if
-
-    if ((subsizes(1) > size(var, 1)) .or. (subsizes(2) > size(var, 2)) &
-         .or. (subsizes(3) > size(var, 3))) then
-       print *, "ERROR: trying to write ", subsizes, "sized array for "//varname//" variable array is ", size(var)
-       stop
-    end if
-    
-    if (write_reduce_prec) then
-       allocate (varsingle(xstV(1):xenV(1),xstV(2):xenV(2),xstV(3):xenV(3)))
-       varsingle=real(var, mytype_single)
-    end if
-
-    if (write_reduce_prec) then
-       call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-            MPI_ORDER_FORTRAN, real_type_single, newtype, ierror)
-    else
-       call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-            MPI_ORDER_FORTRAN, real_type, newtype, ierror)
-    end if
-    call MPI_TYPE_COMMIT(newtype,ierror)
-
-    if (idx .lt. 1) then
-       ! Create folder if needed
-       if (nrank==0) then
-          inquire(file=dirname, exist=dir_exists)
-          if (.not.dir_exists) then
-             call system("mkdir "//dirname//" 2> /dev/null")
-          end if
-       end if
-       full_io_name = trim(dirname)//"/"//trim(varname)
-       call decomp_2d_open_io(io_name, full_io_name, decomp_2d_write_mode)
-       idx = get_io_idx(io_name, full_io_name)
-       opened_new = .true.
-    end if
-
-    if (write_reduce_prec) then
-       call MPI_FILE_SET_VIEW(fh_registry(idx),fh_disp(idx),real_type_single, &
-            newtype,'native',MPI_INFO_NULL,ierror)
-       call MPI_FILE_WRITE_ALL(fh_registry(idx), varsingle, &
-            subsizes(1)*subsizes(2)*subsizes(3), &
-            real_type_single, MPI_STATUS_IGNORE, ierror)
-    else
-       call MPI_FILE_SET_VIEW(fh_registry(idx),fh_disp(idx),real_type, &
-            newtype,'native',MPI_INFO_NULL,ierror)
-       call MPI_FILE_WRITE_ALL(fh_registry(idx), var, &
-            subsizes(1)*subsizes(2)*subsizes(3), &
-            real_type, MPI_STATUS_IGNORE, ierror)
-    end if
-    if (ierror /= 0) then
-       print *, "ERROR in MPI_FILE_WRITE_ALL"
-       stop
-    end if
-    
-    fh_disp(idx) = fh_disp(idx) + int(sizes(1), MPI_OFFSET_KIND) * &
-                                  int(sizes(2), MPI_OFFSET_KIND) * &
-                                  int(sizes(3), MPI_OFFSET_KIND) * &
-                                  int(disp_bytes, MPI_OFFSET_KIND)
-    
-    if (opened_new) then
-       call decomp_2d_close_io(io_name, full_io_name)
-       deallocate(full_io_name)
-    end if
-
-    call MPI_TYPE_FREE(newtype,ierror)
-    if (write_reduce_prec) then
-       deallocate(varsingle)
-    end if
-#else
-    if (idx < 1) then
-       print *, "ERROR: failed to find engine for ", io_name, " ", dirname
-    end if
-    if (.not. engine_live(idx)) then
-       print *, "ERROR: Engine is not live!"
-       stop
-    end if
-    
-    call adios2_at_io(io_handle, adios, io_name, ierror)
-    call adios2_inquire_variable(var_handle, io_handle, varname, ierror)
-    if (.not.var_handle % valid) then
-       print *, "ERROR: trying to write variable before registering!", varname
-       stop
-    endif
-
-    if (idx .lt. 1) then
-       print *, "You haven't opened ", io_name, ":", dirname
-       stop
-    end if
-
-    if (present(opt_deferred_writes)) then
-       deferred_writes = opt_deferred_writes
-    end if
-
-    if (deferred_writes) then
-       write_mode = adios2_mode_deferred
-    else
-       write_mode = adios2_mode_sync
-    end if
-
-    if (engine_registry(idx)%valid) then
-       call adios2_put(engine_registry(idx), var_handle, var, write_mode, ierror)
-       if (ierror /= 0) then
-          print *, "ERROR: something went wrong in adios2_put"
-          stop
-       end if
-    else
-       print *, "ERROR: decomp2d thinks engine is live, but adios2 engine object is not valid"
-       stop
-    end if
-#endif
-
-    return
-  end subroutine mpiio_write_real_coarse
-
-  subroutine decomp_2d_register_variable(io_name, varname, ipencil, icoarse, iplane, type, opt_decomp, opt_nplanes)
-
-    implicit none
-
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    integer, intent(IN) :: icoarse !(nstat=1; nvisu=2)
-    character(len=*), intent(in) :: io_name
-    integer, intent(in) :: type
-    integer, intent(in) :: iplane
-    type(decomp_info), intent(in), optional :: opt_decomp
-    integer, intent(in), optional :: opt_nplanes
-
-    integer :: nplanes
-    character*(*), intent(in) :: varname
-#ifdef ADIOS2
-    integer, dimension(3) :: sizes, subsizes, starts
-    type(adios2_io) :: io_handle
-    type(adios2_variable) :: var_handle
-    integer, parameter :: ndims = 3
-    logical, parameter :: adios2_constant_dims = .true.
-    integer :: data_type
-    integer :: ierror
-
-    if (iplane .eq. 0) then
-       if (present(opt_decomp)) then
-          call coarse_extents(ipencil, icoarse, sizes, subsizes, starts, opt_decomp)
-       else
-          call coarse_extents(ipencil, icoarse, sizes, subsizes, starts)
-       endif
-    else
-       if (present(opt_nplanes)) then
-          nplanes = opt_nplanes
-       else
-          nplanes = 1
-       end if
-       if (present(opt_decomp)) then
-          call plane_extents(sizes, subsizes, starts, iplane, opt_decomp, opt_nplanes=nplanes)
-       else
-          call plane_extents(sizes, subsizes, starts, iplane, opt_nplanes=nplanes)
-       endif
-    end if
-    
-    ! Check if variable already exists, if not create it
-    call adios2_at_io(io_handle, adios, io_name, ierror)
-    if (io_handle%valid) then
-       call adios2_inquire_variable(var_handle, io_handle, varname, ierror)
-       if (.not.var_handle % valid) then
-          !! New variable
-          if (nrank .eq. 0) then
-             print *, "Registering variable for IO: ", varname
-          endif
-
-          ! Need to set the ADIOS2 data type
-          if (type.eq.kind(0.0d0)) then
-             !! Double
-             data_type = adios2_type_dp
-          else if (type.eq.kind(0.0)) then
-             !! Single
-             data_type = adios2_type_real
-          else
-             print *, "Trying to write unknown data type!"
-             call MPI_ABORT(MPI_COMM_WORLD, -1, ierror)
-          endif
-
-          call adios2_define_variable(var_handle, io_handle, varname, data_type, &
-               ndims, int(sizes, kind=8), int(starts, kind=8), int(subsizes, kind=8), &
-               adios2_constant_dims, ierror)
-          if (ierror /= 0) then
-             print *, "ERROR registering variable"
-             stop
-          end if
-       endif
-    else
-       print *, "ERROR trying to register variable with invalid IO!"
-       stop
-    end if
-#endif
-    
-  end subroutine decomp_2d_register_variable
-  
-  subroutine mpiio_write_real_probe(ipencil,var,filename,nlength)
-
-    ! USE param
-    ! USE variables
-
-    implicit none
-
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    integer, intent(in) :: nlength
-    real(mytype), dimension(:,:,:,:), intent(IN) :: var
-
-    character(len=*) :: filename
-
-    integer (kind=MPI_OFFSET_KIND) :: filesize, disp
-    integer, dimension(4) :: sizes, subsizes, starts
-    integer :: i,j,k, ierror, newtype, fh
-
-    sizes(1) = xszP(1)
-    sizes(2) = yszP(2)
-    sizes(3) = zszP(3)
-    sizes(4) = nlength
-    if (ipencil == 1) then
-       subsizes(1) = xszP(1)
-       subsizes(2) = xszP(2)
-       subsizes(3) = xszP(3)
-       subsizes(4) = nlength
-       starts(1) = xstP(1)-1  ! 0-based index
-       starts(2) = xstP(2)-1
-       starts(3) = xstP(3)-1
-       starts(4) = 0
-    else if (ipencil == 2) then
-       subsizes(1) = yszP(1)
-       subsizes(2) = yszP(2)
-       subsizes(3) = yszP(3)
-       starts(1) = ystP(1)-1
-       starts(2) = ystP(2)-1
-       starts(3) = ystP(3)-1
-    else if (ipencil == 3) then
-       subsizes(1) = zszP(1)
-       subsizes(2) = zszP(2)
-       subsizes(3) = zszP(3)
-       starts(1) = zstP(1)-1
-       starts(2) = zstP(2)-1
-       starts(3) = zstP(3)-1
-    endif
-    !   print *,nrank,starts(1),starts(2),starts(3),starts(4)
-    call MPI_TYPE_CREATE_SUBARRAY(4, sizes, subsizes, starts,  &
-         MPI_ORDER_FORTRAN, real_type, newtype, ierror)
-    call MPI_TYPE_COMMIT(newtype,ierror)
-    call MPI_FILE_OPEN(MPI_COMM_WORLD, filename, &
-         MPI_MODE_CREATE+MPI_MODE_WRONLY, MPI_INFO_NULL, &
-         fh, ierror)
-    filesize = 0_MPI_OFFSET_KIND
-    call MPI_FILE_SET_SIZE(fh,filesize,ierror)  ! guarantee overwriting
-    disp = 0_MPI_OFFSET_KIND
-    call MPI_FILE_SET_VIEW(fh,disp,real_type, &
-         newtype,'native',MPI_INFO_NULL,ierror)
-    call MPI_FILE_WRITE_ALL(fh, var, &
-         subsizes(1)*subsizes(2)*subsizes(3)*subsizes(4), &
-         real_type, MPI_STATUS_IGNORE, ierror)
-    call MPI_FILE_CLOSE(fh,ierror)
-    call MPI_TYPE_FREE(newtype,ierror)
-
-
-    return
-  end subroutine mpiio_write_real_probe
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Write a 3D data set covering a smaller sub-domain only
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine write_subdomain(ipencil,var,is,ie,js,je,ks,ke,filename)
-
-    implicit none
-
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    real(mytype), dimension(:,:,:), intent(IN) :: var
-    integer, intent(IN) :: is, ie, js, je, ks, ke
-    character(len=*), intent(IN) :: filename
-
-    real(mytype), allocatable, dimension(:,:,:) :: wk, wk2
-    integer(kind=MPI_OFFSET_KIND) :: filesize, disp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: color, key, errorcode, newcomm, ierror
-    integer :: newtype, fh, data_type, i, j, k
-    integer :: i1, i2, j1, j2, k1, k2
-
-    data_type = real_type
-
-    ! validate the input paramters
-    if (is<1 .OR. ie>nx_global .OR. js<1 .OR. je>ny_global .OR. &
-         ks<1 .OR. ke>nz_global) then
-       errorcode = 10
-       call decomp_2d_abort(errorcode, &
-            'Invalid subdomain specified in I/O')
-    end if
-
-    ! create a communicator for all those MPI ranks containing the subdomain
-    color = 1
-    key = 1
-    if (ipencil==1) then
-       if (xstart(1)>ie .OR. xend(1)<is .OR. xstart(2)>je .OR. xend(2)<js &
-            .OR. xstart(3)>ke .OR. xend(3)<ks) then
-          color = 2
-       end if
-    else if (ipencil==2) then
-       if (ystart(1)>ie .OR. yend(1)<is .OR. ystart(2)>je .OR. yend(2)<js &
-            .OR. ystart(3)>ke .OR. yend(3)<ks) then
-          color = 2
-       end if
-    else if (ipencil==3) then
-       if (zstart(1)>ie .OR. zend(1)<is .OR. zstart(2)>je .OR. zend(2)<js &
-            .OR. zstart(3)>ke .OR. zend(3)<ks) then
-          color = 2
-       end if
-    end if
-    call MPI_COMM_SPLIT(MPI_COMM_WORLD,color,key,newcomm,ierror)
-
-    if (color==1) then ! only ranks in this group do IO collectively
-
-       ! generate MPI-IO subarray information
-
-       ! global size of the sub-domain to write
-       sizes(1) = ie - is + 1
-       sizes(2) = je - js + 1
-       sizes(3) = ke - ks + 1
-
-       ! 'subsizes' & 'starts' as required by MPI_TYPE_CREATE_SUBARRAY
-       ! note the special code whe subdomain only occupy part of the pencil
-       if (ipencil==1) then
-
-          subsizes(1) = xsize(1)
-          starts(1) = xstart(1) - is
-          if (xend(1)>ie .AND. xstart(1)<is) then
-             subsizes(1) = ie - is + 1
-             starts(1) = 0
-          else if (xstart(1)<is) then
-             subsizes(1) = xend(1) - is + 1
-             starts(1) = 0
-          else if (xend(1)>ie) then
-             subsizes(1) = ie - xstart(1) + 1
-          end if
-          subsizes(2) = xsize(2)
-          starts(2) = xstart(2) - js
-          if (xend(2)>je .AND. xstart(2)<js) then
-             subsizes(2) = je - js + 1
-             starts(2) = 0
-          else if (xstart(2)<js) then
-             subsizes(2) = xend(2) - js + 1
-             starts(2) = 0
-          else if (xend(2)>je) then
-             subsizes(2) = je - xstart(2) + 1
-          end if
-          subsizes(3) = xsize(3)
-          starts(3) = xstart(3) - ks
-          if (xend(3)>ke .AND. xstart(3)<ks) then
-             subsizes(3) = ke - ks + 1
-             starts(3) = 0
-          else if (xstart(3)<ks) then
-             subsizes(3) = xend(3) - ks + 1
-             starts(3) = 0
-          else if (xend(3)>ke) then
-             subsizes(3) = ke - xstart(3) + 1
-          end if
-
-       else if (ipencil==2) then
-
-          ! TODO 
-
-       else if (ipencil==3) then
-
-          ! TODO
-
-       end if
-
-
-       ! copy data from orginal to a temp array
-       ! pay attention to blocks only partially cover the sub-domain
-       if (ipencil==1) then
-
-          if (xend(1)>ie .AND. xstart(1)<is) then
-             i1 = is
-             i2 = ie
-          else if (xend(1)>ie) then
-             i1 = xstart(1)
-             i2 = ie
-          else if (xstart(1)<is) then
-             i1 = is
-             i2 = xend(1)
-          else
-             i1 = xstart(1)
-             i2 = xend(1)
-          end if
-
-          if (xend(2)>je .AND. xstart(2)<js) then
-             j1 = js
-             j2 = je
-          else if (xend(2)>je) then
-             j1 = xstart(2)
-             j2 = je
-          else if (xstart(2)<js) then
-             j1 = js
-             j2 = xend(2)
-          else
-             j1 = xstart(2)
-             j2 = xend(2)
-          end if
-
-          if (xend(3)>ke .AND. xstart(3)<ks) then
-             k1 = ks
-             k2 = ke
-          else if (xend(3)>ke) then
-             k1 = xstart(3)
-             k2 = ke
-          else if (xstart(3)<ks) then
-             k1 = ks
-             k2 = xend(3)
-          else
-             k1 = xstart(3)
-             k2 = xend(3)
-          end if
-
-          allocate(wk(i1:i2, j1:j2, k1:k2))
-          allocate(wk2(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
-          wk2 = var
-          do k=k1,k2
-             do j=j1,j2
-                do i=i1,i2
-                   wk(i,j,k) = wk2(i,j,k)
-                end do
-             end do
-          end do
-
-       else if (ipencil==2) then
-
-          ! TODO
-
-       else if (ipencil==3) then
-
-          ! TODO
-
-       end if
-
-       deallocate(wk2)
-
-       ! MPI-IO
-       call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-            MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-       call MPI_TYPE_COMMIT(newtype,ierror)
-       call MPI_FILE_OPEN(newcomm, filename, &
-            MPI_MODE_CREATE+MPI_MODE_WRONLY, MPI_INFO_NULL, &
-            fh, ierror)
-       filesize = 0_MPI_OFFSET_KIND
-       call MPI_FILE_SET_SIZE(fh,filesize,ierror)  ! guarantee overwriting
-       disp = 0_MPI_OFFSET_KIND
-       call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-            newtype,'native',MPI_INFO_NULL,ierror)
-       call MPI_FILE_WRITE_ALL(fh, wk, &
-            subsizes(1)*subsizes(2)*subsizes(3), &
-            data_type, MPI_STATUS_IGNORE, ierror)
-       call MPI_FILE_CLOSE(fh,ierror)
-       call MPI_TYPE_FREE(newtype,ierror)
-
-       deallocate(wk)
-
-    end if
-
-    return
-  end subroutine write_subdomain
-
-  subroutine decomp_2d_init_io(io_name)
-
-    implicit none
-
-    character(len=*), intent(in) :: io_name
-    integer :: ierror
-#ifdef ADIOS2
-    type(adios2_io) :: io
-#endif
-    
-    if (nrank .eq. 0) then
-       print *, "Initialising IO for ", io_name
-    end if
-
-#ifdef ADIOS2
-    if (adios%valid) then
-       call adios2_declare_io(io, adios, io_name, ierror)
-    else
-       print *, "ERROR: couldn't declare IO - adios object not valid"
-       stop
-    end if
-    
-    if (ierror /= 0) then
-       print *, "ERROR: Initialising IO"
-       stop
-    end if
-#endif
-    
-  end subroutine decomp_2d_init_io
-  
-  subroutine decomp_2d_open_io(io_name, io_dir, mode)
-
-    implicit none
-
-    logical :: dir_exists
-    character(len=*), intent(in) :: io_name, io_dir
-    integer, intent(in) :: mode
-
-    logical, dimension(:), pointer :: live_ptrh
-    character(len=1024), dimension(:), pointer :: names_ptr
-    character(len=(len(io_name)+len(io_sep)+len(io_dir))) :: full_name
-    
-    integer :: idx, ierror
-    integer :: access_mode
-#ifndef ADIOS2
-    integer(MPI_OFFSET_KIND) :: filesize
-#else
-    type(adios2_io) :: io
-#endif
-
-#ifndef ADIOS2
-    live_ptrh => fh_live
-    names_ptr => fh_names
-#else
-    live_ptrh => engine_live
-    names_ptr => engine_names
-#endif
-    
-    idx = get_io_idx(io_name, io_dir)
-    if (idx .lt. 1) then
-       !! New io destination
-       if (nreg_io .lt. MAX_IOH) then
-          nreg_io = nreg_io + 1
-          do idx = 1, MAX_IOH
-             if (.not. live_ptrh(idx)) then
-                live_ptrh(idx) = .true.
-                exit
-             end if
-          end do
-          
-          full_name = io_name//io_sep//io_dir
-          names_ptr(idx) = full_name
-
-          if (mode .eq. decomp_2d_write_mode) then
-             !! Setup writers
-#ifndef ADIOS2
-             filesize = 0_MPI_OFFSET_KIND
-             fh_disp(idx) = 0_MPI_OFFSET_KIND
-             access_mode = MPI_MODE_CREATE + MPI_MODE_WRONLY
-#else
-             access_mode = adios2_mode_write
-#endif
-          else if (mode .eq. decomp_2d_read_mode) then
-             !! Setup readers
-#ifndef ADIOS2
-             fh_disp(idx) = 0_MPI_OFFSET_KIND
-             access_mode = MPI_MODE_RDONLY
-#else
-             access_mode = adios2_mode_read
-#endif
-          else if (mode .eq. decomp_2d_append_mode) then
-#ifndef ADIOS2
-             filesize = 0_MPI_OFFSET_KIND
-             fh_disp(idx) = 0_MPI_OFFSET_KIND
-             access_mode = MPI_MODE_CREATE + MPI_MODE_WRONLY
-#else
-             access_mode = adios2_mode_append
-#endif
-          else
-             print *, "ERROR: Unknown mode!"
-             stop
-          endif
-
-          !! Open IO
-#ifndef ADIOS2
-          call MPI_FILE_OPEN(MPI_COMM_WORLD, io_dir, &
-               access_mode, MPI_INFO_NULL, &
-               fh_registry(idx), ierror)
-          if (mode .eq. decomp_2d_write_mode) then
-             !! Guarantee overwriting
-             call MPI_FILE_SET_SIZE(fh_registry(idx), filesize, ierror)
-          end if
-#else
-          call adios2_at_io(io, adios, io_name, ierror)
-          if (io%valid) then
-             call adios2_open(engine_registry(idx), io, trim(gen_iodir_name(io_dir, io_name)), access_mode, ierror)
-             if (ierror /= 0) then
-                print *, "ERROR opening engine!"
-                stop
-             end if
-          else
-             print *, "ERROR: Couldn't find IO handle"
-             stop
-          end if
-#endif
-       end if
-    end if
-    
-  end subroutine decomp_2d_open_io
-
-  subroutine decomp_2d_close_io(io_name, io_dir)
-
-    implicit none
-
-    character(len=*), intent(in) :: io_name, io_dir
-    
-    character(len=1024), dimension(:), pointer :: names_ptr
-    logical, dimension(:), pointer :: live_ptrh
-    integer :: idx, ierror
-
-    idx = get_io_idx(io_name, io_dir)
-#ifndef ADIOS2
-    names_ptr => fh_names
-    live_ptrh => fh_live
-    call MPI_FILE_CLOSE(fh_registry(idx), ierror)
-#else
-    names_ptr => engine_names
-    live_ptrh => engine_live
-    call adios2_close(engine_registry(idx), ierror)
-    if (ierror /= 0) then
-       print *, "ERROR closing IO"
-    end if
-#endif
-    names_ptr(idx) = ""
-    live_ptrh(idx) = .false.
-    nreg_io = nreg_io - 1
-
-  end subroutine decomp_2d_close_io
-
-  subroutine decomp_2d_start_io(io_name, io_dir)
-
-    implicit none
-
-    character(len=*), intent(in) :: io_name, io_dir
-#ifdef ADIOS2
-    integer :: idx, ierror
-
-    idx = get_io_idx(io_name, io_dir)
-    associate(engine => engine_registry(idx))
-      if (engine%valid) then
-         call adios2_begin_step(engine, ierror)
-         if (ierror /= 0) then
-            print *, "ERROR beginning step"
-            stop
-         end if
-      else
-         print *, "ERROR trying to begin step with invalid engine"
-         stop
-      end if
-    end associate
-
-    io_step(idx) = -1
-#endif
-    
-  end subroutine decomp_2d_start_io
-
-  subroutine decomp_2d_end_io(io_name, io_dir)
-
-    implicit none
-
-    character(len=*), intent(in) :: io_name, io_dir
-#ifdef ADIOS2
-    integer :: idx, ierror
-
-    idx = get_io_idx(io_name, io_dir)
-    associate(engine => engine_registry(idx))
-      if (engine%valid) then
-         call adios2_end_step(engine, ierror)
-         if (ierror /= 0) then
-            print *, "ERROR ending step"
-            stop
-         end if
-      else
-         print *, "ERROR trying to end step with invalid engine"
-         stop
-      end if
-    end associate
-
-    io_step(idx) = -1
-#endif
-
-  end subroutine decomp_2d_end_io
-  
-  integer function get_io_idx(io_name, engine_name)
-
-    implicit none
-
-    character(len=*), intent(in) :: io_name
-    character(len=*), intent(in) :: engine_name
-
-    character(len=(len(io_name)+len(io_sep)+len(engine_name))) :: full_name
-    integer :: idx
-    logical :: found
-
-    character(len=1024), dimension(:), pointer :: names_ptr
-
-#ifndef ADIOS2
-    names_ptr => fh_names
-#else
-    names_ptr => engine_names
-#endif
-
-    full_name = io_name//io_sep//engine_name
-    
-    found = .false.
-    do idx = 1, MAX_IOH
-       if (names_ptr(idx) .eq. full_name) then
-          found = .true.
-          exit
-       end if
-    end do
-
-    if (.not. found) then
-       idx = -1
-    end if
-
-    get_io_idx = idx
-    
-  end function get_io_idx
-
-  function gen_iodir_name(io_dir, io_name)
-
-    character(len=*), intent(in) :: io_dir, io_name
-    character(len=(len(io_dir) + 5)) :: gen_iodir_name
-#ifdef ADIOS2
-    integer :: ierror
-    type(adios2_io) :: io
-    character(len=5) :: ext
-#endif
-
-#ifndef ADIOS2
-    write(gen_iodir_name, "(A)") io_dir
-#else
-    call adios2_at_io(io, adios, io_name, ierror)
-    if (io%engine_type .eq. "BP4") then
-       ext = ".bp4"
-    else if (io%engine_type .eq. "HDF5") then
-       ext = ".hdf5"
-    else
-       print *, "ERROR: Unkown engine type! ", io%engine_type
-       print *, "-  IO: ", io_name
-       print *, "- DIR:", io_dir
-       stop
-    endif
-    write(gen_iodir_name, "(A,A)") trim(io_dir), trim(ext)
-#endif
-    
-  end function gen_iodir_name
-
-end module decomp_2d_io
diff --git a/decomp2d/io_read_inflow.f90 b/decomp2d/io_read_inflow.f90
deleted file mode 100644
index 88b38df9a..000000000
--- a/decomp2d/io_read_inflow.f90
+++ /dev/null
@@ -1,81 +0,0 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This file contain common code to be included by subroutines 
-! 'read_var_...' in io.f90
-
-! Using MPI-IO to read a distributed 3D variable from a file. File 
-! operations (open/close) need to be done in calling application. This 
-! allows multiple variables to be read from a single file. Together 
-! with the corresponding write operation, this is the perfect solution
-! for applications to perform restart/checkpointing.
-
-if (present(opt_decomp)) then
-   decomp = opt_decomp
-else
-   call get_decomp_info(decomp)
-end if
-
-! Create file type and set file view
-sizes(1) = ntimesteps 
-sizes(2) = decomp%ysz(2)
-sizes(3) = decomp%zsz(3)
-subsizes(1) = ntimesteps
-subsizes(2) = decomp%xsz(2)
-subsizes(3) = decomp%xsz(3)
-starts(1) = 0  ! 0-based index
-starts(2) = decomp%xst(2)-1
-starts(3) = decomp%xst(3)-1
-
-idx = get_io_idx(io_name, dirname)
-
-#ifndef ADIOS2
-!! Use default MPIIO
-associate(fh=>fh_registry(idx), &
-  disp => fh_disp(idx))
-  call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-       MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-  call MPI_TYPE_COMMIT(newtype,ierror)
-  call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-       newtype,'native',MPI_INFO_NULL,ierror)
-  call MPI_FILE_READ_ALL(fh, var, &
-       subsizes(1)*subsizes(2)*subsizes(3), &
-       data_type, MPI_STATUS_IGNORE, ierror)
-  call MPI_TYPE_FREE(newtype,ierror)
-
-  ! update displacement for the next read operation
-  disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
-  if (data_type == complex_type) then
-     disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
-  end if
-end associate
-#else
-!! Use ADIOS2
-call adios2_at_io(io_handle, adios, io_name, ierror)
-call adios2_inquire_variable(var_handle, io_handle, varname, ierror)
-if (.not.var_handle % valid) then
-   print *, "ERROR: trying to write variable before registering!", varname
-   stop
-endif
-
-if (io_step(idx) >= 0) then
-   call adios2_variable_steps(steps, var_handle, ierror)
-   if (io_step(idx) >= steps) then
-      print *, "ERROR: trying to read a step that doesn't exist!"
-      stop
-   end if
-   call adios2_set_step_selection(var_handle, int(io_step(idx), kind=8), int(1, kind=8), ierror)
-end if
-
-!! Note - need to use sync mode as we are using a view into the array - unsure how this works with deferred writes
-! call adios2_set_step_selection(var_handle, int(0, kind=8), int(1, kind=8), ierror)
-call adios2_get(engine_registry(idx), var_handle, var, adios2_mode_sync, ierror)
-#endif
diff --git a/decomp2d/io_read_one.inc b/decomp2d/io_read_one.inc
deleted file mode 100644
index aaa77113c..000000000
--- a/decomp2d/io_read_one.inc
+++ /dev/null
@@ -1,65 +0,0 @@
-!                          -*- mode: f90 -*-                            
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This file contain common code to be included by subroutines 
-! 'mpiio_read_one_...' in io.f90
-
-! Using MPI-IO to write a distributed 3D array into a file
-
-if (present(opt_decomp)) then
-   decomp = opt_decomp
-else
-   call get_decomp_info(decomp)
-end if
-
-! determine subarray parameters
-sizes(1) = decomp%xsz(1)
-sizes(2) = decomp%ysz(2)
-sizes(3) = decomp%zsz(3)
-
-if (ipencil == 1) then
-   subsizes(1) = decomp%xsz(1)
-   subsizes(2) = decomp%xsz(2)
-   subsizes(3) = decomp%xsz(3)
-   starts(1) = decomp%xst(1)-1  ! 0-based index
-   starts(2) = decomp%xst(2)-1
-   starts(3) = decomp%xst(3)-1
-else if (ipencil == 2) then
-   subsizes(1) = decomp%ysz(1)
-   subsizes(2) = decomp%ysz(2)
-   subsizes(3) = decomp%ysz(3)
-   starts(1) = decomp%yst(1)-1
-   starts(2) = decomp%yst(2)-1
-   starts(3) = decomp%yst(3)-1
-else if (ipencil == 3) then
-   subsizes(1) = decomp%zsz(1)
-   subsizes(2) = decomp%zsz(2)
-   subsizes(3) = decomp%zsz(3)
-   starts(1) = decomp%zst(1)-1
-   starts(2) = decomp%zst(2)-1
-   starts(3) = decomp%zst(3)-1
-endif
-
-call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-     MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-call MPI_TYPE_COMMIT(newtype,ierror)
-call MPI_FILE_OPEN(MPI_COMM_WORLD, filename, &
-     MPI_MODE_RDONLY, MPI_INFO_NULL, &
-     fh, ierror)
-disp = 0_MPI_OFFSET_KIND
-call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-     newtype,'native',MPI_INFO_NULL,ierror)
-call MPI_FILE_READ_ALL(fh, var, &
-     subsizes(1)*subsizes(2)*subsizes(3), &
-     data_type, MPI_STATUS_IGNORE, ierror)
-call MPI_FILE_CLOSE(fh,ierror)
-call MPI_TYPE_FREE(newtype,ierror)
diff --git a/decomp2d/io_read_var.inc b/decomp2d/io_read_var.inc
deleted file mode 100644
index 398f46a23..000000000
--- a/decomp2d/io_read_var.inc
+++ /dev/null
@@ -1,69 +0,0 @@
-!                          -*- mode: f90 -*-                            
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This file contain common code to be included by subroutines 
-! 'read_var_...' in io.f90
-
-! Using MPI-IO to read a distributed 3D variable from a file. File 
-! operations (open/close) need to be done in calling application. This 
-! allows multiple variables to be read from a single file. Together 
-! with the corresponding write operation, this is the perfect solution
-! for applications to perform restart/checkpointing.
-
-if (present(opt_decomp)) then
-   decomp = opt_decomp
-else
-   call get_decomp_info(decomp)
-end if
-
-! Create file type and set file view
-sizes(1) = decomp%xsz(1)
-sizes(2) = decomp%ysz(2)
-sizes(3) = decomp%zsz(3)
-if (ipencil == 1) then
-   subsizes(1) = decomp%xsz(1)
-   subsizes(2) = decomp%xsz(2)
-   subsizes(3) = decomp%xsz(3)
-   starts(1) = decomp%xst(1)-1  ! 0-based index
-   starts(2) = decomp%xst(2)-1
-   starts(3) = decomp%xst(3)-1
-else if (ipencil == 2) then
-   subsizes(1) = decomp%ysz(1)
-   subsizes(2) = decomp%ysz(2)
-   subsizes(3) = decomp%ysz(3)
-   starts(1) = decomp%yst(1)-1
-   starts(2) = decomp%yst(2)-1
-   starts(3) = decomp%yst(3)-1
-else if (ipencil == 3) then
-   subsizes(1) = decomp%zsz(1)
-   subsizes(2) = decomp%zsz(2)
-   subsizes(3) = decomp%zsz(3)
-   starts(1) = decomp%zst(1)-1
-   starts(2) = decomp%zst(2)-1
-   starts(3) = decomp%zst(3)-1
-endif
-
-call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-     MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-call MPI_TYPE_COMMIT(newtype,ierror)
-call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-     newtype,'native',MPI_INFO_NULL,ierror)
-call MPI_FILE_READ_ALL(fh, var, &
-     subsizes(1)*subsizes(2)*subsizes(3), &
-     data_type, MPI_STATUS_IGNORE, ierror)
-call MPI_TYPE_FREE(newtype,ierror)
-
-! update displacement for the next read operation
-disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
-if (data_type == complex_type) then
-   disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
-end if
diff --git a/decomp2d/io_write_every.inc b/decomp2d/io_write_every.inc
deleted file mode 100644
index ec55c1cd8..000000000
--- a/decomp2d/io_write_every.inc
+++ /dev/null
@@ -1,224 +0,0 @@
-!                          -*- mode: f90 -*-                            
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This file contain common code to be included by subroutines 
-! 'write_every_...' in io.f90
-
-! To write every few points of a 3D array to a file
-
-! work out the distribution parameters, which may be different from 
-! the default distribution used by the decomposition library
-!  For exmample if nx=17 and p_row=4
-!    distribution is: 4 4 4 5
-
-! If writing from the 1st element
-!  If saving every 3 points, then 5 points to be saved (17/3)
-!    default distribution would be 1 1 1 2
-!    However, 1st block (1-4) contains the 3rd point
-!             2nd block (5-8) contains the 6th point
-!             3rd block (9-12) contains the 9th and 12th point
-!             4th block (13-17) contains then 15th point
-!    giving a 1 1 2 1 distribution
-!    So cannot use the base decomposition library for such IO
-
-! If writing from the n-th element (n=?skip)
-!  If saving every 3 points, then 6 points to be saved
-!    However, 1st block (1-4) contains the 1st & 4th point
-!             2nd block (5-8) contains the 7th point
-!             3rd block (9-12) contains the 10th point
-!             4th block (13-17) contains then 12th & 15th point
-!    giving a 1 2 2 1 distribution
-
-skip(1)=iskip
-skip(2)=jskip
-skip(3)=kskip
-
-do i=1,3
-   if (from1) then
-      xst(i) = (xstart(i)+skip(i)-1)/skip(i)
-      if (mod(xstart(i)+skip(i)-1,skip(i))/=0) xst(i)=xst(i)+1
-      xen(i) = (xend(i)+skip(i)-1)/skip(i)
-   else
-      xst(i) = xstart(i)/skip(i)
-      if (mod(xstart(i),skip(i))/=0) xst(i)=xst(i)+1
-      xen(i) = xend(i)/skip(i)
-   end if
-   xsz(i) = xen(i)-xst(i)+1
-end do
-
-do i=1,3
-   if (from1) then
-      yst(i) = (ystart(i)+skip(i)-1)/skip(i)
-      if (mod(ystart(i)+skip(i)-1,skip(i))/=0) yst(i)=yst(i)+1
-      yen(i) = (yend(i)+skip(i)-1)/skip(i)
-   else
-      yst(i) = ystart(i)/skip(i)
-      if (mod(ystart(i),skip(i))/=0) yst(i)=yst(i)+1
-      yen(i) = yend(i)/skip(i)
-   end if
-   ysz(i) = yen(i)-yst(i)+1
-end do
-
-do i=1,3
-   if (from1) then
-      zst(i) = (zstart(i)+skip(i)-1)/skip(i)
-      if (mod(zstart(i)+skip(i)-1,skip(i))/=0) zst(i)=zst(i)+1
-      zen(i) = (zend(i)+skip(i)-1)/skip(i)
-   else
-      zst(i) = zstart(i)/skip(i)
-      if (mod(zstart(i),skip(i))/=0) zst(i)=zst(i)+1
-      zen(i) = zend(i)/skip(i)
-   end if
-   zsz(i) = zen(i)-zst(i)+1
-end do
-
-! if 'skip' value is large it is possible that some ranks do not 
-! contain any points to be written. Subarray constructor requires 
-! nonzero size so it is not possible to use MPI_COMM_WORLD for IO.
-! Create a sub communicator for this...
-color = 1
-key = 0  ! rank order doesn't matter
-if (ipencil==1) then
-   if (xsz(1)==0 .or. xsz(2)==0 .or. xsz(3)==0) then
-      color = 2
-   end if
-else if (ipencil==2) then
-   if (ysz(1)==0 .or. ysz(2)==0 .or. ysz(3)==0) then
-      color = 2
-   end if
-else if (ipencil==3) then
-   if (zsz(1)==0 .or. zsz(2)==0 .or. zsz(3)==0) then
-      color = 2
-   end if
-end if
-call MPI_COMM_SPLIT(MPI_COMM_WORLD,color,key,newcomm,ierror)
-
-if (color==1) then ! only ranks in this group do IO collectively
-
-   ! generate subarray information
-   sizes(1) = xsz(1)
-   sizes(2) = ysz(2)
-   sizes(3) = zsz(3)
-   if (ipencil==1) then
-      subsizes(1) = xsz(1)
-      subsizes(2) = xsz(2)
-      subsizes(3) = xsz(3)
-      starts(1) = xst(1)-1
-      starts(2) = xst(2)-1
-      starts(3) = xst(3)-1
-   else if (ipencil==2) then
-      subsizes(1) = ysz(1)
-      subsizes(2) = ysz(2)
-      subsizes(3) = ysz(3)
-      starts(1) = yst(1)-1
-      starts(2) = yst(2)-1
-      starts(3) = yst(3)-1
-   else if (ipencil==3) then
-      subsizes(1) = zsz(1)
-      subsizes(2) = zsz(2)
-      subsizes(3) = zsz(3)
-      starts(1) = zst(1)-1
-      starts(2) = zst(2)-1
-      starts(3) = zst(3)-1
-   end if
-
-   ! copy data from original array
-   ! needs a copy of original array in global coordinate 
-   if (ipencil==1) then
-      allocate(wk(xst(1):xen(1),xst(2):xen(2),xst(3):xen(3)))
-      allocate(wk2(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
-      wk2=var
-      if (from1) then
-         do k=xst(3),xen(3)
-            do j=xst(2),xen(2)
-               do i=xst(1),xen(1)
-                  wk(i,j,k) = wk2((i-1)*iskip+1,(j-1)*jskip+1,(k-1)*kskip+1)
-               end do
-            end do
-         end do
-      else
-         do k=xst(3),xen(3)
-            do j=xst(2),xen(2)
-               do i=xst(1),xen(1)
-                  wk(i,j,k) = wk2(i*iskip,j*jskip,k*kskip)
-               end do
-            end do
-         end do
-      end if
-   else if (ipencil==2) then
-      allocate(wk(yst(1):yen(1),yst(2):yen(2),yst(3):yen(3)))
-      allocate(wk2(ystart(1):yend(1),ystart(2):yend(2),ystart(3):yend(3)))
-      wk2=var
-      if (from1) then
-         do k=yst(3),yen(3)
-            do j=yst(2),yen(2)
-               do i=yst(1),yen(1)
-                  wk(i,j,k) = wk2((i-1)*iskip+1,(j-1)*jskip+1,(k-1)*kskip+1)
-               end do
-            end do
-         end do
-      else
-         do k=yst(3),yen(3)
-            do j=yst(2),yen(2)
-               do i=yst(1),yen(1)
-                  wk(i,j,k) = wk2(i*iskip,j*jskip,k*kskip)
-               end do
-            end do
-         end do
-      end if
-   else if (ipencil==3) then
-      allocate(wk(zst(1):zen(1),zst(2):zen(2),zst(3):zen(3)))
-      allocate(wk2(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3)))
-      wk2=var
-      if (from1) then
-         do k=zst(3),zen(3)
-            do j=zst(2),zen(2)
-               do i=zst(1),zen(1)
-                  wk(i,j,k) = wk2((i-1)*iskip+1,(j-1)*jskip+1,(k-1)*kskip+1)
-               end do
-            end do
-         end do
-      else
-         do k=zst(3),zen(3)
-            do j=zst(2),zen(2)
-               do i=zst(1),zen(1)
-                  wk(i,j,k) = wk2(i*iskip,j*jskip,k*kskip)
-               end do
-            end do
-         end do
-      end if
-   end if
-   deallocate(wk2)
-
-   ! MPI-IO
-   call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-        MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-   call MPI_TYPE_COMMIT(newtype,ierror)
-   call MPI_FILE_OPEN(newcomm, filename, &
-        MPI_MODE_CREATE+MPI_MODE_WRONLY, MPI_INFO_NULL, &
-        fh, ierror)
-   filesize = 0_MPI_OFFSET_KIND
-   call MPI_FILE_SET_SIZE(fh,filesize,ierror)  ! guarantee overwriting
-   disp = 0_MPI_OFFSET_KIND
-   call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-        newtype,'native',MPI_INFO_NULL,ierror)
-   call MPI_FILE_WRITE_ALL(fh, wk, &
-        subsizes(1)*subsizes(2)*subsizes(3), &
-        data_type, MPI_STATUS_IGNORE, ierror)
-   call MPI_FILE_CLOSE(fh,ierror)
-   call MPI_TYPE_FREE(newtype,ierror)
-
-   deallocate(wk)
-
-end if ! color==1
-
-call MPI_BARRIER(MPI_COMM_WORLD, ierror)
diff --git a/decomp2d/io_write_one.inc b/decomp2d/io_write_one.inc
deleted file mode 100644
index f7d86926a..000000000
--- a/decomp2d/io_write_one.inc
+++ /dev/null
@@ -1,84 +0,0 @@
-!                          -*- mode: f90 -*-                            
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2012 Ning Li, the Numerical Algorithms Group (NAG) 
-! Copyright (C) 2021               the University of Edinburgh (UoE)
-!
-!=======================================================================
-
-! This file contain common code to be included by subroutines 
-! 'mpiio_write_one_...' in io.f90
-
-! Using MPI-IO to write a distributed 3D array into a file
-
-if (present(opt_decomp)) then
-   decomp = opt_decomp
-else
-   call get_decomp_info(decomp)
-end if
-
-! determine subarray parameters
-sizes(1) = decomp%xsz(1)
-sizes(2) = decomp%ysz(2)
-sizes(3) = decomp%zsz(3)
-
-if (ipencil == 1) then
-   subsizes(1) = decomp%xsz(1)
-   subsizes(2) = decomp%xsz(2)
-   subsizes(3) = decomp%xsz(3)
-   starts(1) = decomp%xst(1)-1  ! 0-based index
-   starts(2) = decomp%xst(2)-1
-   starts(3) = decomp%xst(3)-1
-else if (ipencil == 2) then
-   subsizes(1) = decomp%ysz(1)
-   subsizes(2) = decomp%ysz(2)
-   subsizes(3) = decomp%ysz(3)
-   starts(1) = decomp%yst(1)-1
-   starts(2) = decomp%yst(2)-1
-   starts(3) = decomp%yst(3)-1
-else if (ipencil == 3) then
-   subsizes(1) = decomp%zsz(1)
-   subsizes(2) = decomp%zsz(2)
-   subsizes(3) = decomp%zsz(3)
-   starts(1) = decomp%zst(1)-1
-   starts(2) = decomp%zst(2)-1
-   starts(3) = decomp%zst(3)-1
-endif
-
-#ifdef T3PIO
-call MPI_INFO_CREATE(info, ierror)
-gs = ceiling(real(sizes(1),mytype)*real(sizes(2),mytype)* &
-     real(sizes(3),mytype)/1024./1024.)
-call t3pio_set_info(MPI_COMM_WORLD, info, "./", ierror, &
-     GLOBAL_SIZE=gs, factor=1)
-#endif
-
-call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-     MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-call MPI_TYPE_COMMIT(newtype,ierror)
-#ifdef T3PIO
-call MPI_FILE_OPEN(MPI_COMM_WORLD, filename, &
-     MPI_MODE_CREATE+MPI_MODE_WRONLY, info, fh, ierror)
-#else
-call MPI_FILE_OPEN(MPI_COMM_WORLD, filename, &
-     MPI_MODE_CREATE+MPI_MODE_WRONLY, MPI_INFO_NULL, &
-     fh, ierror)
-#endif
-filesize = 0_MPI_OFFSET_KIND
-call MPI_FILE_SET_SIZE(fh,filesize,ierror)  ! guarantee overwriting
-disp = 0_MPI_OFFSET_KIND
-call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-     newtype,'native',MPI_INFO_NULL,ierror)
-call MPI_FILE_WRITE_ALL(fh, var, &
-     subsizes(1)*subsizes(2)*subsizes(3), &
-     data_type, MPI_STATUS_IGNORE, ierror)
-call MPI_FILE_CLOSE(fh,ierror)
-call MPI_TYPE_FREE(newtype,ierror)
-#ifdef T3PIO
-call MPI_INFO_FREE(info,ierror)
-#endif
diff --git a/decomp2d/io_write_outflow.f90 b/decomp2d/io_write_outflow.f90
deleted file mode 100644
index 9634bf0f5..000000000
--- a/decomp2d/io_write_outflow.f90
+++ /dev/null
@@ -1,71 +0,0 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This file contain common code to be included by subroutines 
-! 'write_var_...' in io.f90
-
-! Using MPI-IO to write a distributed 3D variable to a file. File 
-! operations (open/close) need to be done in calling application. This
-! allows multiple variables to be written to a single file. Together 
-! with the corresponding read operation, this is the perfect solution
-! for applications to perform restart/checkpointing.
-
-if (present(opt_decomp)) then
-   decomp = opt_decomp
-else
-   call get_decomp_info(decomp)
-end if
-
-! Create file type and set file view
-sizes(1) = ntimesteps 
-sizes(2) = decomp%ysz(2)
-sizes(3) = decomp%zsz(3)
-subsizes(1) = ntimesteps 
-subsizes(2) = decomp%xsz(2)
-subsizes(3) = decomp%xsz(3)
-starts(1) = 0  ! 0-based index
-starts(2) = decomp%xst(2)-1
-starts(3) = decomp%xst(3)-1
-
-idx = get_io_idx(io_name, dirname)
-
-#ifndef ADIOS2
-!! Use default MPIIO
-associate(fh=>fh_registry(idx), &
-     disp=>fh_disp(idx))
-  call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-       MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-  call MPI_TYPE_COMMIT(newtype,ierror)
-  call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-       newtype,'native',MPI_INFO_NULL,ierror)
-  call MPI_FILE_WRITE_ALL(fh, var, &
-       subsizes(1)*subsizes(2)*subsizes(3), &
-       data_type, MPI_STATUS_IGNORE, ierror)
-  call MPI_TYPE_FREE(newtype,ierror)
-
-  ! update displacement for the next write operation
-  disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
-  if (data_type == complex_type) then
-     disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
-  end if
-end associate
-#else
-!! Use ADIOS2
-call adios2_at_io(io_handle, adios, io_name, ierror)
-call adios2_inquire_variable(var_handle, io_handle, varname, ierror)
-if (.not.var_handle % valid) then
-   print *, "ERROR: trying to write variable before registering!", varname
-   stop
-endif
-
-!! Note - need to use sync mode as we are using a view into the array - unsure how this works with deferred writes
-call adios2_put(engine_registry(idx), var_handle, var, adios2_mode_sync, ierror)
-#endif
diff --git a/decomp2d/io_write_plane.inc b/decomp2d/io_write_plane.inc
deleted file mode 100644
index c1031e4ab..000000000
--- a/decomp2d/io_write_plane.inc
+++ /dev/null
@@ -1,155 +0,0 @@
-!                          -*- mode: f90 -*-                            
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This file contain common code to be included by subroutines 
-! 'mpiio_write_plane_3d_...' in io.f90
-
-! It is much easier to implement if all mpi ranks participate I/O.
-! Transpose the 3D data if necessary.
-
-if (present(opt_decomp)) then
-   decomp = opt_decomp
-else
-   call get_decomp_info(decomp)
-end if
-
-opened_new = .false.
-
-if (iplane==1) then
-   allocate(wk(decomp%xsz(1),decomp%xsz(2),decomp%xsz(3)))
-   if (ipencil==1) then
-      wk = var
-   else if (ipencil==2) then
-      call transpose_y_to_x(var,wk,decomp)
-   else if (ipencil==3) then
-      allocate(wk2(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3)))
-      call transpose_z_to_y(var,wk2,decomp)
-      call transpose_y_to_x(wk2,wk,decomp)
-      deallocate(wk2)
-   end if
-   allocate(wk2d(1,decomp%xsz(2),decomp%xsz(3)))
-   if (n.ge.1) then
-      do k=1,decomp%xsz(3)
-         do j=1,decomp%xsz(2)
-            wk2d(1,j,k)=wk(n,j,k)
-         end do
-      end do
-   else
-      do k=1,decomp%xsz(3)
-         do j=1,decomp%xsz(2)
-            wk2d(1,j,k)=sum(wk(:,j,k))/real(decomp%xsz(1),kind=mytype)
-         end do
-      end do
-   endif
-else if (iplane==2) then
-   allocate(wk(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3)))
-   if (ipencil==1) then
-      call transpose_x_to_y(var,wk,decomp)
-   else if (ipencil==2) then
-      wk = var
-   else if (ipencil==3) then
-      call transpose_z_to_y(var,wk,decomp)
-   end if
-   allocate(wk2d(decomp%ysz(1),1,decomp%ysz(3)))
-   if (n.ge.1) then
-      do k=1,decomp%ysz(3)
-         do i=1,decomp%ysz(1)
-            wk2d(i,1,k)=wk(i,n,k)
-         end do
-      end do
-   else
-      do k=1,decomp%ysz(3)
-         do i=1,decomp%ysz(1)
-            wk2d(i,1,k)=sum(wk(i,:,k))/real(decomp%ysz(2),kind=mytype)
-         end do
-      end do
-   endif
-else if (iplane==3) then
-   allocate(wk(decomp%zsz(1),decomp%zsz(2),decomp%zsz(3)))
-   if (ipencil==1) then
-      allocate(wk2(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3)))
-      call transpose_x_to_y(var,wk2,decomp)
-      call transpose_y_to_z(wk2,wk,decomp)
-      deallocate(wk2)
-   else if (ipencil==2) then
-      call transpose_y_to_z(var,wk,decomp)
-   else if (ipencil==3) then
-      wk = var
-   end if
-   allocate(wk2d(decomp%zsz(1),decomp%zsz(2),1))
-   if (n.ge.1) then
-      do j=1,decomp%zsz(2)
-         do i=1,decomp%zsz(1) 
-            wk2d(i,j,1)=wk(i,j,n)
-         end do
-      end do
-   else
-      do j=1,decomp%zsz(2)
-         do i=1,decomp%zsz(1)
-            wk2d(i,j,1)=sum(wk(i,j,:))/real(decomp%zsz(3),kind=mytype)
-         end do
-      end do
-   endif
-end if
-
-idx = get_io_idx(io_name, dirname)
-
-#ifndef ADIOS2
-!! Use default MPIIO writers
-
-if (idx .lt. 1) then
-   ! Create folder if needed
-   if (nrank==0) then
-      inquire(file=dirname, exist=dir_exists)
-      if (.not.dir_exists) then
-         call system("mkdir "//dirname//" 2> /dev/null")
-      end if
-   end if
-   allocate(character(len(trim(dirname)) + 1 + len(trim(varname))) :: full_io_name)
-   full_io_name = dirname//"/"//varname
-   call decomp_2d_open_io(io_name, full_io_name, decomp_2d_write_mode)
-   idx = get_io_idx(io_name, full_io_name)
-   opened_new = .true.
-end if
-
-call plane_extents(sizes, subsizes, starts, iplane, decomp)
-associate(fh=>fh_registry(idx), &
-     disp=>fh_disp(idx))
-  call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-       MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-  call MPI_TYPE_COMMIT(newtype,ierror)
-  call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-       newtype,'native',MPI_INFO_NULL,ierror)
-  call MPI_FILE_WRITE_ALL(fh, wk2d, &
-       subsizes(1)*subsizes(2)*subsizes(3), &
-       data_type, MPI_STATUS_IGNORE, ierror)
-  call MPI_TYPE_FREE(newtype,ierror)
-end associate
-    
-if (opened_new) then
-   call decomp_2d_close_io(io_name, full_io_name)
-   deallocate(full_io_name)
-end if
-#else
-!! Write using ADIOS2
-call adios2_at_io(io_handle, adios, io_name, ierror)
-call adios2_inquire_variable(var_handle, io_handle, varname, ierror)
-if (.not.var_handle % valid) then
-   print *, "ERROR: trying to write variable before registering!", varname
-   stop
-endif
-
-!! Note - need to use sync mode as the array for the output plane gets reused.
-call adios2_put(engine_registry(idx), var_handle, wk2d, adios2_mode_sync, ierror)
-#endif
-
-deallocate(wk,wk2d)
diff --git a/decomp2d/io_write_var.inc b/decomp2d/io_write_var.inc
deleted file mode 100644
index c511bf4f7..000000000
--- a/decomp2d/io_write_var.inc
+++ /dev/null
@@ -1,69 +0,0 @@
-!                          -*- mode: f90 -*-                            
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This file contain common code to be included by subroutines 
-! 'write_var_...' in io.f90
-
-! Using MPI-IO to write a distributed 3D variable to a file. File 
-! operations (open/close) need to be done in calling application. This
-! allows multiple variables to be written to a single file. Together 
-! with the corresponding read operation, this is the perfect solution
-! for applications to perform restart/checkpointing.
-
-if (present(opt_decomp)) then
-   decomp = opt_decomp
-else
-   call get_decomp_info(decomp)
-end if
-
-! Create file type and set file view
-sizes(1) = decomp%xsz(1)
-sizes(2) = decomp%ysz(2)
-sizes(3) = decomp%zsz(3)
-if (ipencil == 1) then
-   subsizes(1) = decomp%xsz(1)
-   subsizes(2) = decomp%xsz(2)
-   subsizes(3) = decomp%xsz(3)
-   starts(1) = decomp%xst(1)-1  ! 0-based index
-   starts(2) = decomp%xst(2)-1
-   starts(3) = decomp%xst(3)-1
-else if (ipencil == 2) then
-   subsizes(1) = decomp%ysz(1)
-   subsizes(2) = decomp%ysz(2)
-   subsizes(3) = decomp%ysz(3)
-   starts(1) = decomp%yst(1)-1
-   starts(2) = decomp%yst(2)-1
-   starts(3) = decomp%yst(3)-1
-else if (ipencil == 3) then
-   subsizes(1) = decomp%zsz(1)
-   subsizes(2) = decomp%zsz(2)
-   subsizes(3) = decomp%zsz(3)
-   starts(1) = decomp%zst(1)-1
-   starts(2) = decomp%zst(2)-1
-   starts(3) = decomp%zst(3)-1
-endif
-
-call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-     MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-call MPI_TYPE_COMMIT(newtype,ierror)
-call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-     newtype,'native',MPI_INFO_NULL,ierror)
-call MPI_FILE_WRITE_ALL(fh, var, &
-     subsizes(1)*subsizes(2)*subsizes(3), &
-     data_type, MPI_STATUS_IGNORE, ierror)
-call MPI_TYPE_FREE(newtype,ierror)
-
-! update displacement for the next write operation
-disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
-if (data_type == complex_type) then
-   disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
-end if
diff --git a/decomp2d/mem_merge.f90 b/decomp2d/mem_merge.f90
deleted file mode 100644
index e1ff36fb7..000000000
--- a/decomp2d/mem_merge.f90
+++ /dev/null
@@ -1,92 +0,0 @@
-  !=======================================================================
-  ! This is part of the 2DECOMP&FFT library
-  ! 
-  ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-  ! decomposition. It also implements a highly scalable distributed
-  ! three-dimensional Fast Fourier Transform (FFT).
-  !
-  ! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-  !
-  !=======================================================================
-  
-  ! This file contain duplicated code that scatters data from the
-  ! MPI_ALLTOALLV receive buffer to destinations. It is 'included' by two 
-  ! subroutines in decomp_2d.f90
-  
-  ! Note:
-  !   in  --> receive buffer
-  !   out --> destination array
-  !   pos --> pointer for the receive buffer
-  !    - for normal ALLTOALLV, points to the beginning of receive buffer (=1)
-  !    - for shared memory code, note the receive buffer is shared by all cores
-  !      on same node, so 'pos' points to the correct location for this core
-  
-  integer, intent(IN) :: ndir
-  integer, intent(IN) :: iproc
-  integer, dimension(0:iproc-1), intent(IN) :: dist
-  TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-  integer :: i,j,k, m,i1,i2,pos
-
-#ifndef SHM
-  pos = 1
-#endif
-  do m=0,iproc-1
-     if (m==0) then
-        i1 = 1
-        i2 = dist(0)
-     else
-        i1 = i2+1
-        i2 = i1+dist(m)-1
-     endif
-
-     if (ndir==1) then
-#ifdef SHM
-        pos = decomp%y1disp_o(m) + 1
-#endif
-        do k=1,n3
-           do j=i1,i2
-              do i=1,n1
-                 out(i,j,k) = in(pos)
-                 pos = pos + 1
-              enddo
-           enddo
-        enddo
-     else if (ndir==2) then
-#ifdef SHM
-        pos = decomp%z2disp_o(m) + 1
-#endif
-        do k=i1,i2
-           do j=1,n2
-              do i=1,n1
-                 out(i,j,k) = in(pos)
-                 pos = pos + 1
-              enddo
-           enddo
-        enddo
-     else if (ndir==3) then
-#ifdef SHM
-        pos = decomp%y2disp_o(m) + 1
-#endif
-        do k=1,n3
-           do j=i1,i2
-              do i=1,n1
-                 out(i,j,k) = in(pos)
-                 pos = pos + 1
-              enddo
-           enddo
-        enddo
-     else if (ndir==4) then
-#ifdef SHM
-        pos = decomp%x1disp_o(m) + 1
-#endif
-        do k=1,n3
-           do j=1,n2
-              do i=i1,i2
-                 out(i,j,k) = in(pos)
-                 pos = pos + 1
-              enddo
-           enddo
-        enddo
-     endif
-  enddo
diff --git a/decomp2d/mem_split.f90 b/decomp2d/mem_split.f90
deleted file mode 100644
index bb56c9534..000000000
--- a/decomp2d/mem_split.f90
+++ /dev/null
@@ -1,92 +0,0 @@
-  !=======================================================================
-  ! This is part of the 2DECOMP&FFT library
-  ! 
-  ! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-  ! decomposition. It also implements a highly scalable distributed
-  ! three-dimensional Fast Fourier Transform (FFT).
-  !
-  ! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-  !
-  !=======================================================================
-  
-  ! This file contain duplicated code that gathers data from source to 
-  ! MPI_ALLTOALLV send buffer. It is 'included' by two subroutines in 
-  ! decomp_2d.f90
-  
-  ! Note:
-  !   in  --> source array
-  !   out --> send buffer
-  !   pos --> pointer for the send buffer
-  !    - for normal ALLTOALLV, points to the beginning of send buffer (=1)
-  !    - for shared memory code, note the send buffer is shared by all cores
-  !      on same node, so 'pos' points to the correct location for this core 
-  
-  integer, intent(IN) :: ndir
-  integer, intent(IN) :: iproc
-  integer, dimension(0:iproc-1), intent(IN) :: dist
-  TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-  integer :: i,j,k, m,i1,i2,pos
-
-#ifndef SHM
-  pos = 1
-#endif
-  do m=0,iproc-1
-     if (m==0) then 
-        i1 = 1
-        i2 = dist(0)
-     else
-        i1 = i2+1
-        i2 = i1+dist(m)-1
-     endif
-
-     if (ndir==1) then
-#ifdef SHM
-        pos = decomp%x1disp_o(m) + 1
-#endif
-        do k=1,n3
-           do j=1,n2
-              do i=i1,i2
-                 out(pos) = in(i,j,k)
-                 pos = pos + 1
-              enddo
-           enddo
-        enddo
-     else if (ndir==2) then
-#ifdef SHM
-        pos = decomp%y2disp_o(m) + 1
-#endif
-        do k=1,n3
-           do j=i1,i2
-              do i=1,n1
-                 out(pos) = in(i,j,k)
-                 pos = pos + 1
-              enddo
-           enddo
-        enddo
-     else if (ndir==3) then
-#ifdef SHM
-        pos = decomp%z2disp_o(m) + 1
-#endif
-        do k=i1,i2
-           do j=1,n2
-              do i=1,n1
-                 out(pos) = in(i,j,k)
-                 pos = pos + 1
-              enddo
-           enddo
-        enddo
-     else if (ndir==4) then
-#ifdef SHM
-        pos = decomp%y1disp_o(m) + 1
-#endif
-        do k=1,n3
-           do j=i1,i2
-              do i=1,n1
-                 out(pos) = in(i,j,k)
-                 pos = pos + 1
-              enddo
-           enddo
-        enddo
-     endif
-  enddo
diff --git a/decomp2d/mkl_dfti.f90 b/decomp2d/mkl_dfti.f90
deleted file mode 100644
index 483206ce0..000000000
--- a/decomp2d/mkl_dfti.f90
+++ /dev/null
@@ -1,776 +0,0 @@
-!===============================================================================
-! Copyright 2002-2015 Intel Corporation All Rights Reserved.
-!
-! The source code,  information  and material  ("Material") contained  herein is
-! owned by Intel Corporation or its  suppliers or licensors,  and  title to such
-! Material remains with Intel  Corporation or its  suppliers or  licensors.  The
-! Material  contains  proprietary  information  of  Intel or  its suppliers  and
-! licensors.  The Material is protected by  worldwide copyright  laws and treaty
-! provisions.  No part  of  the  Material   may  be  used,  copied,  reproduced,
-! modified, published,  uploaded, posted, transmitted,  distributed or disclosed
-! in any way without Intel's prior express written permission.  No license under
-! any patent,  copyright or other  intellectual property rights  in the Material
-! is granted to  or  conferred  upon  you,  either   expressly,  by implication,
-! inducement,  estoppel  or  otherwise.  Any  license   under such  intellectual
-! property rights must be express and approved by Intel in writing.
-!
-! Unless otherwise agreed by Intel in writing,  you may not remove or alter this
-! notice or  any  other  notice   embedded  in  Materials  by  Intel  or Intel's
-! suppliers or licensors in any way.
-!===============================================================================
-
-! Content:
-!    Intel(R) Math Kernel Library (MKL)
-!    Discrete Fourier Transform Interface (DFTI)
-!*****************************************************************************
-
-MODULE MKL_DFT_TYPE
-
-  TYPE, PUBLIC :: DFTI_DESCRIPTOR
-     PRIVATE
-     INTEGER :: dontuse
-     ! Structure of this type is not used in Fortran code
-     ! the pointer to this type is used only
-  END TYPE DFTI_DESCRIPTOR
-
-  !======================================================================
-  ! These real type kind parameters are not for direct use
-  !======================================================================
-
-  INTEGER, PARAMETER :: DFTI_SPKP = SELECTED_REAL_KIND(6,37)
-  INTEGER, PARAMETER :: DFTI_DPKP = SELECTED_REAL_KIND(15,307)
-
-  !======================================================================
-  ! Descriptor configuration parameters [default values in brackets]
-  !======================================================================
-
-  ! Domain for forward transform. No default value
-  INTEGER, PARAMETER :: DFTI_FORWARD_DOMAIN = 0
-
-  ! Dimensionality, or rank. No default value
-  INTEGER, PARAMETER :: DFTI_DIMENSION = 1
-
-  ! Length(s) of transform. No default value
-  INTEGER, PARAMETER :: DFTI_LENGTHS = 2
-
-  ! Floating point precision. No default value
-  INTEGER, PARAMETER :: DFTI_PRECISION = 3
-
-  ! Scale factor for forward transform [1.0]
-  INTEGER, PARAMETER :: DFTI_FORWARD_SCALE = 4
-
-  ! Scale factor for backward transform [1.0]
-  INTEGER, PARAMETER :: DFTI_BACKWARD_SCALE = 5
-
-  ! Exponent sign for forward transform [DFTI_NEGATIVE]
-  ! INTEGER, PARAMETER :: DFTI_FORWARD_SIGN = 6 ! NOT IMPLEMENTED
-
-  ! Number of data sets to be transformed [1]
-  INTEGER, PARAMETER :: DFTI_NUMBER_OF_TRANSFORMS = 7
-
-  ! Storage of finite complex-valued sequences in complex domain
-  ! [DFTI_COMPLEX_COMPLEX]
-  INTEGER, PARAMETER :: DFTI_COMPLEX_STORAGE = 8
-
-  ! Storage of finite real-valued sequences in real domain
-  ! [DFTI_REAL_REAL]
-  INTEGER, PARAMETER :: DFTI_REAL_STORAGE = 9
-
-  ! Storage of finite complex-valued sequences in conjugate-even
-  ! domain [DFTI_COMPLEX_REAL]
-  INTEGER, PARAMETER :: DFTI_CONJUGATE_EVEN_STORAGE = 10
-
-  ! Placement of result [DFTI_INPLACE]
-  INTEGER, PARAMETER :: DFTI_PLACEMENT = 11
-
-  ! Generalized strides for input data layout
-  ! [tigth, col-major for Fortran]
-  INTEGER, PARAMETER :: DFTI_INPUT_STRIDES = 12
-
-  ! Generalized strides for output data layout
-  ! [tigth, col-major for Fortran]
-  INTEGER, PARAMETER :: DFTI_OUTPUT_STRIDES = 13
-
-  ! Distance between first input elements for multiple transforms [0]
-  INTEGER, PARAMETER :: DFTI_INPUT_DISTANCE = 14
-
-  ! Distance between first output elements for multiple transforms [0]
-  INTEGER, PARAMETER :: DFTI_OUTPUT_DISTANCE = 15
-
-  ! Effort spent in initialization [DFTI_MEDIUM]
-  ! INTEGER, PARAMETER :: DFTI_INITIALIZATION_EFFORT = 16 ! NOT IMPLEMENTED
-
-  ! Use of workspace during computation [DFTI_ALLOW]
-  INTEGER, PARAMETER :: DFTI_WORKSPACE = 17
-
-  ! Ordering of the result [DFTI_ORDERED]
-  INTEGER, PARAMETER :: DFTI_ORDERING = 18
-
-  ! Possible transposition of result [DFTI_NONE]
-  INTEGER, PARAMETER :: DFTI_TRANSPOSE = 19
-
-  ! User-settable descriptor name [""]
-  INTEGER, PARAMETER :: DFTI_DESCRIPTOR_NAME = 20
-
-  ! Packing format for DFTI_COMPLEX_REAL storage of finite
-  ! conjugate-even sequences [DFTI_CCS_FORMAT]
-  INTEGER, PARAMETER :: DFTI_PACKED_FORMAT = 21
-
-  ! Commit status of the descriptor. Read-only parameter
-  INTEGER, PARAMETER :: DFTI_COMMIT_STATUS = 22
-
-  ! Version string for this DFTI implementation. Read-only parameter
-  INTEGER, PARAMETER :: DFTI_VERSION = 23
-
-  ! Ordering of the forward transform. Read-only parameter
-  ! INTEGER, PARAMETER :: DFTI_FORWARD_ORDERING = 24 ! NOT IMPLEMENTED
-
-  ! Ordering of the backward transform. Read-only parameter
-  ! INTEGER, PARAMETER :: DFTI_BACKWARD_ORDERING = 25 ! NOT IMPLEMENTED
-
-  ! Number of user threads that share the descriptor [1]
-  INTEGER, PARAMETER :: DFTI_NUMBER_OF_USER_THREADS = 26
-
-  ! Limit the number of threads used by this descriptor [0 = don't care]
-  INTEGER, PARAMETER :: DFTI_THREAD_LIMIT = 27
-
-  !======================================================================
-  ! Values of the descriptor configuration parameters
-  !======================================================================
-
-  ! DFTI_COMMIT_STATUS
-  INTEGER, PARAMETER :: DFTI_COMMITTED = 30
-  INTEGER, PARAMETER :: DFTI_UNCOMMITTED = 31
-
-  ! DFTI_FORWARD_DOMAIN
-  INTEGER, PARAMETER :: DFTI_COMPLEX = 32
-  INTEGER, PARAMETER :: DFTI_REAL = 33
-  ! INTEGER, PARAMETER :: DFTI_CONJUGATE_EVEN = 34 ! NOT IMPLEMENTED
-
-  ! DFTI_PRECISION
-  INTEGER, PARAMETER :: DFTI_SINGLE = 35
-  INTEGER, PARAMETER :: DFTI_DOUBLE = 36
-
-  ! DFTI_PRECISION for reduced size of statically linked application.
-  ! Recommended use: modify statement 'USE MKL_DFTI' in your program,
-  ! so that it reads as either of:
-  ! USE MKL_DFTI, FORGET=>DFTI_SINGLE, DFTI_SINGLE=>DFTI_SINGLE_R
-  ! USE MKL_DFTI, FORGET=>DFTI_DOUBLE, DFTI_DOUBLE=>DFTI_DOUBLE_R
-  ! where word 'FORGET' can be any name not used in the program.
-  REAL(DFTI_SPKP), PARAMETER :: DFTI_SINGLE_R = 35
-  REAL(DFTI_DPKP), PARAMETER :: DFTI_DOUBLE_R = 36
-
-  ! DFTI_FORWARD_SIGN
-  ! INTEGER, PARAMETER :: DFTI_NEGATIVE = 37 ! NOT IMPLEMENTED
-  ! INTEGER, PARAMETER :: DFTI_POSITIVE = 38 ! NOT IMPLEMENTED
-
-  ! DFTI_COMPLEX_STORAGE and DFTI_CONJUGATE_EVEN_STORAGE
-  INTEGER, PARAMETER :: DFTI_COMPLEX_COMPLEX = 39
-  INTEGER, PARAMETER :: DFTI_COMPLEX_REAL = 40
-
-  ! DFTI_REAL_STORAGE
-  INTEGER, PARAMETER :: DFTI_REAL_COMPLEX = 41
-  INTEGER, PARAMETER :: DFTI_REAL_REAL = 42
-
-  ! DFTI_PLACEMENT
-  INTEGER, PARAMETER :: DFTI_INPLACE = 43 ! Result overwrites input
-  INTEGER, PARAMETER :: DFTI_NOT_INPLACE  = 44 ! Have another place for result
-
-  ! DFTI_INITIALIZATION_EFFORT
-  ! INTEGER, PARAMETER :: DFTI_LOW = 45 ! NOT IMPLEMENTED
-  ! INTEGER, PARAMETER :: DFTI_MEDIUM = 46 ! NOT IMPLEMENTED
-  ! INTEGER, PARAMETER :: DFTI_HIGH = 47 ! NOT IMPLEMENTED
-
-  ! DFTI_ORDERING
-  INTEGER, PARAMETER :: DFTI_ORDERED = 48
-  INTEGER, PARAMETER :: DFTI_BACKWARD_SCRAMBLED = 49
-  ! INTEGER, PARAMETER :: DFTI_FORWARD_SCRAMBLED  = 50 ! NOT IMPLEMENTED
-
-  ! Allow/avoid certain usages
-  INTEGER, PARAMETER :: DFTI_ALLOW = 51 ! Allow transposition or workspace
-  INTEGER, PARAMETER :: DFTI_AVOID = 52 ! Avoid auxiliary storage
-  INTEGER, PARAMETER :: DFTI_NONE = 53
-
-  ! DFTI_PACKED_FORMAT
-  ! (for storing congugate-even finite sequence in real array)
-  INTEGER, PARAMETER :: DFTI_CCS_FORMAT = 54  ! Complex conjugate-symmetric
-  INTEGER, PARAMETER :: DFTI_PACK_FORMAT = 55 ! Pack format for real DFT
-  INTEGER, PARAMETER :: DFTI_PERM_FORMAT = 56 ! Perm format for real DFT
-  INTEGER, PARAMETER :: DFTI_CCE_FORMAT = 57  ! Complex conjugate-even
-
-  !======================================================================
-  ! Error classes
-  !======================================================================
-  INTEGER, PARAMETER :: DFTI_NO_ERROR = 0
-  INTEGER, PARAMETER :: DFTI_MEMORY_ERROR = 1
-  INTEGER, PARAMETER :: DFTI_INVALID_CONFIGURATION = 2
-  INTEGER, PARAMETER :: DFTI_INCONSISTENT_CONFIGURATION = 3
-  INTEGER, PARAMETER :: DFTI_MULTITHREADED_ERROR = 4
-  INTEGER, PARAMETER :: DFTI_BAD_DESCRIPTOR = 5
-  INTEGER, PARAMETER :: DFTI_UNIMPLEMENTED = 6
-  INTEGER, PARAMETER :: DFTI_MKL_INTERNAL_ERROR = 7
-  INTEGER, PARAMETER :: DFTI_NUMBER_OF_THREADS_ERROR = 8
-  INTEGER, PARAMETER :: DFTI_1D_LENGTH_EXCEEDS_INT32 = 9
-
-  ! Maximum length of error string
-  INTEGER, PARAMETER :: DFTI_MAX_MESSAGE_LENGTH = 80
-
-  ! Maximum length of user-settable descriptor name
-  INTEGER, PARAMETER :: DFTI_MAX_NAME_LENGTH = 10
-
-  ! Maximum length of MKL version string
-  INTEGER, PARAMETER :: DFTI_VERSION_LENGTH = 198
-
-END MODULE MKL_DFT_TYPE
-
-MODULE MKL_DFTI
-
-  USE MKL_DFT_TYPE
-
-  INTERFACE DftiCreateDescriptor
-
-     ! overloading of DftiCreateDescriptor for 1D DFT
-     FUNCTION dfti_create_descriptor_1d(desc, precision, domain, dim, length)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_create_descriptor_1d
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_create_descriptor_1d
-       INTEGER dfti_create_descriptor_1d
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       INTEGER, INTENT(IN) :: precision
-       INTEGER, INTENT(IN) :: domain
-       INTEGER, INTENT(IN) :: dim, length
-     END FUNCTION dfti_create_descriptor_1d
-
-     ! overloading of DftiCreateDescriptor for nD DFT
-     FUNCTION dfti_create_descriptor_highd(desc, precision, domain, dim,length)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_create_descriptor_highd
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_create_descriptor_highd
-       INTEGER dfti_create_descriptor_highd
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       INTEGER, INTENT(IN) :: precision
-       INTEGER, INTENT(IN) :: domain
-       INTEGER, INTENT(IN) :: dim
-       INTEGER, INTENT(IN), DIMENSION(*) :: length
-     END FUNCTION dfti_create_descriptor_highd
-
-     ! overloading of DftiCreateDescriptor for SP 1D DFT
-     ! second parameter (precision) should be any REAL*4 value
-     ! for dispatching during compile time
-     FUNCTION dfti_create_descriptor_s_1d(desc, s, dom, one, dim)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_create_descriptor_s_1d
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_create_descriptor_s_1d
-       INTEGER dfti_create_descriptor_s_1d
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_SPKP), INTENT(IN) :: s
-       INTEGER, INTENT(IN) :: dom
-       INTEGER, INTENT(IN) :: one
-       INTEGER, INTENT(IN) :: dim
-     END FUNCTION dfti_create_descriptor_s_1d
-
-     ! overloading of DftiCreateDescriptor for SP nD DFT
-     ! second parameter (precision) should be any REAL*4 value
-     ! for dispatching during compile time
-     FUNCTION dfti_create_descriptor_s_md(desc, s, dom, many, dims)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_create_descriptor_s_md
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_create_descriptor_s_md
-       INTEGER dfti_create_descriptor_s_md
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_SPKP), INTENT(IN) :: s
-       INTEGER, INTENT(IN) :: dom
-       INTEGER, INTENT(IN) :: many
-       INTEGER, INTENT(IN), DIMENSION(*) :: dims
-     END FUNCTION dfti_create_descriptor_s_md
-
-     ! overloading of DftiCreateDescriptor for DP 1D DFT
-     ! second parameter (precision) should be any REAL*8 value
-     ! for dispatching during compile time
-     FUNCTION dfti_create_descriptor_d_1d(desc, d, dom, one, dim)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_create_descriptor_d_1d
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_create_descriptor_d_1d
-       INTEGER dfti_create_descriptor_d_1d
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_DPKP), INTENT(IN) :: d
-       INTEGER, INTENT(IN) :: dom
-       INTEGER, INTENT(IN) :: one
-       INTEGER, INTENT(IN) :: dim
-     END FUNCTION dfti_create_descriptor_d_1d
-
-     ! overloading of DftiCreateDescriptor for DP nD DFT
-     ! second parameter (precision) should be any REAL*8 value
-     ! for dispatching during compile time
-     FUNCTION dfti_create_descriptor_d_md(desc, d, dom, many, dims)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_create_descriptor_d_md
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_create_descriptor_d_md
-       INTEGER dfti_create_descriptor_d_md
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_DPKP), INTENT(IN) :: d
-       INTEGER, INTENT(IN) :: dom
-       INTEGER, INTENT(IN) :: many
-       INTEGER, INTENT(IN), DIMENSION(*) :: dims
-     END FUNCTION dfti_create_descriptor_d_md
-
-  END INTERFACE
-
-  INTERFACE DftiCopyDescriptor
-
-     FUNCTION dfti_copy_descriptor_external(desc, new_desc)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_copy_descriptor_external
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_copy_descriptor_external
-       INTEGER dfti_copy_descriptor_external
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       TYPE(DFTI_DESCRIPTOR), POINTER :: new_desc
-     END FUNCTION dfti_copy_descriptor_external
-
-  END INTERFACE
-
-  INTERFACE DftiCommitDescriptor
-
-     FUNCTION dfti_commit_descriptor_external(desc)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_commit_descriptor_external
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_commit_descriptor_external
-       INTEGER dfti_commit_descriptor_external
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-     END FUNCTION dfti_commit_descriptor_external
-
-  END INTERFACE
-
-  INTERFACE DftiSetValue
-
-     ! overloading of DftiSetValue for integer value
-     FUNCTION dfti_set_value_intval(desc, OptName, IntVal)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_set_value_intval
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_set_value_intval
-       INTEGER dfti_set_value_intval
-       INTEGER, INTENT(IN) :: OptName
-       INTEGER, INTENT(IN) :: IntVal
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-     END FUNCTION dfti_set_value_intval
-
-     ! overloading of DftiSetValue for SP value
-     FUNCTION dfti_set_value_sglval(desc, OptName, sglval)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_set_value_sglval
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_set_value_sglval
-       INTEGER dfti_set_value_sglval
-       INTEGER, INTENT(IN) :: OptName
-       REAL(DFTI_SPKP), INTENT(IN) :: sglval
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-     END FUNCTION dfti_set_value_sglval
-
-     ! overloading of DftiSetValue for DP value
-     FUNCTION dfti_set_value_dblval(desc, OptName, DblVal)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_set_value_dblval
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_set_value_dblval
-       INTEGER dfti_set_value_dblval
-       INTEGER, INTENT(IN) :: OptName
-       REAL(DFTI_DPKP), INTENT(IN) :: DblVal
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-     END FUNCTION dfti_set_value_dblval
-
-     ! overloading of DftiSetValue for integer vector
-     FUNCTION dfti_set_value_intvec(desc, OptName, IntVec)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_set_value_intvec
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_set_value_intvec
-       INTEGER dfti_set_value_intvec
-       INTEGER, INTENT(IN) :: OptName
-       INTEGER, INTENT(IN), DIMENSION(*) :: IntVec
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-     END FUNCTION dfti_set_value_intvec
-
-     ! overloading of DftiSetValue for char vector
-     FUNCTION dfti_set_value_chars(desc, OptName, Chars)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_set_value_chars
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_set_value_chars
-       INTEGER dfti_set_value_chars
-       INTEGER, INTENT(IN) :: OptName
-       CHARACTER(*), INTENT(IN) :: Chars
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-     END FUNCTION dfti_set_value_chars
-
-  END INTERFACE
-
-  INTERFACE DftiGetValue
-
-     ! overloading of DftiGetValue for integer value
-     FUNCTION dfti_get_value_intval(desc, OptName, IntVal)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_get_value_intval
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_get_value_intval
-       INTEGER dfti_get_value_intval
-       INTEGER, INTENT(IN) :: OptName
-       INTEGER, INTENT(OUT) :: IntVal
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-     END FUNCTION dfti_get_value_intval
-
-     ! overloading of DftiGetValue for SP value
-     FUNCTION dfti_get_value_sglval(desc, OptName, sglval)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_get_value_sglval
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_get_value_sglval
-       INTEGER dfti_get_value_sglval
-       INTEGER, INTENT(IN) :: OptName
-       REAL(DFTI_SPKP), INTENT(OUT) :: sglval
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-     END FUNCTION dfti_get_value_sglval
-
-     ! overloading of DftiGetValue for DP value
-     FUNCTION dfti_get_value_dblval(desc, OptName, DblVal)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_get_value_dblval
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_get_value_dblval
-       INTEGER dfti_get_value_dblval
-       INTEGER, INTENT(IN) :: OptName
-       REAL(DFTI_DPKP), INTENT(OUT) :: DblVal
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-     END FUNCTION dfti_get_value_dblval
-
-     ! overloading of DftiGetValue for integer vector
-     FUNCTION dfti_get_value_intvec(desc, OptName, IntVec)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_get_value_intvec
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_get_value_intvec
-       INTEGER dfti_get_value_intvec
-       INTEGER, INTENT(IN) :: OptName
-       INTEGER, INTENT(OUT), DIMENSION(*) :: IntVec
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-     END FUNCTION dfti_get_value_intvec
-
-     ! overloading of DftiGetValue for char vector
-     FUNCTION dfti_get_value_chars(desc, OptName, Chars)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_get_value_chars
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_get_value_chars
-       INTEGER dfti_get_value_chars
-       INTEGER, INTENT(IN) :: OptName
-       CHARACTER(*), INTENT(OUT) :: Chars
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-     END FUNCTION dfti_get_value_chars
-
-  END INTERFACE
-
-  INTERFACE DftiComputeForward
-
-     ! overloading of DftiComputeForward for SP R2C DFT (inplace)
-     FUNCTION dfti_compute_forward_s(desc,sSrcDst)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_forward_s
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_forward_s
-       INTEGER dfti_compute_forward_s
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_SPKP), INTENT(INOUT), DIMENSION(*) :: sSrcDst
-     END FUNCTION dfti_compute_forward_s
-
-     ! overloading of DftiComputeForward for SP C2C DFT (inplace)
-     FUNCTION dfti_compute_forward_c(desc,cSrcDst)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_forward_c
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_forward_c
-       INTEGER dfti_compute_forward_c
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       COMPLEX(DFTI_SPKP), INTENT(INOUT), DIMENSION(*) :: cSrcDst
-     END FUNCTION dfti_compute_forward_c
-
-     ! overloading of DftiComputeForward for SP C2C DFT (inplace, split complex)
-     FUNCTION dfti_compute_forward_ss(desc,sSrcDstRe,sSrcDstIm)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_forward_ss
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_forward_ss
-       INTEGER dfti_compute_forward_ss
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_SPKP), DIMENSION(*) :: sSrcDstRe
-       REAL(DFTI_SPKP), DIMENSION(*) :: sSrcDstIm
-     END FUNCTION dfti_compute_forward_ss
-
-     ! overloading of DftiComputeForward for SP R2C DFT (out-of-place)
-     FUNCTION dfti_compute_forward_sc(desc,sSrc,cDst)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_forward_sc
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_forward_sc
-       INTEGER dfti_compute_forward_sc
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_SPKP), INTENT(IN), DIMENSION(*) :: sSrc
-       COMPLEX(DFTI_SPKP), INTENT(OUT), DIMENSION(*) :: cDst
-     END FUNCTION dfti_compute_forward_sc
-
-     ! overloading of DftiComputeForward for SP C2C DFT (out-of-place)
-     FUNCTION dfti_compute_forward_cc(desc,cSrc,cDst)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_forward_cc
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_forward_cc
-       INTEGER dfti_compute_forward_cc
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       COMPLEX(DFTI_SPKP), INTENT(IN), DIMENSION(*) :: cSrc
-       COMPLEX(DFTI_SPKP), INTENT(OUT), DIMENSION(*) :: cDst
-     END FUNCTION dfti_compute_forward_cc
-
-     ! overloading of DftiComputeForward for SP C2C DFT (out-of-place, split
-     ! complex)
-     FUNCTION dfti_compute_forward_ssss(desc,sSrcRe,sSrcIm,sDstRe,sDstIm)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_forward_ssss
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_forward_ssss
-       INTEGER dfti_compute_forward_ssss
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_SPKP), INTENT(IN), DIMENSION(*) :: sSrcRe
-       REAL(DFTI_SPKP), INTENT(IN), DIMENSION(*) :: sSrcIm
-       REAL(DFTI_SPKP), INTENT(OUT), DIMENSION(*) :: sDstRe
-       REAL(DFTI_SPKP), INTENT(OUT), DIMENSION(*) :: sDstIm
-     END FUNCTION dfti_compute_forward_ssss
-
-     ! overloading of DftiComputeForward for DP R2C DFT (inplace)
-     FUNCTION dfti_compute_forward_d(desc,dSrcDst)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_forward_d
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_forward_d
-       INTEGER dfti_compute_forward_d
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_DPKP), INTENT(INOUT), DIMENSION(*) :: dSrcDst
-     END FUNCTION dfti_compute_forward_d
-
-     ! overloading of DftiComputeForward for DP C2C DFT (inplace)
-     FUNCTION dfti_compute_forward_z(desc,zSrcDst)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_forward_z
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_forward_z
-       INTEGER dfti_compute_forward_z
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       COMPLEX(DFTI_DPKP), INTENT(INOUT), DIMENSION(*) :: zSrcDst
-     END FUNCTION dfti_compute_forward_z
-
-     ! overloading of DftiComputeForward for DP C2C DFT (inplace, split complex)
-     FUNCTION dfti_compute_forward_dd(desc,dSrcDstRe,dSrcDstIm)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_forward_dd
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_forward_dd
-       INTEGER dfti_compute_forward_dd
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_DPKP), DIMENSION(*) :: dSrcDstRe
-       REAL(DFTI_DPKP), DIMENSION(*) :: dSrcDstIm
-     END FUNCTION dfti_compute_forward_dd
-
-     ! overloading of DftiComputeForward for DP R2C DFT (out-of-place)
-     FUNCTION dfti_compute_forward_dz(desc,dSrc,zDst)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_forward_dz
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_forward_dz
-       INTEGER dfti_compute_forward_dz
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_DPKP), INTENT(IN), DIMENSION(*) :: dSrc
-       COMPLEX(DFTI_DPKP), INTENT(OUT), DIMENSION(*) :: zDst
-     END FUNCTION dfti_compute_forward_dz
-
-     ! overloading of DftiComputeForward for DP C2C DFT (out-of-place)
-     FUNCTION dfti_compute_forward_zz(desc,zSrc,zDst)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_forward_zz
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_forward_zz
-       INTEGER dfti_compute_forward_zz
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       COMPLEX(DFTI_DPKP), INTENT(IN), DIMENSION(*) :: zSrc
-       COMPLEX(DFTI_DPKP), INTENT(OUT), DIMENSION(*) :: zDst
-     END FUNCTION dfti_compute_forward_zz
-
-     ! overloading of DftiComputeForward for DP C2C DFT (out-of-place, split
-     ! complex)
-     FUNCTION dfti_compute_forward_dddd(desc,dSrcRe,dSrcIm,dDstRe,dDstIm)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_forward_dddd
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_forward_dddd
-       INTEGER dfti_compute_forward_dddd
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_DPKP), INTENT(IN), DIMENSION(*) :: dSrcRe
-       REAL(DFTI_DPKP), INTENT(IN), DIMENSION(*) :: dSrcIm
-       REAL(DFTI_DPKP), INTENT(OUT), DIMENSION(*) :: dDstRe
-       REAL(DFTI_DPKP), INTENT(OUT), DIMENSION(*) :: dDstIm
-     END FUNCTION dfti_compute_forward_dddd
-
-  END INTERFACE DftiComputeForward
-
-  INTERFACE DftiComputeBackward
-
-
-     ! overloading of DftiComputeBackward for SP C2R DFT (inplace)
-     FUNCTION dfti_compute_backward_s(desc,sSrcDst)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_backward_s
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_backward_s
-       INTEGER dfti_compute_backward_s
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_SPKP), INTENT(INOUT), DIMENSION(*) :: sSrcDst
-     END FUNCTION dfti_compute_backward_s
-
-     ! overloading of DftiComputeBackward for SP C2C DFT (inplace)
-     FUNCTION dfti_compute_backward_c(desc,cSrcDst)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_backward_c
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_backward_c
-       INTEGER dfti_compute_backward_c
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       COMPLEX(DFTI_SPKP), INTENT(INOUT), DIMENSION(*) :: cSrcDst
-     END FUNCTION dfti_compute_backward_c
-
-     ! overloading of DftiComputeBackward for SP C2C DFT (inplace, split complex)
-     FUNCTION dfti_compute_backward_ss(desc,sSrcDstRe,sSrcDstIm)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_backward_ss
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_backward_ss
-       INTEGER dfti_compute_backward_ss
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_SPKP), DIMENSION(*) :: sSrcDstRe
-       REAL(DFTI_SPKP), DIMENSION(*) :: sSrcDstIm
-     END FUNCTION dfti_compute_backward_ss
-
-     ! overloading of DftiComputeBackward for SP C2R DFT (out-of-place)
-     FUNCTION dfti_compute_backward_cs(desc,cSrc,sDst)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_backward_cs
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_backward_cs
-       INTEGER dfti_compute_backward_cs
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       COMPLEX(DFTI_SPKP), INTENT(IN), DIMENSION(*) :: cSrc
-       REAL(DFTI_SPKP), INTENT(OUT), DIMENSION(*) :: sDst
-     END FUNCTION dfti_compute_backward_cs
-
-     ! overloading of DftiComputeBackward for SP C2C DFT (out-of-place)
-     FUNCTION dfti_compute_backward_cc(desc,cSrc,cDst)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_backward_cc
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_backward_cc
-       INTEGER dfti_compute_backward_cc
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       COMPLEX(DFTI_SPKP), INTENT(IN), DIMENSION(*) :: cSrc
-       COMPLEX(DFTI_SPKP), INTENT(OUT), DIMENSION(*) :: cDst
-     END FUNCTION dfti_compute_backward_cc
-
-     ! overloading of DftiComputeBackward for SP C2C DFT (out-of-place, split
-     ! complex)
-     FUNCTION dfti_compute_backward_ssss(desc,sSrcRe,sSrcIm,sDstRe,sDstIm)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_backward_ssss
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_backward_ssss
-       INTEGER dfti_compute_backward_ssss
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_SPKP), INTENT(IN), DIMENSION(*) :: sSrcRe
-       REAL(DFTI_SPKP), INTENT(IN), DIMENSION(*) :: sSrcIm
-       REAL(DFTI_SPKP), INTENT(OUT), DIMENSION(*) :: sDstRe
-       REAL(DFTI_SPKP), INTENT(OUT), DIMENSION(*) :: sDstIm
-     END FUNCTION dfti_compute_backward_ssss
-
-     ! overloading of DftiComputeBackward for DP C2R DFT (inplace)
-     FUNCTION dfti_compute_backward_d(desc,dSrcDst)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_backward_d
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_backward_d
-       INTEGER dfti_compute_backward_d
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_DPKP), INTENT(INOUT), DIMENSION(*) :: dSrcDst
-     END FUNCTION dfti_compute_backward_d
-
-     ! overloading of DftiComputeBackward for DP C2C DFT (inplace)
-     FUNCTION dfti_compute_backward_z(desc,zSrcDst)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_backward_z
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_backward_z
-       INTEGER dfti_compute_backward_z
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       COMPLEX(DFTI_DPKP), INTENT(INOUT), DIMENSION(*) :: zSrcDst
-     END FUNCTION dfti_compute_backward_z
-
-     ! overloading of DftiComputeBackward for DP C2C DFT (inplace, split complex)
-     FUNCTION dfti_compute_backward_dd(desc,dSrcDstRe,dSrcDstIm)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_backward_dd
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_backward_dd
-       INTEGER dfti_compute_backward_dd
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_DPKP), DIMENSION(*) :: dSrcDstRe
-       REAL(DFTI_DPKP), DIMENSION(*) :: dSrcDstIm
-     END FUNCTION dfti_compute_backward_dd
-
-     ! overloading of DftiComputeBackward for DP C2R DFT (out-of-place)
-     FUNCTION dfti_compute_backward_zd(desc,zSrc,dDst)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_backward_zd
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_backward_zd
-       INTEGER dfti_compute_backward_zd
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       COMPLEX(DFTI_DPKP), INTENT(IN), DIMENSION(*) :: zSrc
-       REAL(DFTI_DPKP), INTENT(OUT), DIMENSION(*) :: dDst
-     END FUNCTION dfti_compute_backward_zd
-
-     ! overloading of DftiComputeBackward for DP C2C DFT (out-of-place)
-     FUNCTION dfti_compute_backward_zz(desc,zSrc,zDst)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_backward_zz
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_backward_zz
-       INTEGER dfti_compute_backward_zz
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       COMPLEX(DFTI_DPKP), INTENT(IN), DIMENSION(*) :: zSrc
-       COMPLEX(DFTI_DPKP), INTENT(OUT), DIMENSION(*) :: zDst
-     END FUNCTION dfti_compute_backward_zz
-
-     ! overloading of DftiComputeBackward for DP C2C DFT (out-of-place, split
-     ! complex)
-     FUNCTION dfti_compute_backward_dddd(desc,dSrcRe,dSrcIm,dDstRe,dDstIm)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_compute_backward_dddd
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_compute_backward_dddd
-       INTEGER dfti_compute_backward_dddd
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-       REAL(DFTI_DPKP), INTENT(IN), DIMENSION(*) :: dSrcRe
-       REAL(DFTI_DPKP), INTENT(IN), DIMENSION(*) :: dSrcIm
-       REAL(DFTI_DPKP), INTENT(OUT), DIMENSION(*) :: dDstRe
-       REAL(DFTI_DPKP), INTENT(OUT), DIMENSION(*) :: dDstIm
-     END FUNCTION dfti_compute_backward_dddd
-
-  END INTERFACE DftiComputeBackward
-
-  INTERFACE DftiFreeDescriptor
-
-     FUNCTION dfti_free_descriptor_external(desc)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_free_descriptor_external
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_free_descriptor_external
-       INTEGER dfti_free_descriptor_external
-       TYPE(DFTI_DESCRIPTOR), POINTER :: desc
-     END FUNCTION dfti_free_descriptor_external
-
-  END INTERFACE
-
-  INTERFACE DftiErrorClass
-
-     FUNCTION dfti_error_class_external(Status, ErrorClass)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_error_class_external
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_error_class_external
-       LOGICAL dfti_error_class_external
-       INTEGER, INTENT(IN) :: Status
-       INTEGER, INTENT(IN) :: ErrorClass
-     END FUNCTION dfti_error_class_external
-
-  END INTERFACE
-
-  INTERFACE DftiErrorMessage
-
-     FUNCTION dfti_error_message_external(Status)
-       USE MKL_DFT_TYPE
-       !DEC$ ATTRIBUTES C :: dfti_error_message_external
-       !DEC$ ATTRIBUTES REFERENCE :: dfti_error_message_external
-       CHARACTER(LEN=DFTI_MAX_MESSAGE_LENGTH) :: dfti_error_message_external
-       INTEGER, INTENT(IN) :: Status
-     END FUNCTION dfti_error_message_external
-
-  END INTERFACE
-
-END MODULE MKL_DFTI
diff --git a/decomp2d/transpose_x_to_y.inc b/decomp2d/transpose_x_to_y.inc
deleted file mode 100644
index ddb7f3528..000000000
--- a/decomp2d/transpose_x_to_y.inc
+++ /dev/null
@@ -1,514 +0,0 @@
-! -*- mode: f90 -*-
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This file contains the routines that transpose data from X to Y pencil
-
-subroutine transpose_x_to_y_real(src, dst, opt_decomp)
-
-  implicit none
-
-  real(mytype), dimension(:,:,:), intent(IN) :: src
-  real(mytype), dimension(:,:,:), intent(OUT) :: dst
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-#ifdef SHM
-  real(mytype) :: work1(*), work2(*)
-  POINTER  (work1_p, work1), (work2_p, work2)  ! Cray pointers
-#endif
-
-  integer :: s1,s2,s3,d1,d2,d3
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  s1 = SIZE(src,1)
-  s2 = SIZE(src,2)
-  s3 = SIZE(src,3)
-  d1 = SIZE(dst,1)
-  d2 = SIZE(dst,2)
-  d3 = SIZE(dst,3)
-
-  ! rearrange source array as send buffer
-#ifdef SHM
-  work1_p = decomp%COL_INFO%SND_P
-  call mem_split_xy_real(src, s1, s2, s3, work1, dims(1), &
-       decomp%x1dist, decomp)
-#else
-  call mem_split_xy_real(src, s1, s2, s3, work1_r, dims(1), &
-       decomp%x1dist, decomp)
-#endif
-
-  ! define receive buffer
-#ifdef SHM
-  work2_p = decomp%COL_INFO%RCV_P
-  call MPI_BARRIER(decomp%COL_INFO%CORE_COMM, ierror)
-#endif
-
-  ! transpose using MPI_ALLTOALL(V)
-#ifdef SHM
-  if (decomp%COL_INFO%CORE_ME==1) THEN
-     call MPI_ALLTOALLV(work1, decomp%x1cnts_s, decomp%x1disp_s, &
-          real_type, work2, decomp%y1cnts_s, decomp%y1disp_s, &
-          real_type, decomp%COL_INFO%SMP_COMM, ierror)
-  end if
-#else
-#ifdef EVEN
-  call MPI_ALLTOALL(work1_r, decomp%x1count, &
-       real_type, work2_r, decomp%y1count, &
-       real_type, DECOMP_2D_COMM_COL, ierror)
-#else
-  call MPI_ALLTOALLV(work1_r, decomp%x1cnts, decomp%x1disp, &
-       real_type, work2_r, decomp%y1cnts, decomp%y1disp, &
-       real_type, DECOMP_2D_COMM_COL, ierror)
-#endif
-#endif
-
-  ! rearrange receive buffer
-#ifdef SHM
-  call MPI_BARRIER(decomp%COL_INFO%CORE_COMM, ierror)
-  call mem_merge_xy_real(work2, d1, d2, d3, dst, dims(1), &
-       decomp%y1dist, decomp)
-#else
-  call mem_merge_xy_real(work2_r, d1, d2, d3, dst, dims(1), &
-       decomp%y1dist, decomp)
-#endif
-
-  return
-end subroutine transpose_x_to_y_real
-
-
-#ifdef OCC
-subroutine transpose_x_to_y_real_start(handle, src, dst, sbuf, rbuf, &
-     opt_decomp)
-
-  implicit none
-
-  integer :: handle
-  real(mytype), dimension(:,:,:) :: src, dst, sbuf, rbuf
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-  integer :: s1,s2,s3
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  s1 = SIZE(src,1)
-  s2 = SIZE(src,2)
-  s3 = SIZE(src,3)
-
-  ! rearrange source array as send buffer
-  call mem_split_xy_real(src, s1, s2, s3, sbuf, dims(1), &
-       decomp%x1dist, decomp)
-
-#ifdef EVEN
-  call NBC_IALLTOALL(sbuf, decomp%x1count, real_type, &
-       rbuf, decomp%y1count, real_type, &
-       DECOMP_2D_COMM_COL, handle, ierror)
-#else
-  call NBC_IALLTOALLV(sbuf, decomp%x1cnts, decomp%x1disp, real_type, &
-       rbuf, decomp%y1cnts, decomp%y1disp, real_type, &
-       DECOMP_2D_COMM_COL, handle, ierror)
-#endif
-
-  return
-end subroutine transpose_x_to_y_real_start
-
-
-subroutine transpose_x_to_y_real_wait(handle, src, dst, sbuf, rbuf, &
-     opt_decomp)
-
-  implicit none
-
-  integer :: handle
-  real(mytype), dimension(:,:,:) :: src, dst, sbuf, rbuf
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-  integer :: d1,d2,d3
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  d1 = SIZE(dst,1)
-  d2 = SIZE(dst,2)
-  d3 = SIZE(dst,3)
-
-  call NBC_WAIT(handle, ierror)
-
-  ! rearrange receive buffer
-  call mem_merge_xy_real(rbuf, d1, d2, d3, dst, dims(1), &
-       decomp%y1dist, decomp)
-
-  return
-end subroutine transpose_x_to_y_real_wait
-#endif
-
-
-subroutine transpose_x_to_y_complex(src, dst, opt_decomp)
-
-  implicit none
-
-  complex(mytype), dimension(:,:,:), intent(IN) :: src
-  complex(mytype), dimension(:,:,:), intent(OUT) :: dst
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-#ifdef SHM
-  complex(mytype) :: work1(*), work2(*)
-  POINTER  (work1_p, work1), (work2_p, work2)  ! Cray pointers
-#endif
-
-  integer :: s1,s2,s3,d1,d2,d3
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  s1 = SIZE(src,1)
-  s2 = SIZE(src,2)
-  s3 = SIZE(src,3)
-  d1 = SIZE(dst,1)
-  d2 = SIZE(dst,2)
-  d3 = SIZE(dst,3)
-
-  ! rearrange source array as send buffer
-#ifdef SHM
-  work1_p = decomp%COL_INFO%SND_P_c
-  call mem_split_xy_complex(src, s1, s2, s3, work1, dims(1), &
-       decomp%x1dist, decomp)
-#else
-  call mem_split_xy_complex(src, s1, s2, s3, work1_c, dims(1), &
-       decomp%x1dist, decomp)
-#endif
-
-  ! define receive buffer
-#ifdef SHM
-  work2_p = decomp%COL_INFO%RCV_P_c
-  call MPI_BARRIER(decomp%COL_INFO%CORE_COMM, ierror)
-#endif
-
-  ! transpose using MPI_ALLTOALL(V)
-#ifdef SHM
-  if (decomp%COL_INFO%CORE_ME==1) THEN
-     call MPI_ALLTOALLV(work1, decomp%x1cnts_s, decomp%x1disp_s, &
-          complex_type, work2, decomp%y1cnts_s, decomp%y1disp_s, &
-          complex_type, decomp%COL_INFO%SMP_COMM, ierror)
-  end if
-#else
-#ifdef EVEN
-  call MPI_ALLTOALL(work1_c, decomp%x1count, &
-       complex_type, work2_c, decomp%y1count, &
-       complex_type, DECOMP_2D_COMM_COL, ierror)
-#else
-  call MPI_ALLTOALLV(work1_c, decomp%x1cnts, decomp%x1disp, &
-       complex_type, work2_c, decomp%y1cnts, decomp%y1disp, &
-       complex_type, DECOMP_2D_COMM_COL, ierror)
-#endif
-#endif
-
-  ! rearrange receive buffer
-#ifdef SHM
-  call MPI_BARRIER(decomp%COL_INFO%CORE_COMM, ierror)
-  call mem_merge_xy_complex(work2, d1, d2, d3, dst, dims(1), &
-       decomp%y1dist, decomp)
-#else
-  call mem_merge_xy_complex(work2_c, d1, d2, d3, dst, dims(1), &
-       decomp%y1dist, decomp)
-#endif
-
-  return
-end subroutine transpose_x_to_y_complex
-
-
-#ifdef OCC
-subroutine transpose_x_to_y_complex_start(handle, src, dst, sbuf, &
-     rbuf, opt_decomp)
-
-  implicit none
-
-  integer :: handle
-  complex(mytype), dimension(:,:,:) :: src, dst, sbuf, rbuf
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-  integer :: s1,s2,s3
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  s1 = SIZE(src,1)
-  s2 = SIZE(src,2)
-  s3 = SIZE(src,3)
-
-  ! rearrange source array as send buffer
-  call mem_split_xy_complex(src, s1, s2, s3, sbuf, dims(1), &
-       decomp%x1dist, decomp)
-
-#ifdef EVEN
-  call NBC_IALLTOALL(sbuf, decomp%x1count, &
-       complex_type, rbuf, decomp%y1count, &
-       complex_type, DECOMP_2D_COMM_COL, handle, ierror)
-#else
-  call NBC_IALLTOALLV(sbuf, decomp%x1cnts, decomp%x1disp, &
-       complex_type, rbuf, decomp%y1cnts, decomp%y1disp, &
-       complex_type, DECOMP_2D_COMM_COL, handle, ierror)
-#endif
-
-  return
-end subroutine transpose_x_to_y_complex_start
-
-
-subroutine transpose_x_to_y_complex_wait(handle, src, dst, sbuf, &
-     rbuf, opt_decomp)
-
-  implicit none
-
-  integer :: handle
-  complex(mytype), dimension(:,:,:) :: src, dst, sbuf, rbuf
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-  integer :: d1,d2,d3
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  d1 = SIZE(dst,1)
-  d2 = SIZE(dst,2)
-  d3 = SIZE(dst,3)
-
-  call NBC_WAIT(handle, ierror)
-
-  ! rearrange receive buffer
-  call mem_merge_xy_complex(rbuf, d1, d2, d3, dst, dims(1), &
-       decomp%y1dist, decomp)
-
-  return
-end subroutine transpose_x_to_y_complex_wait
-#endif
-
-
-! pack/unpack ALLTOALL(V) buffers
-
-subroutine mem_split_xy_real(in,n1,n2,n3,out,iproc,dist,decomp)
-
-  implicit none
-
-  integer, intent(IN) :: n1,n2,n3
-  real(mytype), dimension(n1,n2,n3), intent(IN) :: in
-  real(mytype), dimension(*), intent(OUT) :: out
-  integer, intent(IN) :: iproc
-  integer, dimension(0:iproc-1), intent(IN) :: dist
-  TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-  integer :: i,j,k, m,i1,i2,pos
-
-  do m=0,iproc-1
-     if (m==0) then 
-        i1 = 1
-        i2 = dist(0)
-     else
-        i1 = i2+1
-        i2 = i1+dist(m)-1
-     end if
-
-#ifdef SHM
-     pos = decomp%x1disp_o(m) + 1
-#else
-#ifdef EVEN
-     pos = m * decomp%x1count + 1
-#else
-     pos = decomp%x1disp(m) + 1
-#endif
-#endif
-
-     do k=1,n3
-        do j=1,n2
-           do i=i1,i2
-              out(pos) = in(i,j,k)
-              pos = pos + 1
-           end do
-        end do
-     end do
-  end do
-
-  return
-end subroutine mem_split_xy_real
-
-
-subroutine mem_split_xy_complex(in,n1,n2,n3,out,iproc,dist,decomp)
-
-  implicit none
-
-  integer, intent(IN) :: n1,n2,n3
-  complex(mytype), dimension(n1,n2,n3), intent(IN) :: in
-  complex(mytype), dimension(*), intent(OUT) :: out
-  integer, intent(IN) :: iproc
-  integer, dimension(0:iproc-1), intent(IN) :: dist
-  TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-  integer :: i,j,k, m,i1,i2,pos
-
-  do m=0,iproc-1
-     if (m==0) then 
-        i1 = 1
-        i2 = dist(0)
-     else
-        i1 = i2+1
-        i2 = i1+dist(m)-1
-     end if
-
-#ifdef SHM
-     pos = decomp%x1disp_o(m) + 1
-#else
-#ifdef EVEN
-     pos = m * decomp%x1count + 1
-#else
-     pos = decomp%x1disp(m) + 1
-#endif
-#endif
-
-     do k=1,n3
-        do j=1,n2
-           do i=i1,i2
-              out(pos) = in(i,j,k)
-              pos = pos + 1
-           end do
-        end do
-     end do
-  end do
-
-  return
-end subroutine mem_split_xy_complex
-
-
-subroutine mem_merge_xy_real(in,n1,n2,n3,out,iproc,dist,decomp)
-
-  implicit none
-
-  integer, intent(IN) :: n1,n2,n3
-  real(mytype), dimension(*), intent(IN) :: in
-  real(mytype), dimension(n1,n2,n3), intent(OUT) :: out
-  integer, intent(IN) :: iproc
-  integer, dimension(0:iproc-1), intent(IN) :: dist
-  TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-  integer :: i,j,k, m,i1,i2, pos
-
-  do m=0,iproc-1
-     if (m==0) then
-        i1 = 1
-        i2 = dist(0)
-     else
-        i1 = i2+1
-        i2 = i1+dist(m)-1
-     end if
-
-#ifdef SHM
-     pos = decomp%y1disp_o(m) + 1
-#else
-#ifdef EVEN
-     pos = m * decomp%y1count + 1
-#else
-     pos = decomp%y1disp(m) + 1
-#endif
-#endif
-
-     do k=1,n3
-        do j=i1,i2
-           do i=1,n1
-              out(i,j,k) = in(pos)
-              pos = pos + 1
-           end do
-        end do
-     end do
-  end do
-
-  return
-end subroutine mem_merge_xy_real
-
-
-subroutine mem_merge_xy_complex(in,n1,n2,n3,out,iproc,dist,decomp)
-
-  implicit none
-
-  integer, intent(IN) :: n1,n2,n3
-  complex(mytype), dimension(*), intent(IN) :: in
-  complex(mytype), dimension(n1,n2,n3), intent(OUT) :: out
-  integer, intent(IN) :: iproc
-  integer, dimension(0:iproc-1), intent(IN) :: dist
-  TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-  integer :: i,j,k, m,i1,i2, pos
-
-  do m=0,iproc-1
-     if (m==0) then
-        i1 = 1
-        i2 = dist(0)
-     else
-        i1 = i2+1
-        i2 = i1+dist(m)-1
-     end if
-
-#ifdef SHM
-     pos = decomp%y1disp_o(m) + 1
-#else
-#ifdef EVEN
-     pos = m * decomp%y1count + 1
-#else
-     pos = decomp%y1disp(m) + 1
-#endif
-#endif
-
-     do k=1,n3
-        do j=i1,i2
-           do i=1,n1
-              out(i,j,k) = in(pos)
-              pos = pos + 1
-           end do
-        end do
-     end do
-  end do
-
-  return
-end subroutine mem_merge_xy_complex
diff --git a/decomp2d/transpose_y_to_x.inc b/decomp2d/transpose_y_to_x.inc
deleted file mode 100644
index ec82699ef..000000000
--- a/decomp2d/transpose_y_to_x.inc
+++ /dev/null
@@ -1,513 +0,0 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This file contains the routines that transpose data from Y to X pencil
-
-subroutine transpose_y_to_x_real(src, dst, opt_decomp)
-
-implicit none
-
-real(mytype), dimension(:,:,:), intent(IN) :: src
-real(mytype), dimension(:,:,:), intent(OUT) :: dst
-TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-TYPE(DECOMP_INFO) :: decomp
-
-#ifdef SHM
-real(mytype) :: work1(*), work2(*)
-POINTER  (work1_p, work1), (work2_p, work2)  ! Cray pointers
-#endif
-
-integer :: s1,s2,s3,d1,d2,d3
-integer :: ierror
-
-if (present(opt_decomp)) then
-decomp = opt_decomp
-else
-decomp = decomp_main
-end if
-
-s1 = SIZE(src,1)
-s2 = SIZE(src,2)
-s3 = SIZE(src,3)
-d1 = SIZE(dst,1)
-d2 = SIZE(dst,2)
-d3 = SIZE(dst,3)
-
-! rearrange source array as send buffer
-#ifdef SHM
-work1_p = decomp%COL_INFO%SND_P
-call mem_split_yx_real(src, s1, s2, s3, work1, dims(1), &
-decomp%y1dist, decomp)
-#else
-call mem_split_yx_real(src, s1, s2, s3, work1_r, dims(1), &
-decomp%y1dist, decomp)
-#endif
-
-! define receive buffer
-#ifdef SHM
-work2_p = decomp%COL_INFO%RCV_P
-call MPI_BARRIER(decomp%COL_INFO%CORE_COMM, ierror)
-#endif
-
-! transpose using MPI_ALLTOALL(V)
-#ifdef SHM
-if (decomp%COL_INFO%CORE_ME==1) THEN
-call MPI_ALLTOALLV(work1, decomp%y1cnts_s, decomp%y1disp_s, &
-real_type, work2, decomp%x1cnts_s, decomp%x1disp_s, &
-real_type, decomp%COL_INFO%SMP_COMM, ierror)
-end if
-#else
-#ifdef EVEN
-call MPI_ALLTOALL(work1_r, decomp%y1count, &
-real_type, work2_r, decomp%x1count, &
-real_type, DECOMP_2D_COMM_COL, ierror)
-#else
-call MPI_ALLTOALLV(work1_r, decomp%y1cnts, decomp%y1disp, &
-real_type, work2_r, decomp%x1cnts, decomp%x1disp, &
-real_type, DECOMP_2D_COMM_COL, ierror)
-#endif
-#endif
-
-! rearrange receive buffer
-#ifdef SHM
-call MPI_BARRIER(decomp%COL_INFO%CORE_COMM, ierror)
-call mem_merge_yx_real(work2, d1, d2, d3, dst, dims(1), &
-decomp%x1dist, decomp)
-#else
-call mem_merge_yx_real(work2_r, d1, d2, d3, dst, dims(1), &
-decomp%x1dist, decomp)
-#endif
-
-return
-end subroutine transpose_y_to_x_real
-
-
-#ifdef OCC
-subroutine transpose_y_to_x_real_start(handle, src, dst, sbuf, rbuf, &
-opt_decomp)
-
-implicit none
-
-integer :: handle
-real(mytype), dimension(:,:,:) :: src, dst, sbuf, rbuf
-TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-TYPE(DECOMP_INFO) :: decomp
-
-integer :: s1,s2,s3
-integer :: ierror
-
-if (present(opt_decomp)) then
-decomp = opt_decomp
-else
-decomp = decomp_main
-end if
-
-s1 = SIZE(src,1)
-s2 = SIZE(src,2)
-s3 = SIZE(src,3)
-
-! rearrange source array as send buffer
-call mem_split_yx_real(src, s1, s2, s3, sbuf, dims(1), &
-decomp%y1dist, decomp)
-
-#ifdef EVEN
-call NBC_IALLTOALL(sbuf, decomp%y1count, real_type, &
-rbuf, decomp%x1count, real_type, &
-DECOMP_2D_COMM_COL, handle, ierror)
-#else
-call NBC_IALLTOALLV(sbuf, decomp%y1cnts, decomp%y1disp, real_type, &
-rbuf, decomp%x1cnts, decomp%x1disp, real_type, &
-DECOMP_2D_COMM_COL, handle, ierror)
-#endif
-
-return
-end subroutine transpose_y_to_x_real_start
-
-
-subroutine transpose_y_to_x_real_wait(handle, src, dst, sbuf, rbuf, &
-opt_decomp)
-
-implicit none
-
-integer :: handle
-real(mytype), dimension(:,:,:) :: src, dst, sbuf, rbuf
-TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-TYPE(DECOMP_INFO) :: decomp
-
-integer :: d1,d2,d3
-integer :: ierror
-
-if (present(opt_decomp)) then
-decomp = opt_decomp
-else
-decomp = decomp_main
-end if
-
-d1 = SIZE(dst,1)
-d2 = SIZE(dst,2)
-d3 = SIZE(dst,3)
-
-call NBC_WAIT(handle, ierror)
-
-! rearrange receive buffer
-call mem_merge_yx_real(rbuf, d1, d2, d3, dst, dims(1), &
-decomp%x1dist, decomp)
-
-return
-end subroutine transpose_y_to_x_real_wait
-#endif
-
-
-subroutine transpose_y_to_x_complex(src, dst, opt_decomp)
-
-implicit none
-
-complex(mytype), dimension(:,:,:), intent(IN) :: src
-complex(mytype), dimension(:,:,:), intent(OUT) :: dst
-TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-TYPE(DECOMP_INFO) :: decomp
-
-#ifdef SHM
-complex(mytype) :: work1(*), work2(*)
-POINTER  (work1_p, work1), (work2_p, work2)  ! Cray pointers
-#endif
-
-integer :: s1,s2,s3,d1,d2,d3
-integer :: ierror
-
-if (present(opt_decomp)) then
-decomp = opt_decomp
-else
-decomp = decomp_main
-end if
-
-s1 = SIZE(src,1)
-s2 = SIZE(src,2)
-s3 = SIZE(src,3)
-d1 = SIZE(dst,1)
-d2 = SIZE(dst,2)
-d3 = SIZE(dst,3)
-
-! rearrange source array as send buffer
-#ifdef SHM
-work1_p = decomp%COL_INFO%SND_P_c
-call mem_split_yx_complex(src, s1, s2, s3, work1, dims(1), &
-decomp%y1dist, decomp)
-#else
-call mem_split_yx_complex(src, s1, s2, s3, work1_c, dims(1), &
-decomp%y1dist, decomp)
-#endif
-
-! define receive buffer
-#ifdef SHM
-work2_p = decomp%COL_INFO%RCV_P_c
-call MPI_BARRIER(decomp%COL_INFO%CORE_COMM, ierror)
-#endif
-
-! transpose using MPI_ALLTOALL(V)
-#ifdef SHM
-if (decomp%COL_INFO%CORE_ME==1) THEN
-call MPI_ALLTOALLV(work1, decomp%y1cnts_s, decomp%y1disp_s, &
-complex_type, work2, decomp%x1cnts_s, decomp%x1disp_s, &
-complex_type, decomp%COL_INFO%SMP_COMM, ierror)
-end if
-#else
-#ifdef EVEN
-call MPI_ALLTOALL(work1_c, decomp%y1count, &
-complex_type, work2_c, decomp%x1count, &
-complex_type, DECOMP_2D_COMM_COL, ierror)
-#else
-call MPI_ALLTOALLV(work1_c, decomp%y1cnts, decomp%y1disp, &
-complex_type, work2_c, decomp%x1cnts, decomp%x1disp, &
-complex_type, DECOMP_2D_COMM_COL, ierror)
-#endif
-#endif
-
-! rearrange receive buffer
-#ifdef SHM
-call MPI_BARRIER(decomp%COL_INFO%CORE_COMM, ierror)
-call mem_merge_yx_complex(work2, d1, d2, d3, dst, dims(1), &
-decomp%x1dist, decomp)
-#else
-call mem_merge_yx_complex(work2_c, d1, d2, d3, dst, dims(1), &
-decomp%x1dist, decomp)
-#endif
-
-return
-end subroutine transpose_y_to_x_complex
-
-
-#ifdef OCC
-subroutine transpose_y_to_x_complex_start(handle, src, dst, sbuf, &
-rbuf, opt_decomp)
-
-implicit none
-
-integer :: handle
-complex(mytype), dimension(:,:,:) :: src, dst, sbuf, rbuf
-TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-TYPE(DECOMP_INFO) :: decomp
-
-integer :: s1,s2,s3
-integer :: ierror
-
-if (present(opt_decomp)) then
-decomp = opt_decomp
-else
-decomp = decomp_main
-end if
-
-s1 = SIZE(src,1)
-s2 = SIZE(src,2)
-s3 = SIZE(src,3)
-
-! rearrange source array as send buffer
-call mem_split_yx_complex(src, s1, s2, s3, sbuf, dims(1), &
-decomp%y1dist, decomp)
-
-#ifdef EVEN
-call NBC_IALLTOALL(sbuf, decomp%y1count, &
-complex_type, rbuf, decomp%x1count, &
-complex_type, DECOMP_2D_COMM_COL, handle, ierror)
-#else
-call NBC_IALLTOALLV(sbuf, decomp%y1cnts, decomp%y1disp, &
-complex_type, rbuf, decomp%x1cnts, decomp%x1disp, &
-complex_type, DECOMP_2D_COMM_COL, handle, ierror)
-#endif
-
-return
-end subroutine transpose_y_to_x_complex_start
-
-
-subroutine transpose_y_to_x_complex_wait(handle, src, dst, sbuf, &
-rbuf, opt_decomp)
-
-implicit none
-
-integer :: handle
-complex(mytype), dimension(:,:,:) :: src, dst, sbuf, rbuf
-TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-TYPE(DECOMP_INFO) :: decomp
-
-integer :: d1,d2,d3
-integer :: ierror
-
-if (present(opt_decomp)) then
-decomp = opt_decomp
-else
-decomp = decomp_main
-end if
-
-d1 = SIZE(dst,1)
-d2 = SIZE(dst,2)
-d3 = SIZE(dst,3)
-
-call NBC_WAIT(handle, ierror)
-
-! rearrange receive buffer
-call mem_merge_yx_complex(rbuf, d1, d2, d3, dst, dims(1), &
-decomp%x1dist, decomp)
-
-return
-end subroutine transpose_y_to_x_complex_wait
-#endif
-
-
-! pack/unpack ALLTOALL(V) buffers
-
-subroutine mem_split_yx_real(in,n1,n2,n3,out,iproc,dist,decomp)
-
-implicit none
-
-integer, intent(IN) :: n1,n2,n3
-real(mytype), dimension(n1,n2,n3), intent(IN) :: in
-real(mytype), dimension(*), intent(OUT) :: out
-integer, intent(IN) :: iproc
-integer, dimension(0:iproc-1), intent(IN) :: dist
-TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-integer :: i,j,k, m,i1,i2,pos
-
-do m=0,iproc-1
-if (m==0) then 
-i1 = 1
-i2 = dist(0)
-else
-i1 = i2+1
-i2 = i1+dist(m)-1
-end if
-
-#ifdef SHM
-pos = decomp%y1disp_o(m) + 1
-#else
-#ifdef EVEN
-pos = m * decomp%y1count + 1
-#else
-pos = decomp%y1disp(m) + 1
-#endif
-#endif
-
-do k=1,n3
-do j=i1,i2
-do i=1,n1
-out(pos) = in(i,j,k)
-pos = pos + 1
-end do
-end do
-end do
-end do
-
-return
-end subroutine mem_split_yx_real
-
-
-subroutine mem_split_yx_complex(in,n1,n2,n3,out,iproc,dist,decomp)
-
-implicit none
-
-integer, intent(IN) :: n1,n2,n3
-complex(mytype), dimension(n1,n2,n3), intent(IN) :: in
-complex(mytype), dimension(*), intent(OUT) :: out
-integer, intent(IN) :: iproc
-integer, dimension(0:iproc-1), intent(IN) :: dist
-TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-integer :: i,j,k, m,i1,i2,pos
-
-do m=0,iproc-1
-if (m==0) then 
-i1 = 1
-i2 = dist(0)
-else
-i1 = i2+1
-i2 = i1+dist(m)-1
-end if
-
-#ifdef SHM
-pos = decomp%y1disp_o(m) + 1
-#else
-#ifdef EVEN
-pos = m * decomp%y1count + 1
-#else
-pos = decomp%y1disp(m) + 1
-#endif
-#endif
-
-do k=1,n3
-do j=i1,i2
-do i=1,n1
-out(pos) = in(i,j,k)
-pos = pos + 1
-end do
-end do
-end do
-end do
-
-return
-end subroutine mem_split_yx_complex
-
-
-subroutine mem_merge_yx_real(in,n1,n2,n3,out,iproc,dist,decomp)
-
-implicit none
-
-integer, intent(IN) :: n1,n2,n3
-real(mytype), dimension(*), intent(IN) :: in
-real(mytype), dimension(n1,n2,n3), intent(OUT) :: out
-integer, intent(IN) :: iproc
-integer, dimension(0:iproc-1), intent(IN) :: dist
-TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-integer :: i,j,k, m,i1,i2, pos
-
-do m=0,iproc-1
-if (m==0) then
-i1 = 1
-i2 = dist(0)
-else
-i1 = i2+1
-i2 = i1+dist(m)-1
-end if
-
-#ifdef SHM
-pos = decomp%x1disp_o(m) + 1
-#else
-#ifdef EVEN
-pos = m * decomp%x1count + 1
-#else
-pos = decomp%x1disp(m) + 1
-#endif
-#endif
-
-do k=1,n3
-do j=1,n2
-do i=i1,i2
-out(i,j,k) = in(pos)
-pos = pos + 1
-end do
-end do
-end do
-end do
-
-return
-end subroutine mem_merge_yx_real
-
-
-subroutine mem_merge_yx_complex(in,n1,n2,n3,out,iproc,dist,decomp)
-
-implicit none
-
-integer, intent(IN) :: n1,n2,n3
-complex(mytype), dimension(*), intent(IN) :: in
-complex(mytype), dimension(n1,n2,n3), intent(OUT) :: out
-integer, intent(IN) :: iproc
-integer, dimension(0:iproc-1), intent(IN) :: dist
-TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-integer :: i,j,k, m,i1,i2, pos
-
-do m=0,iproc-1
-if (m==0) then
-i1 = 1
-i2 = dist(0)
-else
-i1 = i2+1
-i2 = i1+dist(m)-1
-end if
-
-#ifdef SHM
-pos = decomp%x1disp_o(m) + 1
-#else
-#ifdef EVEN
-pos = m * decomp%x1count + 1
-#else
-pos = decomp%x1disp(m) + 1
-#endif
-#endif
-
-do k=1,n3
-do j=1,n2
-do i=i1,i2
-out(i,j,k) = in(pos)
-pos = pos + 1
-end do
-end do
-end do
-end do
-
-return
-end subroutine mem_merge_yx_complex
diff --git a/decomp2d/transpose_y_to_z.inc b/decomp2d/transpose_y_to_z.inc
deleted file mode 100644
index e6fbb14df..000000000
--- a/decomp2d/transpose_y_to_z.inc
+++ /dev/null
@@ -1,524 +0,0 @@
-! -*- mode: f90 -*-
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This file contains the routines that transpose data from Y to Z pencil
-
-subroutine transpose_y_to_z_real(src, dst, opt_decomp)
-
-  implicit none
-
-  real(mytype), dimension(:,:,:), intent(IN) :: src
-  real(mytype), dimension(:,:,:), intent(OUT) :: dst
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-#ifdef SHM
-  real(mytype) :: work1(*), work2(*)
-  POINTER  (work1_p, work1), (work2_p, work2)  ! Cray pointers
-#endif
-
-  integer :: s1,s2,s3,d1,d2,d3
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  s1 = SIZE(src,1)
-  s2 = SIZE(src,2)
-  s3 = SIZE(src,3)
-  d1 = SIZE(dst,1)
-  d2 = SIZE(dst,2)
-  d3 = SIZE(dst,3)
-
-  ! rearrange source array as send buffer
-#ifdef SHM
-  work1_p = decomp%ROW_INFO%SND_P
-  call mem_split_yz_real(src, s1, s2, s3, work1, dims(2), &
-       decomp%y2dist, decomp)
-#else
-  call mem_split_yz_real(src, s1, s2, s3, work1_r, dims(2), &
-       decomp%y2dist, decomp)
-#endif
-
-  ! define receive buffer
-#ifdef SHM
-  work2_p = decomp%ROW_INFO%RCV_P
-  call MPI_BARRIER(decomp%ROW_INFO%CORE_COMM, ierror)
-#endif
-
-#ifdef SHM
-  if (decomp%ROW_INFO%CORE_ME==1) THEN
-     call MPI_ALLTOALLV(work1, decomp%y2cnts_s, decomp%y2disp_s, &
-          real_type, work2, decomp%z2cnts_s, decomp%z2disp_s, &
-          real_type, decomp%ROW_INFO%SMP_COMM, ierror)
-  end if
-#else
-#ifdef EVEN
-  if (decomp%even) then
-     call MPI_ALLTOALL(work1_r, decomp%y2count, &
-          real_type, dst, decomp%z2count, &
-          real_type, DECOMP_2D_COMM_ROW, ierror)
-  else
-     call MPI_ALLTOALL(work1_r, decomp%y2count, &
-          real_type, work2_r, decomp%z2count, &
-          real_type, DECOMP_2D_COMM_ROW, ierror)
-  end if
-#else
-  call MPI_ALLTOALLV(work1_r, decomp%y2cnts, decomp%y2disp, &
-       real_type, dst, decomp%z2cnts, decomp%z2disp, &
-       real_type, DECOMP_2D_COMM_ROW, ierror)
-#endif
-#endif
-
-  ! rearrange receive buffer
-#ifdef SHM
-  call MPI_BARRIER(decomp%ROW_INFO%CORE_COMM, ierror)
-  call mem_merge_yz_real(work2, d1, d2, d3, dst, dims(2), &
-       decomp%z2dist, decomp)
-#else
-#ifdef EVEN
-  if (.not. decomp%even) then
-     call mem_merge_yz_real(work2_r, d1, d2, d3, dst, dims(2), &
-          decomp%z2dist, decomp)
-  end if
-#else
-  ! note the receive buffer is already in natural (i,j,k) order
-  ! so no merge operation needed
-#endif
-#endif
-
-  return
-end subroutine transpose_y_to_z_real
-
-
-#ifdef OCC
-subroutine transpose_y_to_z_real_start(handle, src, dst, sbuf, rbuf, &
-     opt_decomp)
-
-  implicit none
-
-  integer :: handle
-  real(mytype), dimension(:,:,:) :: src, dst, sbuf, rbuf
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-  integer :: s1,s2,s3
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  s1 = SIZE(src,1)
-  s2 = SIZE(src,2)
-  s3 = SIZE(src,3)
-
-  ! rearrange source array as send buffer
-  call mem_split_yz_real(src, s1, s2, s3, sbuf, dims(2), &
-       decomp%y2dist, decomp)
-
-#ifdef EVEN
-  call NBC_IALLTOALL(sbuf, decomp%y2count, real_type, &
-       rbuf, decomp%z2count, real_type, &
-       DECOMP_2D_COMM_ROW, handle, ierror)
-#else
-  call NBC_IALLTOALLV(sbuf, decomp%y2cnts, decomp%y2disp, real_type, &
-       rbuf, decomp%z2cnts, decomp%z2disp, real_type, &
-       DECOMP_2D_COMM_ROW, handle, ierror)
-#endif
-
-  return
-end subroutine transpose_y_to_z_real_start
-
-
-subroutine transpose_y_to_z_real_wait(handle, src, dst, sbuf, rbuf, &
-     opt_decomp)
-
-  implicit none
-
-  integer :: handle
-  real(mytype), dimension(:,:,:) :: src, dst, sbuf, rbuf
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  call NBC_WAIT(handle, ierror)
-
-  dst = rbuf
-
-  return
-end subroutine transpose_y_to_z_real_wait
-#endif
-
-
-subroutine transpose_y_to_z_complex(src, dst, opt_decomp)
-
-  implicit none
-
-  complex(mytype), dimension(:,:,:), intent(IN) :: src
-  complex(mytype), dimension(:,:,:), intent(OUT) :: dst
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-#ifdef SHM
-  complex(mytype) :: work1(*), work2(*)
-  POINTER  (work1_p, work1), (work2_p, work2)  ! Cray pointers
-#endif
-
-  integer :: s1,s2,s3,d1,d2,d3
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  s1 = SIZE(src,1)
-  s2 = SIZE(src,2)
-  s3 = SIZE(src,3)
-  d1 = SIZE(dst,1)
-  d2 = SIZE(dst,2)
-  d3 = SIZE(dst,3)
-
-  ! rearrange source array as send buffer
-#ifdef SHM
-  work1_p = decomp%ROW_INFO%SND_P_c
-  call mem_split_yz_complex(src, s1, s2, s3, work1, dims(2), &
-       decomp%y2dist, decomp)
-#else
-  call mem_split_yz_complex(src, s1, s2, s3, work1_c, dims(2), &
-       decomp%y2dist, decomp)
-#endif
-
-  ! define receive buffer
-#ifdef SHM
-  work2_p = decomp%ROW_INFO%RCV_P_c
-  call MPI_BARRIER(decomp%ROW_INFO%CORE_COMM, ierror)
-#endif
-
-#ifdef SHM
-  if (decomp%ROW_INFO%CORE_ME==1) THEN
-     call MPI_ALLTOALLV(work1, decomp%y2cnts_s, decomp%y2disp_s, &
-          complex_type, work2, decomp%z2cnts_s, decomp%z2disp_s, &
-          complex_type, decomp%ROW_INFO%SMP_COMM, ierror)
-  end if
-#else
-#ifdef EVEN
-  if (decomp%even) then
-     call MPI_ALLTOALL(work1_c, decomp%y2count, &
-          complex_type, dst, decomp%z2count, &
-          complex_type, DECOMP_2D_COMM_ROW, ierror)
-  else
-     call MPI_ALLTOALL(work1_c, decomp%y2count, &
-          complex_type, work2_c, decomp%z2count, &
-          complex_type, DECOMP_2D_COMM_ROW, ierror)
-  end if
-#else
-  call MPI_ALLTOALLV(work1_c, decomp%y2cnts, decomp%y2disp, &
-       complex_type, dst, decomp%z2cnts, decomp%z2disp, &
-       complex_type, DECOMP_2D_COMM_ROW, ierror)
-#endif
-#endif
-
-  ! rearrange receive buffer
-#ifdef SHM
-  call MPI_BARRIER(decomp%ROW_INFO%CORE_COMM, ierror)
-  call mem_merge_yz_complex(work2, d1, d2, d3, dst, dims(2), &
-       decomp%z2dist, decomp)
-#else
-#ifdef EVEN
-  if (.not. decomp%even) then
-     call mem_merge_yz_complex(work2_c, d1, d2, d3, dst, dims(2), &
-          decomp%z2dist, decomp)
-  end if
-#else
-  ! note the receive buffer is already in natural (i,j,k) order
-  ! so no merge operation needed
-#endif
-#endif
-
-  return
-end subroutine transpose_y_to_z_complex
-
-
-#ifdef OCC
-subroutine transpose_y_to_z_complex_start(handle, src, dst, sbuf, &
-     rbuf, opt_decomp)
-
-  implicit none
-
-  integer :: handle
-  complex(mytype), dimension(:,:,:) :: src, dst, sbuf, rbuf
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-  integer :: s1,s2,s3
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  s1 = SIZE(src,1)
-  s2 = SIZE(src,2)
-  s3 = SIZE(src,3)
-
-  ! rearrange source array as send buffer
-  call mem_split_yz_complex(src, s1, s2, s3, sbuf, dims(2), &
-       decomp%y2dist, decomp)
-
-#ifdef EVEN
-  call NBC_IALLTOALL(sbuf, decomp%y2count, &
-       complex_type, rbuf, decomp%z2count, &
-       complex_type, DECOMP_2D_COMM_ROW, handle, ierror)
-#else
-  call NBC_IALLTOALLV(sbuf, decomp%y2cnts, decomp%y2disp, &
-       complex_type, rbuf,decomp%z2cnts, decomp%z2disp, &
-       complex_type, DECOMP_2D_COMM_ROW, handle, ierror)
-#endif
-
-  return
-end subroutine transpose_y_to_z_complex_start
-
-
-subroutine transpose_y_to_z_complex_wait(handle, src, dst, sbuf, &
-     rbuf, opt_decomp)
-
-  implicit none
-
-  integer :: handle
-  complex(mytype), dimension(:,:,:) :: src, dst, sbuf, rbuf
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  call NBC_WAIT(handle, ierror)
-
-  dst = rbuf
-
-  return
-end subroutine transpose_y_to_z_complex_wait
-#endif
-
-
-! pack/unpack ALLTOALL(V) buffers
-
-subroutine mem_split_yz_real(in,n1,n2,n3,out,iproc,dist,decomp)
-
-  implicit none
-
-  integer, intent(IN) :: n1,n2,n3
-  real(mytype), dimension(n1,n2,n3), intent(IN) :: in
-  real(mytype), dimension(*), intent(OUT) :: out
-  integer, intent(IN) :: iproc
-  integer, dimension(0:iproc-1), intent(IN) :: dist
-  TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-  integer :: i,j,k, m,i1,i2,pos
-
-  do m=0,iproc-1
-     if (m==0) then 
-        i1 = 1
-        i2 = dist(0)
-     else
-        i1 = i2+1
-        i2 = i1+dist(m)-1
-     end if
-
-#ifdef SHM
-     pos = decomp%y2disp_o(m) + 1
-#else
-#ifdef EVEN
-     pos = m * decomp%y2count + 1
-#else
-     pos = decomp%y2disp(m) + 1
-#endif
-#endif
-
-     do k=1,n3
-        do j=i1,i2
-           do i=1,n1
-              out(pos) = in(i,j,k)
-              pos = pos + 1
-           end do
-        end do
-     end do
-  end do
-
-  return
-end subroutine mem_split_yz_real
-
-
-subroutine mem_split_yz_complex(in,n1,n2,n3,out,iproc,dist,decomp)
-
-  implicit none
-
-  integer, intent(IN) :: n1,n2,n3
-  complex(mytype), dimension(n1,n2,n3), intent(IN) :: in
-  complex(mytype), dimension(*), intent(OUT) :: out
-  integer, intent(IN) :: iproc
-  integer, dimension(0:iproc-1), intent(IN) :: dist
-  TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-  integer :: i,j,k, m,i1,i2,pos
-
-  do m=0,iproc-1
-     if (m==0) then 
-        i1 = 1
-        i2 = dist(0)
-     else
-        i1 = i2+1
-        i2 = i1+dist(m)-1
-     end if
-
-#ifdef SHM
-     pos = decomp%y2disp_o(m) + 1
-#else
-#ifdef EVEN
-     pos = m * decomp%y2count + 1
-#else
-     pos = decomp%y2disp(m) + 1
-#endif
-#endif
-
-     do k=1,n3
-        do j=i1,i2
-           do i=1,n1
-              out(pos) = in(i,j,k)
-              pos = pos + 1
-           end do
-        end do
-     end do
-  end do
-
-  return
-end subroutine mem_split_yz_complex
-
-
-subroutine mem_merge_yz_real(in,n1,n2,n3,out,iproc,dist,decomp)
-
-  implicit none
-
-  integer, intent(IN) :: n1,n2,n3
-  real(mytype), dimension(*), intent(IN) :: in
-  real(mytype), dimension(n1,n2,n3), intent(OUT) :: out
-  integer, intent(IN) :: iproc
-  integer, dimension(0:iproc-1), intent(IN) :: dist
-  TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-  integer :: i,j,k, m,i1,i2, pos
-
-  do m=0,iproc-1
-     if (m==0) then
-        i1 = 1
-        i2 = dist(0)
-     else
-        i1 = i2+1
-        i2 = i1+dist(m)-1
-     end if
-
-#ifdef SHM
-     pos = decomp%z2disp_o(m) + 1
-#else
-#ifdef EVEN
-     pos = m * decomp%z2count + 1
-#else
-     pos = decomp%z2disp(m) + 1
-#endif
-#endif
-
-     do k=i1,i2
-        do j=1,n2
-           do i=1,n1
-              out(i,j,k) = in(pos)
-              pos = pos + 1
-           end do
-        end do
-     end do
-  end do
-
-  return
-end subroutine mem_merge_yz_real
-
-
-subroutine mem_merge_yz_complex(in,n1,n2,n3,out,iproc,dist,decomp)
-
-  implicit none
-
-  integer, intent(IN) :: n1,n2,n3
-  complex(mytype), dimension(*), intent(IN) :: in
-  complex(mytype), dimension(n1,n2,n3), intent(OUT) :: out
-  integer, intent(IN) :: iproc
-  integer, dimension(0:iproc-1), intent(IN) :: dist
-  TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-  integer :: i,j,k, m,i1,i2, pos
-
-  do m=0,iproc-1
-     if (m==0) then
-        i1 = 1
-        i2 = dist(0)
-     else
-        i1 = i2+1
-        i2 = i1+dist(m)-1
-     end if
-
-#ifdef SHM
-     pos = decomp%z2disp_o(m) + 1
-#else
-#ifdef EVEN
-     pos = m * decomp%z2count + 1
-#else
-     pos = decomp%z2disp(m) + 1
-#endif
-#endif
-
-     do k=i1,i2
-        do j=1,n2
-           do i=1,n1
-              out(i,j,k) = in(pos)
-              pos = pos + 1
-           end do
-        end do
-     end do
-  end do
-
-  return
-end subroutine mem_merge_yz_complex
diff --git a/decomp2d/transpose_z_to_y.inc b/decomp2d/transpose_z_to_y.inc
deleted file mode 100644
index 22c6f1109..000000000
--- a/decomp2d/transpose_z_to_y.inc
+++ /dev/null
@@ -1,524 +0,0 @@
-! -*- mode: f90 -*-
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This file contains the routines that transpose data from Z to Y pencil
-
-subroutine transpose_z_to_y_real(src, dst, opt_decomp)
-
-  implicit none
-
-  real(mytype), dimension(:,:,:), intent(IN) :: src
-  real(mytype), dimension(:,:,:), intent(OUT) :: dst
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-#ifdef SHM
-  real(mytype) :: work1(*), work2(*)
-  POINTER  (work1_p, work1), (work2_p, work2)  ! Cray pointers
-#endif
-
-  integer :: s1,s2,s3,d1,d2,d3
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  s1 = SIZE(src,1)
-  s2 = SIZE(src,2)
-  s3 = SIZE(src,3)
-  d1 = SIZE(dst,1)
-  d2 = SIZE(dst,2)
-  d3 = SIZE(dst,3)
-
-  ! rearrange source array as send buffer
-#ifdef SHM
-  work1_p = decomp%ROW_INFO%SND_P
-  call mem_split_zy_real(src, s1, s2, s3, work1, dims(2), &
-       decomp%z2dist, decomp)
-#else
-#ifdef EVEN
-  if (.not. decomp%even) then
-     call mem_split_zy_real(src, s1, s2, s3, work1_r, dims(2), &
-          decomp%z2dist, decomp)
-  end if
-#else
-  ! note the src array is suitable to be a send buffer
-  ! so no split operation needed
-#endif
-#endif
-
-  ! define receive buffer
-#ifdef SHM
-  work2_p = decomp%ROW_INFO%RCV_P
-  call MPI_BARRIER(decomp%ROW_INFO%CORE_COMM, ierror)
-#endif
-
-#ifdef SHM
-  if (decomp%ROW_INFO%CORE_ME==1) THEN
-     call MPI_ALLTOALLV(work1, decomp%z2cnts_s, decomp%z2disp_s, &
-          real_type, work2, decomp%y2cnts_s, decomp%y2disp_s, &
-          real_type, decomp%ROW_INFO%SMP_COMM, ierror)
-  end if
-#else
-#ifdef EVEN
-  if (decomp%even) then
-     call MPI_ALLTOALL(src, decomp%z2count, &
-          real_type, work2_r, decomp%y2count, &
-          real_type, DECOMP_2D_COMM_ROW, ierror)
-  else
-     call MPI_ALLTOALL(work1_r, decomp%z2count, &
-          real_type, work2_r, decomp%y2count, &
-          real_type, DECOMP_2D_COMM_ROW, ierror)
-  end if
-#else
-  call MPI_ALLTOALLV(src, decomp%z2cnts, decomp%z2disp, &
-       real_type, work2_r, decomp%y2cnts, decomp%y2disp, &
-       real_type, DECOMP_2D_COMM_ROW, ierror)
-#endif
-#endif
-
-  ! rearrange receive buffer
-#ifdef SHM
-  call MPI_BARRIER(decomp%ROW_INFO%CORE_COMM, ierror)
-  call mem_merge_zy_real(work2, d1, d2, d3, dst, dims(2), &
-       decomp%y2dist, decomp)
-#else
-  call mem_merge_zy_real(work2_r, d1, d2, d3, dst, dims(2), &
-       decomp%y2dist, decomp)
-#endif
-
-  return
-end subroutine transpose_z_to_y_real
-
-
-#ifdef OCC
-subroutine transpose_z_to_y_real_start(handle, src, dst, sbuf, rbuf, &
-     opt_decomp)
-
-  implicit none
-
-  integer :: handle
-  real(mytype), dimension(:,:,:) :: src, dst, sbuf, rbuf
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  sbuf = src
-
-#ifdef EVEN
-  call NBC_IALLTOALL(sbuf, decomp%z2count, real_type, &
-       rbuf, decomp%y2count, real_type, &
-       DECOMP_2D_COMM_ROW, handle, ierror)
-#else
-  call NBC_IALLTOALLV(sbuf, decomp%z2cnts, decomp%z2disp, real_type, &
-       rbuf, decomp%y2cnts, decomp%y2disp, real_type, &
-       DECOMP_2D_COMM_ROW, handle, ierror)
-#endif
-
-  return
-end subroutine transpose_z_to_y_real_start
-
-
-subroutine transpose_z_to_y_real_wait(handle, src, dst, sbuf, rbuf, &
-     opt_decomp)
-
-  implicit none
-
-  integer :: handle
-  real(mytype), dimension(:,:,:) :: src, dst, sbuf, rbuf
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-  integer :: d1,d2,d3
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  d1 = SIZE(dst,1)
-  d2 = SIZE(dst,2)
-  d3 = SIZE(dst,3)
-
-  call NBC_WAIT(handle, ierror)
-
-  ! rearrange receive buffer
-  call mem_merge_zy_real(rbuf, d1, d2, d3, dst, dims(2), &
-       decomp%y2dist, decomp)
-
-  return
-end subroutine transpose_z_to_y_real_wait
-#endif
-
-
-subroutine transpose_z_to_y_complex(src, dst, opt_decomp)
-
-  implicit none
-
-  complex(mytype), dimension(:,:,:), intent(IN) :: src
-  complex(mytype), dimension(:,:,:), intent(OUT) :: dst
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-#ifdef SHM
-  complex(mytype) :: work1(*), work2(*)
-  POINTER  (work1_p, work1), (work2_p, work2)  ! Cray pointers
-#endif
-
-  integer :: s1,s2,s3,d1,d2,d3
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  s1 = SIZE(src,1)
-  s2 = SIZE(src,2)
-  s3 = SIZE(src,3)
-  d1 = SIZE(dst,1)
-  d2 = SIZE(dst,2)
-  d3 = SIZE(dst,3)
-
-  ! rearrange source array as send buffer
-#ifdef SHM
-  work1_p = decomp%ROW_INFO%SND_P_c
-  call mem_split_zy_complex(src, s1, s2, s3, work1, dims(2), &
-       decomp%z2dist, decomp)
-#else
-#ifdef EVEN
-  if (.not. decomp%even) then
-     call mem_split_zy_complex(src, s1, s2, s3, work1_c, dims(2), &
-          decomp%z2dist, decomp)
-  end if
-#else
-  ! note the src array is suitable to be a send buffer
-  ! so no split operation needed
-#endif
-#endif
-
-  ! define receive buffer
-#ifdef SHM
-  work2_p = decomp%ROW_INFO%RCV_P_c
-  call MPI_BARRIER(decomp%ROW_INFO%CORE_COMM, ierror)
-#endif
-
-#ifdef SHM
-  if (decomp%ROW_INFO%CORE_ME==1) THEN
-     call MPI_ALLTOALLV(work1, decomp%z2cnts_s, decomp%z2disp_s, &
-          complex_type, work2, decomp%y2cnts_s, decomp%y2disp_s, &
-          complex_type, decomp%ROW_INFO%SMP_COMM, ierror)
-  end if
-#else
-#ifdef EVEN
-  if (decomp%even) then
-     call MPI_ALLTOALL(src, decomp%z2count, &
-          complex_type, work2_c, decomp%y2count, &
-          complex_type, DECOMP_2D_COMM_ROW, ierror)
-  else
-     call MPI_ALLTOALL(work1_c, decomp%z2count, &
-          complex_type, work2_c, decomp%y2count, &
-          complex_type, DECOMP_2D_COMM_ROW, ierror)
-  end if
-#else
-  call MPI_ALLTOALLV(src, decomp%z2cnts, decomp%z2disp, &
-       complex_type, work2_c, decomp%y2cnts, decomp%y2disp, &
-       complex_type, DECOMP_2D_COMM_ROW, ierror)
-#endif
-#endif
-
-  ! rearrange receive buffer
-#ifdef SHM
-  call MPI_BARRIER(decomp%ROW_INFO%CORE_COMM, ierror)
-  call mem_merge_zy_complex(work2, d1, d2, d3, dst, dims(2), &
-       decomp%y2dist, decomp)
-#else
-  call mem_merge_zy_complex(work2_c, d1, d2, d3, dst, dims(2), &
-       decomp%y2dist, decomp)
-#endif
-
-  return
-end subroutine transpose_z_to_y_complex
-
-
-#ifdef OCC
-subroutine transpose_z_to_y_complex_start(handle, src, dst, sbuf, &
-     rbuf, opt_decomp)
-
-  implicit none
-
-  integer :: handle
-  complex(mytype), dimension(:,:,:) :: src, dst, sbuf, rbuf
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  sbuf = src
-
-#ifdef EVEN
-  call NBC_IALLTOALL(sbuf, decomp%z2count, &
-       complex_type, rbuf, decomp%y2count, &
-       complex_type, DECOMP_2D_COMM_ROW, handle, ierror)
-#else
-  call NBC_IALLTOALLV(sbuf, decomp%z2cnts, decomp%z2disp, &
-       complex_type, rbuf, decomp%y2cnts, decomp%y2disp, &
-       complex_type, DECOMP_2D_COMM_ROW, handle, ierror)
-#endif
-
-  return
-end subroutine transpose_z_to_y_complex_start
-
-
-subroutine transpose_z_to_y_complex_wait(handle, src, dst, sbuf, &
-     rbuf, opt_decomp)
-
-  implicit none
-
-  integer :: handle
-  complex(mytype), dimension(:,:,:) :: src, dst, sbuf, rbuf
-  TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-  TYPE(DECOMP_INFO) :: decomp
-
-  integer :: d1,d2,d3
-  integer :: ierror
-
-  if (present(opt_decomp)) then
-     decomp = opt_decomp
-  else
-     decomp = decomp_main
-  end if
-
-  d1 = SIZE(dst,1)
-  d2 = SIZE(dst,2)
-  d3 = SIZE(dst,3)
-
-  call NBC_WAIT(handle, ierror)
-
-  ! rearrange receive buffer
-  call mem_merge_zy_complex(rbuf, d1, d2, d3, dst, dims(2), &
-       decomp%y2dist, decomp)
-
-  return
-end subroutine transpose_z_to_y_complex_wait
-#endif
-
-
-! pack/unpack ALLTOALL(V) buffers
-
-subroutine mem_split_zy_real(in,n1,n2,n3,out,iproc,dist,decomp)
-
-  implicit none
-
-  integer, intent(IN) :: n1,n2,n3
-  real(mytype), dimension(n1,n2,n3), intent(IN) :: in
-  real(mytype), dimension(*), intent(OUT) :: out
-  integer, intent(IN) :: iproc
-  integer, dimension(0:iproc-1), intent(IN) :: dist
-  TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-  integer :: i,j,k, m,i1,i2,pos
-
-  do m=0,iproc-1
-     if (m==0) then 
-        i1 = 1
-        i2 = dist(0)
-     else
-        i1 = i2+1
-        i2 = i1+dist(m)-1
-     end if
-
-#ifdef SHM
-     pos = decomp%z2disp_o(m) + 1
-#else
-#ifdef EVEN
-     pos = m * decomp%z2count + 1
-#else
-     pos = decomp%z2disp(m) + 1
-#endif
-#endif
-
-     do k=i1,i2
-        do j=1,n2
-           do i=1,n1
-              out(pos) = in(i,j,k)
-              pos = pos + 1
-           end do
-        end do
-     end do
-  end do
-
-  return
-end subroutine mem_split_zy_real
-
-
-subroutine mem_split_zy_complex(in,n1,n2,n3,out,iproc,dist,decomp)
-
-  implicit none
-
-  integer, intent(IN) :: n1,n2,n3
-  complex(mytype), dimension(n1,n2,n3), intent(IN) :: in
-  complex(mytype), dimension(*), intent(OUT) :: out
-  integer, intent(IN) :: iproc
-  integer, dimension(0:iproc-1), intent(IN) :: dist
-  TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-  integer :: i,j,k, m,i1,i2,pos
-
-  do m=0,iproc-1
-     if (m==0) then 
-        i1 = 1
-        i2 = dist(0)
-     else
-        i1 = i2+1
-        i2 = i1+dist(m)-1
-     end if
-
-#ifdef SHM
-     pos = decomp%z2disp_o(m) + 1
-#else
-#ifdef EVEN
-     pos = m * decomp%z2count + 1
-#else
-     pos = decomp%z2disp(m) + 1
-#endif
-#endif
-
-     do k=i1,i2
-        do j=1,n2
-           do i=1,n1
-              out(pos) = in(i,j,k)
-              pos = pos + 1
-           end do
-        end do
-     end do
-  end do
-
-  return
-end subroutine mem_split_zy_complex
-
-
-subroutine mem_merge_zy_real(in,n1,n2,n3,out,iproc,dist,decomp)
-
-  implicit none
-
-  integer, intent(IN) :: n1,n2,n3
-  real(mytype), dimension(*), intent(IN) :: in
-  real(mytype), dimension(n1,n2,n3), intent(OUT) :: out
-  integer, intent(IN) :: iproc
-  integer, dimension(0:iproc-1), intent(IN) :: dist
-  TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-  integer :: i,j,k, m,i1,i2, pos
-
-  do m=0,iproc-1
-     if (m==0) then
-        i1 = 1
-        i2 = dist(0)
-     else
-        i1 = i2+1
-        i2 = i1+dist(m)-1
-     end if
-
-#ifdef SHM
-     pos = decomp%y2disp_o(m) + 1
-#else
-#ifdef EVEN
-     pos = m * decomp%y2count + 1
-#else
-     pos = decomp%y2disp(m) + 1
-#endif
-#endif
-
-     do k=1,n3
-        do j=i1,i2
-           do i=1,n1
-              out(i,j,k) = in(pos)
-              pos = pos + 1
-           end do
-        end do
-     end do
-  end do
-
-  return
-end subroutine mem_merge_zy_real
-
-
-subroutine mem_merge_zy_complex(in,n1,n2,n3,out,iproc,dist,decomp)
-
-  implicit none
-
-  integer, intent(IN) :: n1,n2,n3
-  complex(mytype), dimension(*), intent(IN) :: in
-  complex(mytype), dimension(n1,n2,n3), intent(OUT) :: out
-  integer, intent(IN) :: iproc
-  integer, dimension(0:iproc-1), intent(IN) :: dist
-  TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-  integer :: i,j,k, m,i1,i2, pos
-
-  do m=0,iproc-1
-     if (m==0) then
-        i1 = 1
-        i2 = dist(0)
-     else
-        i1 = i2+1
-        i2 = i1+dist(m)-1
-     end if
-
-#ifdef SHM
-     pos = decomp%y2disp_o(m) + 1
-#else
-#ifdef EVEN
-     pos = m * decomp%y2count + 1
-#else
-     pos = decomp%y2disp(m) + 1
-#endif
-#endif
-
-     do k=1,n3
-        do j=i1,i2
-           do i=1,n1
-              out(i,j,k) = in(pos)
-              pos = pos + 1
-           end do
-        end do
-     end do
-  end do
-
-  return
-end subroutine mem_merge_zy_complex
diff --git a/examples/ABL-old/CMakeLists.txt b/examples/ABL-old/CMakeLists.txt
index 468ef0b8a..f9951244e 100644
--- a/examples/ABL-old/CMakeLists.txt
+++ b/examples/ABL-old/CMakeLists.txt
@@ -1,45 +1,50 @@
 # ABL Convective
-install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/ABL-Convective-old)
-install(FILES input_convective_test.i3d DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/ABL-Convective-old)
-if(${USE_ADIOS2})
-  install(FILES ../adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/ABL-Convective-old)
+set(case "ABL-Old-Convective")
+set(case_dir "${test_dir}/${case}")
+file(MAKE_DIRECTORY ${case_dir})
+set(input_file "input_convective_test.i3d")
+install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES ${input_file} DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+if(ADIOS2_FOUND)
+  install(FILES adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
 endif()
 # If testing active add test for TGV case
-if (${BUILD_TESTING})
-  set(case_dir "${test_dir}/ABL-Convective-old")
-  file(MAKE_DIRECTORY ${case_dir})
-  file(COPY input_convective_test.i3d DESTINATION ${case_dir})
-  if(${USE_ADIOS2})
-    file(COPY ../adios2_config.xml DESTINATION ${case_dir})
-  endif()
+file(COPY ${input_file} DESTINATION ${case_dir})
+if(ADIOS2_FOUND)
+  file(COPY adios2_config.xml DESTINATION ${case_dir})
 endif()
+add_test(NAME ${case} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:xcompact3d> ${input_file} WORKING_DIRECTORY ${case_dir})
+
 # ABL Neutral
-install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/ABL-Neutral-old)
-install(FILES input_neutral_test.i3d DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/ABL-Neutral-old)
-if(${USE_ADIOS2})
-  install(FILES ../adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/ABL-Neutral-old)
+set(case "ABL-Old-Neutral")
+set(case_dir "${test_dir}/${case}")
+file(MAKE_DIRECTORY ${case_dir})
+set(input_file "input_neutral_test.i3d")
+install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES ${input_file} DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+if(ADIOS2_FOUND)
+  install(FILES adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
 endif()
 # If testing active add test for TGV case
-if (${BUILD_TESTING})
-  set(case_dir "${test_dir}/ABL-Neutral-old")
-  file(MAKE_DIRECTORY ${case_dir})
-  file(COPY input_neutral_test.i3d DESTINATION ${case_dir})
-  if(${USE_ADIOS2})
-    file(COPY ../adios2_config.xml DESTINATION ${case_dir})
-  endif()
+file(COPY ${input_file} DESTINATION ${case_dir})
+if(ADIOS2_FOUND)
+  file(COPY adios2_config.xml DESTINATION ${case_dir})
 endif()
+add_test(NAME ${case} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:xcompact3d> ${input_file} WORKING_DIRECTORY ${case_dir})
+
 # ABL Stable
-install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/ABL-Stable-old)
-install(FILES input_stable_test.i3d DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/ABL-Stable-old)
-if(${USE_ADIOS2})
-  install(FILES ../adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/ABL-Stable-old)
+set(case "ABL-Old-Stable")
+set(case_dir "${test_dir}/${case}")
+file(MAKE_DIRECTORY ${case_dir})
+set(input_file "input_stable_test.i3d")
+install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES ${input_file} DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+if(ADIOS2_FOUND)
+  install(FILES adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
 endif()
 # If testing active add test for TGV case
-if (${BUILD_TESTING})
-	set(case_dir "${test_dir}/ABL-Stable-old")
-  file(MAKE_DIRECTORY ${case_dir})
-  file(COPY input_stable_test.i3d DESTINATION ${case_dir})
-  if(${USE_ADIOS2})
-    file(COPY ../adios2_config.xml DESTINATION ${case_dir})
-  endif()
+file(COPY ${input_file} DESTINATION ${case_dir})
+if(ADIOS2_FOUND)
+  file(COPY adios2_config.xml DESTINATION ${case_dir})
 endif()
+add_test(NAME ${case} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:xcompact3d> ${input_file} WORKING_DIRECTORY ${case_dir})
diff --git a/examples/ABL/CMakeLists.txt b/examples/ABL/CMakeLists.txt
index 1f54079ae..65c117e75 100644
--- a/examples/ABL/CMakeLists.txt
+++ b/examples/ABL/CMakeLists.txt
@@ -1,15 +1,17 @@
 # ABL Neutral
-install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/ABL-Neutral)
-install(FILES input_neutral_test.i3d DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/ABL-Neutral)
-if(${USE_ADIOS2})
-  install(FILES ../adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/ABL-Neutral)
+set(case "ABL-Neutral")
+set(case_dir "${test_dir}/${case}")
+file(MAKE_DIRECTORY ${case_dir})
+set(input_file "input_neutral_test.i3d")
+install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES ${input_file} DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+if(ADIOS2_FOUND)
+  install(FILES adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
 endif()
 # If testing active add test for TGV case
-if (${BUILD_TESTING})
-  set(case_dir "${test_dir}/ABL-Neutral")
-  file(MAKE_DIRECTORY ${case_dir})
-  file(COPY input_neutral_test.i3d DESTINATION ${case_dir})
-  if(${USE_ADIOS2})
-    file(COPY ../adios2_config.xml DESTINATION ${case_dir})
-  endif()
+file(COPY ${input_file} DESTINATION ${case_dir})
+if(ADIOS2_FOUND)
+  file(COPY adios2_config.xml DESTINATION ${case_dir})
 endif()
+add_test(NAME ${case} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:xcompact3d> ${input_file} WORKING_DIRECTORY ${case_dir})
+
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 1098b46f8..9aa78f0b3 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,17 +1,25 @@
 install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples)
+
 # if testing active create a working dir for testing
-if (${BUILD_TESTING})
-  file(MAKE_DIRECTORY ${test_dir})
-endif()
-add_subdirectory(ABL-old)
-add_subdirectory(ABL)
-add_subdirectory(Cavity)
-add_subdirectory(Channel-Flow)
-if(NOT ${USE_ADIOS2})
-  add_subdirectory(Cylinder)
-endif()
-add_subdirectory(Lock-exchange)
-add_subdirectory(Mixing-layer)
-add_subdirectory(Taylor-Green-Vortex)
-add_subdirectory(Turbulent-Boundary-Layer)
-add_subdirectory(Wind-Turbine/NREL-5MW_ALM)
+set(test_dir "${PROJECT_BINARY_DIR}/RunTests")
+file(MAKE_DIRECTORY ${test_dir})
+
+# We need to find a way to check if 2DECOMP&FFT is build with ADIOS2
+option(USE_ADIOS2 "Build XCompact with ADIOS2 library" OFF)
+
+# TGV is from tests
+#add_subdirectory(TGV)
+
+if (${BUILD_TESTING_FULL})
+  add_subdirectory(ABL)
+  add_subdirectory(ABL-old)
+  add_subdirectory(Cavity)
+  add_subdirectory(Channel-Flow)
+  if(NOT ${USE_ADIOS2})
+    add_subdirectory(Cylinder)
+  endif()
+  add_subdirectory(Lock-exchange)
+  add_subdirectory(Mixing-layer)
+  add_subdirectory(Turbulent-Boundary-Layer)
+  add_subdirectory(Wind-Turbine/NREL-5MW_ALM)
+endif (${BUILD_TESTING_FULL})
diff --git a/examples/Cavity/CMakeLists.txt b/examples/Cavity/CMakeLists.txt
index 6391c4cb9..ded80e2ba 100644
--- a/examples/Cavity/CMakeLists.txt
+++ b/examples/Cavity/CMakeLists.txt
@@ -1,14 +1,17 @@
-install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Cavity)
-install(FILES input_test.i3d DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Cavity)
-if(${USE_ADIOS2})
-  install(FILES ../adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Cavity)
+# Cavity
+set(case "Cavity")
+set(case_dir "${test_dir}/${case}")
+file(MAKE_DIRECTORY ${case_dir})
+set(input_file "input_test.i3d")
+install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES ${input_file} DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+if(ADIOS2_FOUND)
+  install(FILES adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
 endif()
 # If testing active add test for TGV case
-if (${BUILD_TESTING})
-  set(case_dir "${test_dir}/Cavity")
-  file(MAKE_DIRECTORY ${case_dir})
-  file(COPY input_test.i3d DESTINATION ${case_dir})
-  if(${USE_ADIOS2})
-    file(COPY ../adios2_config.xml DESTINATION ${case_dir})
-  endif()
+file(COPY ${input_file} DESTINATION ${case_dir})
+if(ADIOS2_FOUND)
+  file(COPY adios2_config.xml DESTINATION ${case_dir})
 endif()
+add_test(NAME ${case} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:xcompact3d> ${input_file} WORKING_DIRECTORY ${case_dir})
+
diff --git a/examples/Channel-Flow/CMakeLists.txt b/examples/Channel-Flow/CMakeLists.txt
index 3c0e8d17c..257cfab84 100644
--- a/examples/Channel-Flow/CMakeLists.txt
+++ b/examples/Channel-Flow/CMakeLists.txt
@@ -1,29 +1,33 @@
-install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Channel-Flow-X)
-install(FILES input_test_x.i3d DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Channel-Flow-X)
-if(${USE_ADIOS2})
-  install(FILES ../adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Channel-Flow-X)
+# Channel-Flow-X
+set(case "Channel_Flow-X")
+set(case_dir "${test_dir}/${case}")
+file(MAKE_DIRECTORY ${case_dir})
+set(input_file "input_test_x.i3d")
+install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES ${input_file} DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+if(ADIOS2_FOUND)
+  install(FILES adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
 endif()
 # If testing active add test for TGV case
-if (${BUILD_TESTING})
-  set(case_dir "${test_dir}/Channel-Flow-X")
-  file(MAKE_DIRECTORY ${case_dir})
-  file(COPY input_test_x.i3d DESTINATION ${case_dir})
-  if(${USE_ADIOS2})
-    file(COPY ../adios2_config.xml DESTINATION ${case_dir})
-  endif()
+file(COPY ${input_file} DESTINATION ${case_dir})
+if(ADIOS2_FOUND)
+  file(COPY adios2_config.xml DESTINATION ${case_dir})
 endif()
-# Z dir test
-install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Channel-Flow-Z)
-install(FILES input_test_z.i3d DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Channel-Flow-Z)
-if(${USE_ADIOS2})
-  install(FILES ../adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Channel-Flow-Z)
+add_test(NAME ${case} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:xcompact3d> ${input_file} WORKING_DIRECTORY ${case_dir})
+
+# Channel-Flow-Z
+set(case "Channel_Flow-Z")
+set(case_dir "${test_dir}/${case}")
+file(MAKE_DIRECTORY ${case_dir})
+set(input_file "input_test_z.i3d")
+install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES ${input_file} DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+if(ADIOS2_FOUND)
+  install(FILES adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
 endif()
 # If testing active add test for TGV case
-if (${BUILD_TESTING})
-  set(case_dir "${test_dir}/Channel-Flow-Z")
-  file(MAKE_DIRECTORY ${case_dir})
-  file(COPY input_test_z.i3d DESTINATION ${case_dir})
-  if(${USE_ADIOS2})
-    file(COPY ../adios2_config.xml DESTINATION ${case_dir})
-  endif()
+file(COPY ${input_file} DESTINATION ${case_dir})
+if(ADIOS2_FOUND)
+  file(COPY adios2_config.xml DESTINATION ${case_dir})
 endif()
+add_test(NAME ${case} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:xcompact3d> ${input_file} WORKING_DIRECTORY ${case_dir})
diff --git a/examples/Cylinder/CMakeLists.txt b/examples/Cylinder/CMakeLists.txt
index 771626e28..e6337b055 100644
--- a/examples/Cylinder/CMakeLists.txt
+++ b/examples/Cylinder/CMakeLists.txt
@@ -1,17 +1,33 @@
-install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Cylinder)
-install(FILES input_test.i3d DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Cylinder)
-# If testing active add test for Cylinder case
-if (${BUILD_TESTING})
-  set(case_dir "${test_dir}/Cylinder")
-  file(MAKE_DIRECTORY ${case_dir})
-  file(COPY input_test.i3d DESTINATION ${case_dir})
+# Cylinder
+set(case "Cylinder")
+set(case_dir "${test_dir}/${case}")
+file(MAKE_DIRECTORY ${case_dir})
+set(input_file "input_test.i3d")
+install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES ${input_file} DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+if(ADIOS2_FOUND)
+  install(FILES adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
 endif()
-# Moving Cylinder
-install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/CylinderMoving)
-install(FILES input_test_moving.i3d DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/CylinderMoving)
-# If testing active add test for Cylinder case
-if (${BUILD_TESTING})
-  set(case_dir "${test_dir}/CylinderMoving")
-  file(MAKE_DIRECTORY ${case_dir})
-  file(COPY input_test_moving.i3d DESTINATION ${case_dir})
+# If testing active add test for TGV case
+file(COPY ${input_file} DESTINATION ${case_dir})
+if(ADIOS2_FOUND)
+  file(COPY adios2_config.xml DESTINATION ${case_dir})
 endif()
+add_test(NAME ${case} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:xcompact3d> ${input_file} WORKING_DIRECTORY ${case_dir})
+
+# Cylinder Moving
+set(case "CylinderMoving")
+set(case_dir "${test_dir}/${case}")
+file(MAKE_DIRECTORY ${case_dir})
+set(input_file "input_test_moving.i3d")
+install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES ${input_file} DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+if(ADIOS2_FOUND)
+  install(FILES adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+endif()
+# If testing active add test for TGV case
+file(COPY ${input_file} DESTINATION ${case_dir})
+if(ADIOS2_FOUND)
+  file(COPY adios2_config.xml DESTINATION ${case_dir})
+endif()
+add_test(NAME ${case} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:xcompact3d> ${input_file} WORKING_DIRECTORY ${case_dir})
diff --git a/examples/Lock-exchange/CMakeLists.txt b/examples/Lock-exchange/CMakeLists.txt
index 657e20996..59293ecc2 100644
--- a/examples/Lock-exchange/CMakeLists.txt
+++ b/examples/Lock-exchange/CMakeLists.txt
@@ -1,14 +1,16 @@
-install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Lock-exchange)
-install(FILES input_test.i3d DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Lock-exchange)
-if(${USE_ADIOS2})
-  install(FILES ../adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Lock-exchange)
+# Lock Exchange
+set(case "Lock-exchange")
+set(case_dir "${test_dir}/${case}")
+file(MAKE_DIRECTORY ${case_dir})
+set(input_file "input_test.i3d")
+install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES ${input_file} DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+if(ADIOS2_FOUND)
+  install(FILES adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
 endif()
-# If testing active add test for Lock-exchange case
-if (${BUILD_TESTING})
-  set(case_dir "${test_dir}/Lock-exchange")
-  file(MAKE_DIRECTORY ${case_dir})
-  file(COPY input_test.i3d DESTINATION ${case_dir})
-  if(${USE_ADIOS2})
-    file(COPY ../adios2_config.xml DESTINATION ${case_dir})
-  endif()
+# If testing active add test for TGV case
+file(COPY ${input_file} DESTINATION ${case_dir})
+if(ADIOS2_FOUND)
+  file(COPY adios2_config.xml DESTINATION ${case_dir})
 endif()
+add_test(NAME ${case} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:xcompact3d> ${input_file} WORKING_DIRECTORY ${case_dir})
diff --git a/examples/Mixing-layer/CMakeLists.txt b/examples/Mixing-layer/CMakeLists.txt
index d96ecce08..51ce13ab4 100644
--- a/examples/Mixing-layer/CMakeLists.txt
+++ b/examples/Mixing-layer/CMakeLists.txt
@@ -1,14 +1,16 @@
-install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Mixing-layer)
-install(FILES input_test.i3d DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Mixing-layer)
-if(${USE_ADIOS2})
-  install(FILES ../adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Mixing-layer)
+# ABL Neutral
+set(case "Mixing-layer")
+set(case_dir "${test_dir}/${case}")
+file(MAKE_DIRECTORY ${case_dir})
+set(input_file "input_test.i3d")
+install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES ${input_file} DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+if(ADIOS2_FOUND)
+  install(FILES adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
 endif()
-# If testing active add test for Mixing-layer case
-if (${BUILD_TESTING})
-  set(case_dir "${test_dir}/Mixing-layer")
-  file(MAKE_DIRECTORY ${case_dir})
-  file(COPY input_test.i3d DESTINATION ${case_dir})
-  if(${USE_ADIOS2})
-    file(COPY ../adios2_config.xml DESTINATION ${case_dir})
-  endif()
+# If testing active add test for TGV case
+file(COPY ${input_file} DESTINATION ${case_dir})
+if(ADIOS2_FOUND)
+  file(COPY adios2_config.xml DESTINATION ${case_dir})
 endif()
+add_test(NAME ${case} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:xcompact3d> ${input_file} WORKING_DIRECTORY ${case_dir})
diff --git a/examples/TGV/CMakeLists.txt b/examples/TGV/CMakeLists.txt
new file mode 100644
index 000000000..28fe43519
--- /dev/null
+++ b/examples/TGV/CMakeLists.txt
@@ -0,0 +1,16 @@
+# TGV
+set(case "TGV")
+set(case_dir "${test_dir}/${case}")
+file(MAKE_DIRECTORY ${case_dir})
+set(input_file "input_test.i3d")
+install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES ${input_file} DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+if(ADIOS2_FOUND)
+  install(FILES adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+endif()
+# If testing active add test for TGV case
+file(COPY ${input_file} DESTINATION ${case_dir})
+if(ADIOS2_FOUND)
+  file(COPY adios2_config.xml DESTINATION ${case_dir})
+endif()
+add_test(NAME ${case} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:xcompact3d> ${input_file} WORKING_DIRECTORY ${case_dir})
diff --git a/examples/Taylor-Green-Vortex/TGV_Re1600.dat b/examples/TGV/TGV_Re1600.dat
similarity index 100%
rename from examples/Taylor-Green-Vortex/TGV_Re1600.dat
rename to examples/TGV/TGV_Re1600.dat
diff --git a/examples/Taylor-Green-Vortex/TGV_Re5000.dat b/examples/TGV/TGV_Re5000.dat
similarity index 100%
rename from examples/Taylor-Green-Vortex/TGV_Re5000.dat
rename to examples/TGV/TGV_Re5000.dat
diff --git a/examples/Taylor-Green-Vortex/adios2_config.xml b/examples/TGV/adios2_config.xml
similarity index 100%
rename from examples/Taylor-Green-Vortex/adios2_config.xml
rename to examples/TGV/adios2_config.xml
diff --git a/examples/Taylor-Green-Vortex/input.i3d b/examples/TGV/input.i3d
similarity index 100%
rename from examples/Taylor-Green-Vortex/input.i3d
rename to examples/TGV/input.i3d
diff --git a/examples/Taylor-Green-Vortex/input_2D.i3d b/examples/TGV/input_2D.i3d
similarity index 100%
rename from examples/Taylor-Green-Vortex/input_2D.i3d
rename to examples/TGV/input_2D.i3d
diff --git a/examples/Taylor-Green-Vortex/input_DNS_Re1600.i3d b/examples/TGV/input_DNS_Re1600.i3d
similarity index 100%
rename from examples/Taylor-Green-Vortex/input_DNS_Re1600.i3d
rename to examples/TGV/input_DNS_Re1600.i3d
diff --git a/examples/Taylor-Green-Vortex/input_ILES_Re5000.i3d b/examples/TGV/input_ILES_Re5000.i3d
similarity index 100%
rename from examples/Taylor-Green-Vortex/input_ILES_Re5000.i3d
rename to examples/TGV/input_ILES_Re5000.i3d
diff --git a/examples/Taylor-Green-Vortex/input_test.i3d b/examples/TGV/input_test.i3d
similarity index 100%
rename from examples/Taylor-Green-Vortex/input_test.i3d
rename to examples/TGV/input_test.i3d
diff --git a/examples/Taylor-Green-Vortex/CMakeLists.txt b/examples/Taylor-Green-Vortex/CMakeLists.txt
deleted file mode 100644
index b03c58dea..000000000
--- a/examples/Taylor-Green-Vortex/CMakeLists.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Taylor-Green-Vortex)
-install(FILES input_test.i3d DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Taylor-Green-Vortex)
-if(${USE_ADIOS2})
-  install(FILES ../adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Taylor-Green-Vortex)
-endif()
-# If testing active add test for TGV case
-if (${BUILD_TESTING})
-  set(case_dir "${test_dir}/TGV")
-  file(MAKE_DIRECTORY ${case_dir})
-  file(COPY input_test.i3d DESTINATION ${case_dir})
-  if(${USE_ADIOS2})
-    file(COPY ../adios2_config.xml DESTINATION ${case_dir})
-  endif()
-endif()
diff --git a/examples/Turbulent-Boundary-Layer/CMakeLists.txt b/examples/Turbulent-Boundary-Layer/CMakeLists.txt
index ca86c9fef..2ab207481 100644
--- a/examples/Turbulent-Boundary-Layer/CMakeLists.txt
+++ b/examples/Turbulent-Boundary-Layer/CMakeLists.txt
@@ -1,14 +1,16 @@
-install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/TBL)
-install(FILES input_test.i3d DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/TBL)
-if(${USE_ADIOS2})
-  install(FILES ../adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/TBL)
+# ABL Neutral
+set(case "TBL")
+set(case_dir "${test_dir}/${case}")
+file(MAKE_DIRECTORY ${case_dir})
+set(input_file "input_test.i3d")
+install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES ${input_file} DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+if(ADIOS2_FOUND)
+  install(FILES adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
 endif()
-# If testing active add test for Cylinder case
-if (${BUILD_TESTING})
-  set(case_dir "${test_dir}/TBL")
-  file(MAKE_DIRECTORY ${case_dir})
-  file(COPY input_test.i3d DESTINATION ${case_dir})
-  if(${USE_ADIOS2})
-    file(COPY ../adios2_config.xml DESTINATION ${case_dir})
-  endif()
+# If testing active add test for TGV case
+file(COPY ${input_file} DESTINATION ${case_dir})
+if(ADIOS2_FOUND)
+  file(COPY adios2_config.xml DESTINATION ${case_dir})
 endif()
+add_test(NAME ${case} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:xcompact3d> ${input_file} WORKING_DIRECTORY ${case_dir})
diff --git a/examples/Wind-Turbine/NREL-5MW_ALM/CMakeLists.txt b/examples/Wind-Turbine/NREL-5MW_ALM/CMakeLists.txt
index de56be3b1..6dc8cddda 100644
--- a/examples/Wind-Turbine/NREL-5MW_ALM/CMakeLists.txt
+++ b/examples/Wind-Turbine/NREL-5MW_ALM/CMakeLists.txt
@@ -1,17 +1,19 @@
-install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Wind-Turbine)
-install(FILES input_test.i3d DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Wind-Turbine)
-install(FILES input_test.turb DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Wind-Turbine)
-if(${USE_ADIOS2})
-  install(FILES ../../adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/Wind-Turbine)
+# ABL Neutral
+set(case "Wind-Turbine")
+set(case_dir "${test_dir}/${case}")
+file(MAKE_DIRECTORY ${case_dir})
+set(input_file "input_test.i3d")
+install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES ${input_file} DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES input_test.turb DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+if(ADIOS2_FOUND)
+  install(FILES adios2_config.xml DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
 endif()
-# If testing active add test for Wind-Turbine case
-if (${BUILD_TESTING})
-  set(case_dir "${test_dir}/Wind-Turbine")
-  file(MAKE_DIRECTORY ${case_dir})
-  file(COPY input_test.i3d DESTINATION ${case_dir})
-  file(COPY input_test.turb DESTINATION ${case_dir})
-  file(COPY NRELResources DESTINATION ${case_dir})
-  if(${USE_ADIOS2})
-    file(COPY ../../adios2_config.xml DESTINATION ${case_dir})
-  endif()
+# If testing active add test for TGV case
+file(COPY ${input_file} DESTINATION ${case_dir})
+file(COPY input_test.turb DESTINATION ${case_dir})
+file(COPY NRELResources DESTINATION ${case_dir})
+if(ADIOS2_FOUND)
+  file(COPY adios2_config.xml DESTINATION ${case_dir})
 endif()
+add_test(NAME ${case} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:xcompact3d> ${input_file} WORKING_DIRECTORY ${case_dir})
diff --git a/src/BC-ABL.f90 b/src/BC-ABL.f90
index 771d3c8e1..906111236 100644
--- a/src/BC-ABL.f90
+++ b/src/BC-ABL.f90
@@ -4,6 +4,10 @@
 
 module abl
 
+use decomp_2d_constants
+use decomp_2d_mpi
+use decomp_2d
+
 contains
 
   !*******************************************************************************
@@ -12,7 +16,6 @@ subroutine init_abl(ux1,uy1,uz1,ep1,phi1)
   !
   !*******************************************************************************
 
-    use decomp_2d
     use decomp_2d_io
     use variables
     use param
@@ -144,7 +147,6 @@ subroutine boundary_conditions_abl(ux,uy,uz,phi)
 
     USE param
     USE variables
-    USE decomp_2d
 
     implicit none
 
@@ -185,7 +187,6 @@ subroutine inflow (ux,uy,uz,phi)
   
     USE param
     USE variables
-    USE decomp_2d
     USE MPI
     USE var, only: ux_inflow, uy_inflow, uz_inflow
   
@@ -251,7 +252,6 @@ subroutine outflow (ux,uy,uz,phi)
 
     USE param
     USE variables
-    USE decomp_2d
     USE MPI
 
     implicit none
@@ -298,7 +298,6 @@ subroutine momentum_forcing_abl(dux1,duy1,duz1,ux1,uy1,uz1,phi1)
 
     USE param
     USE variables
-    USE decomp_2d
 
     implicit none
 
@@ -350,7 +349,6 @@ subroutine scalar_forcing_abl(uy1,dphi1,phi1)
 
     USE param
     USE variables
-    USE decomp_2d
 
     implicit none
 
@@ -379,7 +377,6 @@ subroutine wall_sgs_slip(ux,uy,uz,phi,nut1,wallfluxx,wallfluxy,wallfluxz)
   !*******************************************************************************
 
     use MPI
-    use decomp_2d
     use param
     use variables
     use var, only: uxf1, uzf1, phif1, uxf3, uzf3, phif3
@@ -655,7 +652,6 @@ subroutine wall_sgs_slip_scalar(sgsphi1,nut1,dphidy1)
   !
   !*******************************************************************************
 
-    use decomp_2d
     use param
     use var, only: heatflux
     use variables
@@ -692,7 +688,6 @@ subroutine wall_sgs_noslip(ux1,uy1,uz1,nut1,wallsgsx1,wallsgsy1,wallsgsz1)
   !*******************************************************************************
 
     use MPI
-    use decomp_2d
     use param
     use variables
     use var, only: di1, di2, di3
@@ -820,7 +815,7 @@ subroutine wall_sgs_noslip(ux1,uy1,uz1,nut1,wallsgsx1,wallsgsy1,wallsgsz1)
     if (mod(itime,ilist)==0.and.nrank==0) then
        ! Write u_shear in file
        write(42,'(20e20.12)') t,u_shear
-       call flush(42)
+       flush(42)
        ! Print in terminal
        write(*,*)  ' ABL:'
        write(*,*)  ' Horizontally-averaged velocity at 5*dy: ', ux_HAve,uz_HAve
@@ -836,7 +831,6 @@ subroutine forceabl (ux) ! Routine to force constant flow rate
   !
   !*******************************************************************************
 
-    use decomp_2d
     use decomp_2d_poisson
     use param
     use var
@@ -899,7 +893,6 @@ subroutine fringe_region (ux,uy,uz)
   !
   !*******************************************************************************
 
-    USE decomp_2d
     USE param
     USE var
 
@@ -1000,7 +993,6 @@ subroutine damping_zone (dux1,duy1,duz1,ux1,uy1,uz1) ! Damping zone for ABL
   !
   !*******************************************************************************
 
-    use decomp_2d
     use param
     use var, only: yp
     use dbg_schemes, only: sin_prec, cos_prec, log_prec
@@ -1064,7 +1056,6 @@ subroutine damping_zone_scalar (dphi1,phi1)
   !
   !*******************************************************************************
 
-    use decomp_2d
     use param
     use var, only: yp
     use dbg_schemes, only: sin_prec
@@ -1103,7 +1094,6 @@ subroutine boundary_height(ux,uy,uz,hBL) ! routine to find the height of the bou
   !
   !*******************************************************************************
 
-    use decomp_2d
     use MPI
     use param
     use variables
@@ -1194,7 +1184,6 @@ subroutine postprocess_abl(ux1, uy1, uz1, ep1)
   !*******************************************************************************
 
     USE MPI
-    USE decomp_2d
     USE decomp_2d_io
     USE var, only : umean,vmean,wmean,uumean,vvmean,wwmean,uvmean,uwmean,vwmean,tmean
     USE var, only : uvisu
diff --git a/src/BC-Channel-flow.f90 b/src/BC-Channel-flow.f90
index c010644a9..69800059b 100644
--- a/src/BC-Channel-flow.f90
+++ b/src/BC-Channel-flow.f90
@@ -4,6 +4,8 @@
 
 module channel
 
+  use decomp_2d_constants
+  use decomp_2d_mpi
   use decomp_2d
   use variables
   use param
@@ -23,7 +25,6 @@ module channel
   !############################################################################
   subroutine init_channel (ux1,uy1,uz1,ep1,phi1)
 
-    use decomp_2d
     use decomp_2d_io
     use variables
     use param
@@ -153,7 +154,6 @@ subroutine boundary_conditions_channel (ux,uy,uz,phi)
     use param
     use var, only : di2
     use variables
-    use decomp_2d
 
     implicit none
 
@@ -256,7 +256,6 @@ subroutine postprocess_channel(ux1,uy1,uz1,pp3,phi1,ep1)
   end subroutine postprocess_channel
   subroutine visu_channel_init(visu_initialised)
 
-    use decomp_2d, only : mytype
     use decomp_2d_io, only : decomp_2d_register_variable
     use visu, only : io_name, output2D
     
@@ -379,7 +378,6 @@ end subroutine momentum_forcing_channel
   !############################################################################
   subroutine geomcomplex_channel(epsi,nxi,nxf,ny,nyi,nyf,nzi,nzf,yp,remp)
 
-    use decomp_2d, only : mytype
     use param, only : zero, one, two, ten
     use ibm
 
diff --git a/src/BC-Cylinder.f90 b/src/BC-Cylinder.f90
index 749a73f3d..1e01e4b1b 100644
--- a/src/BC-Cylinder.f90
+++ b/src/BC-Cylinder.f90
@@ -4,6 +4,8 @@
 
 module cyl
 
+  USE decomp_2d_constants
+  USE decomp_2d_mpi
   USE decomp_2d
   USE variables
   USE param
@@ -22,7 +24,6 @@ module cyl
 
   subroutine geomcomplex_cyl(epsi,nxi,nxf,ny,nyi,nyf,nzi,nzf,dx,yp,remp)
 
-    use decomp_2d, only : mytype
     use param, only : one, two, ten
     use ibm_param
     use dbg_schemes, only: sqrt_prec
@@ -85,7 +86,6 @@ subroutine boundary_conditions_cyl (ux,uy,uz,phi)
 
     USE param
     USE variables
-    USE decomp_2d
 
     implicit none
 
@@ -102,7 +102,6 @@ subroutine inflow (phi)
 
     USE param
     USE variables
-    USE decomp_2d
     USE ibm_param
 
     implicit none
@@ -138,7 +137,6 @@ subroutine outflow (ux,uy,uz,phi)
 
     USE param
     USE variables
-    USE decomp_2d
     USE MPI
     USE ibm_param
 
@@ -207,7 +205,6 @@ end subroutine outflow
   !********************************************************************
   subroutine init_cyl (ux1,uy1,uz1,phi1)
 
-    USE decomp_2d
     USE decomp_2d_io
     USE variables
     USE param
@@ -288,7 +285,6 @@ end subroutine init_cyl
   subroutine postprocess_cyl(ux1,uy1,uz1,ep1)
 
     USE MPI
-    USE decomp_2d
     USE decomp_2d_io
     USE var, only : uvisu
     USE var, only : ta1,tb1,tc1,td1,te1,tf1,tg1,th1,ti1,di1
@@ -302,7 +298,6 @@ end subroutine postprocess_cyl
 
   subroutine visu_cyl_init (visu_initialised)
 
-    use decomp_2d, only : mytype
     use decomp_2d_io, only : decomp_2d_register_variable
     use visu, only : io_name, output2D
     
diff --git a/src/BC-Lock-exchange.f90 b/src/BC-Lock-exchange.f90
index 39684385b..6398b9362 100644
--- a/src/BC-Lock-exchange.f90
+++ b/src/BC-Lock-exchange.f90
@@ -4,11 +4,12 @@
 
 module lockexch
 
-  use decomp_2d, only : mytype, real_type, real2_type
+  use decomp_2d_constants, only : mytype, real_type, real2_type
   use decomp_2d, only : xsize, ysize, zsize
   use decomp_2d, only : xstart, ystart, zstart
   use decomp_2d, only : xend, yend, zend
   use decomp_2d, only : transpose_x_to_y, transpose_y_to_z, transpose_z_to_y, transpose_y_to_x
+  use decomp_2d, only : xszV, alloc_x, fine_to_coarsev
 
   use variables, only : numscalar
 
@@ -50,7 +51,6 @@ subroutine boundary_conditions_lockexch (rho1, phi1)
 
     USE param
     USE variables
-    USE decomp_2d
     USE MPI
 
     implicit none
@@ -98,7 +98,6 @@ end subroutine boundary_conditions_lockexch
 
   subroutine init_lockexch (rho1,ux1,uy1,uz1,ep1,phi1)
 
-    USE decomp_2d
     USE decomp_2d_io
     USE variables
     USE param
@@ -246,7 +245,6 @@ end subroutine init_lockexch
 
   subroutine visu_lockexch_init(visu_initialised)
 
-    use decomp_2d, only : mytype
     use decomp_2d_io, only : decomp_2d_register_variable
     
     implicit none
@@ -262,8 +260,6 @@ end subroutine visu_lockexch_init
 
   subroutine postprocess_lockexch(rho1,ux1,uy1,uz1,phi1,ep1) !By Felipe Schuch
 
-    use decomp_2d, only : alloc_x
-
     use var, only : phi2, rho2
     use var, only : phi3, rho3
     use tools, only : mean_plane_z
@@ -377,7 +373,6 @@ end subroutine postprocess_lockexch
 
   subroutine budget(rho1,ux1,uy1,uz1,phi1,vol1)
 
-    USE decomp_2d
     USE decomp_2d_io
     USE MPI
 
@@ -419,8 +414,8 @@ subroutine budget(rho1,ux1,uy1,uz1,phi1,vol1)
 
     real(mytype),dimension(xszV(1),xszV(2),xszV(3)) :: uvisu
 
-    real(8) :: ek,ek1,dek,dek1,ep,ep1,dep,dep1,xvol
-    integer :: ijk,i,j,k,l,m,is,code
+    real(mytype) :: ek,ek1,dek,dek1,ep,ep1,dep,dep1,xvol
+    integer :: i,j,k,l,m,is,code
     character(len=30) :: filename
 
     real(mytype) :: visc
@@ -487,10 +482,14 @@ subroutine budget(rho1,ux1,uy1,uz1,phi1,vol1)
        enddo
     enddo
 
-    do ijk=1,xsize(1)*xsize(2)*xsize(3)
-       xvol=real(vol1(ijk,1,1),8)
-       ek = ek + half * xvol * rho1(ijk,1,1,1) * (ux1(ijk,1,1)**2+uy1(ijk,1,1)**2+uz1(ijk,1,1)**2)
-       dek = dek + xvol * diss1(ijk,1,1)
+    do k=1,xsize(3)
+       do j=1,xsize(2)
+          do i=1,xsize(1)
+             xvol=real(vol1(i,j,k), mytype)
+             ek = ek + half * xvol * rho1(i,j,k,1) * (ux1(i,j,k)**2+uy1(i,j,k)**2+uz1(i,j,k)**2)
+             dek = dek + xvol * diss1(i,j,k)
+          enddo
+       enddo
     enddo
 
     call transpose_x_to_y(vol1,vol2)
@@ -524,7 +523,7 @@ subroutine budget(rho1,ux1,uy1,uz1,phi1,vol1)
           do j=1,ysize(2)
              y = (j + ystart(2) - 2) * dy
              do i=1,ysize(1)
-                xvol=real(vol2(i,j,k),8)
+                xvol=real(vol2(i,j,k),mytype)
                 ep = ep - xvol * ri(is) * phi2(i,j,k,is) * (gravy * y)
                 dep = dep &
                      - xvol * ri(is) * (ddphi2(i,j,k)*xnu/sc(is) &
@@ -552,7 +551,7 @@ subroutine budget(rho1,ux1,uy1,uz1,phi1,vol1)
           do j = 1, ysize(2)
              y = (j + ystart(2) - 2) * dy
              do i = 1, ysize(1)
-                xvol = real(vol2(i, j, k), 8)
+                xvol = real(vol2(i, j, k), mytype)
                 ep = ep - xvol * (one / Fr**2) * rho2(i, j, k) * (gravy * y)
                 dep = dep - xvol * ((xnu / prandtl / (Fr**2)) &
                      * (ta2(i, j, k) + tb2(i, j, k) + tc2(i, j, k))) &
diff --git a/src/BC-Mixing-layer.f90 b/src/BC-Mixing-layer.f90
index a0a035313..90066fcb0 100644
--- a/src/BC-Mixing-layer.f90
+++ b/src/BC-Mixing-layer.f90
@@ -4,6 +4,8 @@
 
 module mixlayer
 
+  USE decomp_2d_constants
+  USE decomp_2d_mpi
   USE decomp_2d
   USE variables
   USE param
@@ -17,7 +19,6 @@ module mixlayer
 
   subroutine init_mixlayer (rho1,ux1,uy1,uz1)
 
-    use decomp_2d, only : mytype, xsize
     use param, only : u1, u2, dens1, dens2
     use param, only : half, one, two, four, eight, sixteen
     use param, only : ntime, nrhotime
diff --git a/src/BC-Periodic-hill.f90 b/src/BC-Periodic-hill.f90
index b320395fa..96f447b48 100644
--- a/src/BC-Periodic-hill.f90
+++ b/src/BC-Periodic-hill.f90
@@ -16,6 +16,8 @@
 
 module hill
 
+  USE decomp_2d_constants
+  USE decomp_2d_mpi
   USE decomp_2d
   USE variables
   USE param
@@ -36,7 +38,6 @@ module hill
   subroutine geomcomplex_hill(epsi,nxi,nxf,ny,nyi,nyf,nzi,nzf,dx,yp,dz,remp)
 !############################################################################
 
-    use decomp_2d, only : mytype
     use param, only : zero, one, two, three, nine, fourteen, twenty, twentyeight
     use ibm
 
@@ -115,7 +116,6 @@ subroutine boundary_conditions_hill (ux,uy,uz,phi,ep1)
 
     USE param
     USE variables
-    USE decomp_2d
 
     implicit none
 
@@ -136,7 +136,6 @@ end subroutine boundary_conditions_hill
   subroutine init_hill (ux1,uy1,uz1,ep1,phi1)
 !############################################################################
 
-    USE decomp_2d
     USE decomp_2d_io
     USE variables
     USE param
@@ -225,7 +224,6 @@ end subroutine init_post
   subroutine hill_flrt (ux,constant)
 !############################################################################
 
-    USE decomp_2d
     USE decomp_2d_poisson
     USE variables
     USE param
@@ -286,7 +284,6 @@ subroutine postprocess_hill(ux1,uy1,uz1,pp3,phi1,ep1)
   end subroutine postprocess_hill
   subroutine visu_hill_init(visu_initialised)
 
-    use decomp_2d, only : mytype
     use decomp_2d_io, only : decomp_2d_register_variable
     use visu, only : io_name, output2D
     
diff --git a/src/BC-Pipe-flow.f90 b/src/BC-Pipe-flow.f90
index fc1187d3c..4706935b9 100644
--- a/src/BC-Pipe-flow.f90
+++ b/src/BC-Pipe-flow.f90
@@ -4,6 +4,8 @@
 
 module pipe
 
+  use decomp_2d_constants
+  use decomp_2d_mpi
   use decomp_2d
   use variables
   use param
@@ -33,7 +35,6 @@ subroutine geomcomplex_pipe(epsi,nxi,nxf,ny,nyi,nyf,nzi,nzf,dx,yp,dz,remp)
   !
   !********************************************************************
 
-    use decomp_2d,only : mytype
     use MPI
     use param,only : zero,one,two,yly,zlz
     use ibm_param
@@ -100,7 +101,6 @@ subroutine init_pipe (ux1,uy1,uz1,ep1,phi1)
   !
   !********************************************************************
 
-    use decomp_2d
     use decomp_2d_io
     use variables
     use param
@@ -193,7 +193,6 @@ subroutine boundary_conditions_pipe (ux,uy,uz,phi)
 
     use param
     use variables
-    use decomp_2d
 
     implicit none
 
diff --git a/src/BC-Sandbox.f90 b/src/BC-Sandbox.f90
index 917d424b5..ffb27f81a 100644
--- a/src/BC-Sandbox.f90
+++ b/src/BC-Sandbox.f90
@@ -20,12 +20,12 @@
 
 module sandbox
 
-  use decomp_2d, only : mytype, real_type, real2_type
+  use decomp_2d_constants, only : mytype, real_type, real2_type
   use decomp_2d, only : xsize, ysize, zsize
   use decomp_2d, only : xstart, ystart, zstart
   use decomp_2d, only : xend, yend, zend
   use decomp_2d, only : transpose_x_to_y, transpose_y_to_z, &
-      transpose_z_to_y, transpose_y_to_x
+      transpose_z_to_y, transpose_y_to_x, alloc_x
 
   use variables, only : numscalar
 
@@ -74,7 +74,6 @@ module sandbox
 
   subroutine geomcomplex_sandbox(epsi, nxi, nxf, ny, nyi, nyf, nzi, nzf, yp, remp)
 
-    use decomp_2d, only : mytype, xstart, xend
     use decomp_2d_io, only : decomp_2d_read_one
     use param, only : one, two
     use variables, only : nx, nz
@@ -111,7 +110,6 @@ subroutine boundary_conditions_sandbox (ux, uy, uz, phi1)
 
     USE param
     USE variables
-    USE decomp_2d
     USE MPI
 
     implicit none
@@ -138,7 +136,6 @@ subroutine deposit (phi1)
     !================================================================================
     USE param
     USE variables
-    USE decomp_2d
     USE MPI
     USE var, only : phi2, phi3, ta2, di2
 
@@ -196,7 +193,6 @@ subroutine inflow (phi)
 
     USE param
     USE variables
-    USE decomp_2d
 
     implicit none
 
@@ -226,7 +222,6 @@ subroutine outflow (ux,uy,uz,phi)
 
     USE param
     USE variables
-    USE decomp_2d
     USE MPI
 
     implicit none
@@ -301,7 +296,6 @@ subroutine flow_rate_control (ux)
     !================================================================================
     USE param
     USE variables
-    USE decomp_2d
     USE MPI
 
     implicit none
@@ -336,7 +330,6 @@ subroutine flow_rate_control_SZA (ux, uy, uz)
     !================================================================================
     USE param
     USE variables
-    USE decomp_2d
     USE MPI
 
     implicit none
@@ -386,7 +379,6 @@ end subroutine flow_rate_control_SZA
   !********************************************************************
   subroutine init_sandbox (ux1,uy1,uz1,ep1,phi1,iresflg)
 
-    USE decomp_2d
     USE decomp_2d_io
     USE variables
     USE param
diff --git a/src/BC-TBL.f90 b/src/BC-TBL.f90
index 9cd7f9176..c74a90f8e 100644
--- a/src/BC-TBL.f90
+++ b/src/BC-TBL.f90
@@ -5,6 +5,8 @@
 module tbl
 
   use decomp_2d
+  use decomp_2d_constants
+  use decomp_2d_mpi
   use variables
   use param
 
@@ -317,7 +319,6 @@ end subroutine postprocess_tbl
 
   subroutine visu_tbl_init (visu_initialised)
 
-    use decomp_2d, only : mytype
     use decomp_2d_io, only : decomp_2d_register_variable
     use visu, only : io_name, output2D
     
diff --git a/src/BC-TGV.f90 b/src/BC-TGV.f90
index f0963365c..d42997f8c 100644
--- a/src/BC-TGV.f90
+++ b/src/BC-TGV.f90
@@ -4,6 +4,8 @@
 
 module tgv
 
+  USE decomp_2d_constants
+  USE decomp_2d_mpi
   USE decomp_2d
   USE variables
   USE param
@@ -22,7 +24,6 @@ module tgv
 
   subroutine init_tgv (ux1,uy1,uz1,ep1,phi1)
 
-    use decomp_2d
     use decomp_2d_io
     use variables
     use param
@@ -137,7 +138,6 @@ subroutine boundary_conditions_tgv (ux,uy,uz,phi)
 
     USE param
     USE variables
-    USE decomp_2d
 
     implicit none
 
@@ -215,7 +215,6 @@ end subroutine init_post
   !############################################################################
   subroutine postprocess_tgv(ux1,uy1,uz1,phi1,ep1)
 
-    USE decomp_2d
     USE decomp_2d_io
     USE variables, only: nx,ny,nz
     USE MPI
@@ -398,7 +397,7 @@ subroutine postprocess_tgv(ux1,uy1,uz1,phi1,ep1)
        
        if (nrank==0) then
           write(42,'(20e20.12)') (itime-1)*dt,eek,eps,eps2,enst
-          call flush(42)
+          flush(42)
        endif
     endif
 
@@ -413,7 +412,6 @@ end subroutine postprocess_tgv
   !############################################################################
   subroutine visu_tgv_init (visu_initialised)
 
-    use decomp_2d, only : mytype
     use decomp_2d_io, only : decomp_2d_register_variable
     use visu, only : io_name, output2D
     
@@ -537,7 +535,6 @@ subroutine dissipation (ta1,tb1,tc1,td1,te1,tf1,tg1,th1,ti1,diss1)
 
     USE param
     USE variables
-    USE decomp_2d
     implicit none
 
     real(mytype),dimension(xsize(1),xsize(2),xsize(3)) :: ta1,tb1,tc1,td1,te1,tf1,tg1,th1,ti1,diss1
@@ -575,7 +572,6 @@ end subroutine dissipation
   !############################################################################
   subroutine error_tgv2D(ux1,uy1,phi1)
 
-    use decomp_2d
     use MPI
     use param, only : one, two, xnu, ifirst, itime
     use variables, only : numscalar, sc
@@ -738,7 +734,6 @@ end subroutine error_tgv2D
   ! Compute the damping factors
   subroutine compute_tgv2D_errors(xdamping, ydamping, sdamping)
 
-    use decomp_2d
     use param, only : one, two, xnu, ifirst, itime, itimescheme, iimplicit
     use variables, only : numscalar, sc
     use dbg_schemes, only: exp_prec
@@ -841,7 +836,6 @@ end subroutine compute_tgv2D_errors
   ! Warning : we use the X momentum wavenumber for Y momentum and for the scalars
   subroutine compute_k2(kin, k2out)
 
-    use decomp_2d, only : mytype
     use param
     use derivx, only : alsaix, asix, bsix, csix, dsix
     use dbg_schemes, only: cos_prec
@@ -868,7 +862,6 @@ end subroutine compute_k2
   ! Compute L1, L2 and Linf norm of given 3D array
   subroutine error_L1_L2_Linf(err, l1, l2, linf)
 
-    USE decomp_2d
     USE MPI
     
     implicit none
diff --git a/src/BC-Uniform.f90 b/src/BC-Uniform.f90
index 8c1c0809f..90b5b6d79 100644
--- a/src/BC-Uniform.f90
+++ b/src/BC-Uniform.f90
@@ -4,6 +4,8 @@
 
 module uniform
 
+  USE decomp_2d_constants
+  USE decomp_2d_mpi
   USE decomp_2d
   USE variables
   USE param
@@ -26,7 +28,6 @@ subroutine init_uniform (ux1,uy1,uz1,ep1,phi1)
   !
   !*******************************************************************************
 
-    USE decomp_2d
     USE decomp_2d_io
     USE variables
     USE param
@@ -89,7 +90,6 @@ subroutine boundary_conditions_uniform (ux,uy,uz,phi)
 
     USE param
     USE variables
-    USE decomp_2d
     USE MPI
 
     implicit none
@@ -120,7 +120,6 @@ subroutine inflow (ux,uy,uz,phi)
   
     USE param
     USE variables
-    USE decomp_2d
     USE MPI
     USE var, only: ux_inflow, uy_inflow, uz_inflow
   
@@ -186,7 +185,6 @@ subroutine outflow (ux,uy,uz,phi)
 
     USE param
     USE variables
-    USE decomp_2d
     USE MPI
 
     implicit none
@@ -234,7 +232,6 @@ subroutine postprocess_uniform(ux1,uy1,uz1,ep1)
   !*******************************************************************************
 
     USE MPI
-    USE decomp_2d
     USE decomp_2d_io
     USE var, only : uvisu
     USE var, only : ta1,tb1,tc1,td1,te1,tf1,tg1,th1,ti1,di1
@@ -248,7 +245,6 @@ end subroutine postprocess_uniform
   
    subroutine visu_uniform_init (visu_initialised)
 
-    use decomp_2d, only : mytype
     use decomp_2d_io, only : decomp_2d_register_variable
     use visu, only : io_name, output2D
     
diff --git a/src/BC-User.f90 b/src/BC-User.f90
index 2f8496fc1..0f6213ea5 100644
--- a/src/BC-User.f90
+++ b/src/BC-User.f90
@@ -4,6 +4,8 @@
 
 module user_sim
 
+  USE decomp_2d_constants
+  USE decomp_2d_mpi
   USE decomp_2d
   USE variables
   USE param
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e945010c9..8acb39a56 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,5 +1,5 @@
-include_directories(${CMAKE_SOURCE_DIR}/decomp2d
-	            ${CMAKE_SOURCE_DIR}/src)
+include_directories(${2decomp_INCLUDE_DIR}
+                    ${CMAKE_SOURCE_DIR}/src)
 
 message(STATUS "CMAKE_SOURCE_DIR: ${CMAKE_SOURCE_DIR}")
 message(STATUS "PROJECT_BINARY_DIR: ${PROJECT_BINARY_DIR}")
@@ -19,7 +19,6 @@ add_executable(xcompact3d
                     BC-Cavity.f90
                     BC-Channel-flow.f90
                     BC-Cylinder.f90
-                    BC-dbg-schemes.f90
                     BC-Lock-exchange.f90
                     BC-Mixing-layer.f90
                     BC-Periodic-hill.f90
@@ -54,12 +53,13 @@ add_executable(xcompact3d
                     variables.f90
                     visu.f90
                     xcompact3d.f90)
-#add_executable(xcompact3d ${files_xcompact})
-target_include_directories(xcompact3d PRIVATE ${PROJECT_BINARY_DIR}/decomp2d)
+
 target_link_libraries(xcompact3d PRIVATE decomp2d)
+
 if (MPI_FOUND)
   target_link_libraries(xcompact3d PRIVATE MPI::MPI_Fortran)
 endif (MPI_FOUND)
+
 if(ADIOS2_FOUND)
   #target_link_libraries(xcompact3d ${ADIOS_LIBRARIES})
   target_link_libraries(xcompact3d PRIVATE adios2::fortran_mpi)
diff --git a/src/acl_controller.f90 b/src/acl_controller.f90
index 22ce01aa9..988d27b19 100644
--- a/src/acl_controller.f90
+++ b/src/acl_controller.f90
@@ -4,7 +4,8 @@
 
 module actuator_line_controller
 
-    use decomp_2d, only: mytype, nrank
+    use decomp_2d_constants, only: mytype
+    use decomp_2d_mpi, only: nrank
     use variables, only: ilist
     use param, only: itime
      
diff --git a/src/acl_elem.f90 b/src/acl_elem.f90
index 3f161f790..70faab818 100644
--- a/src/acl_elem.f90
+++ b/src/acl_elem.f90
@@ -4,7 +4,7 @@
 
 module actuator_line_element
 
-    use decomp_2d, only: mytype
+    use decomp_2d_constants, only: mytype
     use actuator_line_model_utils 
     use airfoils
     use dynstall_legacy
diff --git a/src/acl_model.f90 b/src/acl_model.f90
index 951297a46..9568bf379 100644
--- a/src/acl_model.f90
+++ b/src/acl_model.f90
@@ -4,7 +4,8 @@
 
 module actuator_line_model
 
-    use decomp_2d, only: mytype, nrank
+    use decomp_2d_constants, only: mytype
+    use decomp_2d_mpi, only : nrank
     use variables, only : ilist
     use param, only: itime
     use actuator_line_model_utils
@@ -60,7 +61,7 @@ subroutine actuator_line_model_init(Nturbines,Nactuatorlines,turbines_file,actua
       if (Ntur>0) then
          if (irestart==1) then
             ! Read the checkpoint information and rotate actuator lines accordingly
-            write(filename,"('restart',I7.7'.alm')") itime
+            write(filename,"('restart',I7.7,'.alm')") itime
             open(17,file=filename)
             read(17,*)
             do itur=1,Ntur
@@ -147,7 +148,7 @@ subroutine get_turbine_options(turbines_path)
     !
     !*******************************************************************************
 
-      use decomp_2d, only: mytype
+      use decomp_2d_constants, only : mytype
       use param, only: u1,u2
       use param, only: zero, zpthree, zptwoone, zpfive, one, twentyone
       use constants
@@ -468,7 +469,7 @@ subroutine actuator_line_model_update(current_time,dt)
     !
     !*******************************************************************************
 
-      use decomp_2d, only: mytype
+      use decomp_2d_constants, only : mytype
       use param, only: zpfive, two, sixty, onehundredeighty
       use dbg_schemes, only: sin_prec
 
@@ -667,7 +668,7 @@ subroutine actuator_line_model_write_restart()
       character(len=30) :: filename
 
       if (Ntur>0) then
-         write(filename,"('restart',I7.7'.alm')") itime
+         write(filename,"('restart',I7.7,'.alm')") itime
          open(2021,file=filename)
          write(2021,*) 'Azimuthal angle, Angular velocity, Collective blade pitch'
          do itur=1,Ntur
diff --git a/src/acl_source.f90 b/src/acl_source.f90
index 5d609b0cb..1cf18a85c 100644
--- a/src/acl_source.f90
+++ b/src/acl_source.f90
@@ -4,8 +4,8 @@
 
 module actuator_line_source
 
-    use decomp_2d, only: mytype
-    use decomp_2d, only: real_type
+    use decomp_2d_constants, only : mytype
+    use decomp_2d_constants, only: real_type
     use variables, only: ilist
     use param, only: itime, zero, half, one
     use dbg_schemes, only: sin_prec, sqrt_prec
@@ -257,7 +257,8 @@ subroutine Compute_Momentum_Source_Term_pointwise
     !
     !*******************************************************************************
 
-      use decomp_2d, only: nproc, xstart, xend, xsize, update_halo
+      use decomp_2d_mpi, only: nproc
+      use decomp_2d, only: xstart, xend, xsize, update_halo
       use MPI
       use param, only: dx,dy,dz,eps_factor,xnu,istret,xlx,yly,zlz
       use var, only: ux1, uy1, uz1, FTx, FTy, FTz, yp
diff --git a/src/acl_turb.f90 b/src/acl_turb.f90
index 7a6981314..5cdb68e93 100644
--- a/src/acl_turb.f90
+++ b/src/acl_turb.f90
@@ -4,8 +4,8 @@
 
 module actuator_line_turbine
     
-    use decomp_2d, only: mytype, nrank
-    use decomp_2d, only: real_type
+    use decomp_2d_constants, only: mytype, real_type
+    use decomp_2d_mpi, only: nrank
     use variables, only: ilist
     use param, only: itime, zero, zpone, half, one, two, onethousand
     use dbg_schemes, only: cos_prec, sin_prec, abs_prec, exp_prec, acos_prec, sqrt_prec
diff --git a/src/acl_utils.f90 b/src/acl_utils.f90
index a64ccc36d..1c449b66e 100644
--- a/src/acl_utils.f90
+++ b/src/acl_utils.f90
@@ -4,7 +4,7 @@
 
 module actuator_line_model_utils
 
-    use decomp_2d, only: mytype
+    use decomp_2d_constants, only: mytype
     use param, only: zero, one, two
     use dbg_schemes, only: sqrt_prec, cos_prec, exp_prec, sin_prec
     
diff --git a/src/adm.f90 b/src/adm.f90
index 659e5b50d..8f194d0b7 100644
--- a/src/adm.f90
+++ b/src/adm.f90
@@ -4,8 +4,8 @@
 
 module actuator_disc_model
 
-    use decomp_2d, only: mytype, nrank
-    use decomp_2d, only: real_type
+    use decomp_2d_mpi, only: nrank, nproc
+    use decomp_2d_constants, only: mytype, real_type
     use actuator_line_model_utils 
     use airfoils
 
@@ -165,7 +165,7 @@ subroutine actuator_disc_model_compute_source(ux1,uy1,uz1)
     !
     !*******************************************************************************
         
-      use decomp_2d, only: mytype, nproc, xsize
+      use decomp_2d, only: xsize
       use MPI
       use param, only: dx, dy, dz, dt, itime, initstat, rho_air, T_relax, dBL, ustar
       use var, only: Fdiscx, Fdiscy, Fdiscz, GammaDisc
diff --git a/src/airfoils.f90 b/src/airfoils.f90
index 1059cd5ff..3a3d1e0c0 100644
--- a/src/airfoils.f90
+++ b/src/airfoils.f90
@@ -4,7 +4,7 @@
 
 module Airfoils
 
-    use decomp_2d, only: mytype
+    use decomp_2d_constants, only: mytype
     use actuator_line_model_utils
     use constants
 
diff --git a/src/case.f90 b/src/case.f90
index 37b9d9bbf..d420415f6 100644
--- a/src/case.f90
+++ b/src/case.f90
@@ -5,7 +5,7 @@
 module case
 
   use param
-  use decomp_2d
+  use decomp_2d_constants
   use variables
 
   use user_sim
@@ -207,7 +207,7 @@ end subroutine boundary_conditions
   !##################################################################
   subroutine preprocessing(rho1, ux1, uy1, uz1, pp3, phi1, ep1)
 
-    use decomp_2d, only : mytype, xsize, ph1
+    use decomp_2d, only : xsize, ph1
     use visu, only  : write_snapshot
     use stats, only : overall_statistic
 
@@ -230,7 +230,7 @@ end subroutine preprocessing
   !##################################################################
   subroutine postprocessing(rho1, ux1, uy1, uz1, pp3, phi1, ep1)
 
-    use decomp_2d, only : mytype, xsize, ph1
+    use decomp_2d, only : xsize, ph1
     use var, only : nzmsize, numscalar, nrhotime, npress, abl_T
 
     real(mytype),dimension(xsize(1),xsize(2),xsize(3)), intent(in) :: ux1, uy1, uz1
@@ -256,7 +256,7 @@ end subroutine postprocessing
   !##################################################################
   subroutine run_postprocessing(rho1, ux1, uy1, uz1, pp3, phi1, ep1)
 
-    use decomp_2d, only : mytype, xsize, ph1
+    use decomp_2d, only : xsize, ph1
     use visu, only  : write_snapshot, end_snapshot
     use stats, only : overall_statistic
 
diff --git a/src/constants.f90 b/src/constants.f90
index d57cd4c82..114831df3 100644
--- a/src/constants.f90
+++ b/src/constants.f90
@@ -4,7 +4,7 @@
 
 module constants
 
-    use decomp_2d, only: mytype
+    use decomp_2d_constants, only: mytype
     use param, only: onehundredeighty
 
     ! Mathematical constants
diff --git a/src/dynstall.f90 b/src/dynstall.f90
index 594b6b0f4..ed2f4b524 100644
--- a/src/dynstall.f90
+++ b/src/dynstall.f90
@@ -4,7 +4,8 @@
 
 module dynstall
     
-    use decomp_2d, only: nrank, mytype
+    use decomp_2d_mpi, only: nrank
+    use decomp_2d_constants, only : mytype
     use airfoils
 
     implicit none
diff --git a/src/dynstall_legacy.f90 b/src/dynstall_legacy.f90
index cb1e1467d..1683e958c 100644
--- a/src/dynstall_legacy.f90
+++ b/src/dynstall_legacy.f90
@@ -82,7 +82,7 @@ subroutine LB_EvalIdealCL(AOA,AOA0,CLa,RefFlag,CLID)
     !
     !*******************************************************************************
 
-      use decomp_2d, only: mytype
+      use decomp_2d_constants, only : mytype
       use param, only: one, two, thirty
       use dbg_schemes, only: sin_prec
     
@@ -153,7 +153,7 @@ subroutine LB_UpdateStates(lb,airfoil,Re,ds)
     !
     !*******************************************************************************
         
-      use decomp_2d, only: mytype
+      use decomp_2d_constants, only : mytype
       use param, only: half, one, two, three, four, eleven
       use dbg_schemes, only: exp_prec
 
@@ -309,7 +309,7 @@ subroutine LB_DynStall(airfoil,lb,CLstat,CDstat,alphaL,alpha5,Re,CL,CD)
     !
     !*******************************************************************************
 
-      use decomp_2d, only: mytype
+      use decomp_2d_constants, only : mytype
       use param, only: zero, zpone, zptwofive, one, two, four, fifty
       use dbg_schemes, only: abs_prec, sqrt_prec, sin_prec, cos_prec, exp_prec
 
diff --git a/src/filters.f90 b/src/filters.f90
index d71e96fe0..cf34259b0 100644
--- a/src/filters.f90
+++ b/src/filters.f90
@@ -63,7 +63,8 @@ subroutine set_filter_coefficients(af,alfa1,a1,b1,c1,d1,alfa2,a2,b2,c2,d2,alfa3,
      alfan,an,bn,cn,dn,alfam,am,bm,cm,dm,alfap,ap,bp,cp,dp,ep,fp,&
      alfai,ai,bi,ci,di,ff,fs,fw,ffp,fsp,fwp,n,ncl1,ncln)
 
-  use decomp_2d, only : mytype, nrank
+  use decomp_2d_constants, only : mytype
+  use decomp_2d_mpi, only : nrank
   use param
 
   implicit none
diff --git a/src/forces.f90 b/src/forces.f90
index 66d274c45..42b79dc20 100644
--- a/src/forces.f90
+++ b/src/forces.f90
@@ -13,7 +13,11 @@
 !=======================================================================
 
 module forces
+
+  use decomp_2d_constants
+  use decomp_2d_mpi
   USE decomp_2d
+
   implicit none
 
   integer :: nvol,iforces
@@ -30,7 +34,6 @@ module forces
 
   subroutine init_forces
 
-    USE decomp_2d
     USE decomp_2d_io, only : decomp_2d_register_variable, decomp_2d_init_io
     USE param
     USE variables
@@ -118,7 +121,6 @@ end subroutine init_forces
 
   subroutine restart_forces(itest1)
 
-    USE decomp_2d
     USE decomp_2d_io
     USE variables
     USE param
@@ -180,7 +182,6 @@ subroutine force(ux1,uy1,ep1)
 
     USE param
     USE variables
-    USE decomp_2d
     USE MPI
     USE ibm_param
 
@@ -506,12 +507,12 @@ subroutine force(ux1,uy1,ep1)
        !     endif
        if (nrank .eq. 0) then
           write(38,*) t,xDrag_mean,yLift_mean
-          call flush(38)
+          flush(38)
        endif
        if (mod(itime, icheckpoint).eq.0) then
           if (nrank .eq. 0) then
              write(filename,"('forces.dat',I7.7)") itime
-             call system("cp forces.dat " //filename)
+             call execute_command_line ("cp forces.dat "//filename)
           endif
        endif
     enddo
diff --git a/src/genepsi3d.f90 b/src/genepsi3d.f90
index c907260e3..60a3c33ca 100644
--- a/src/genepsi3d.f90
+++ b/src/genepsi3d.f90
@@ -4,6 +4,9 @@
 
 module genepsi
 
+  use decomp_2d_constants
+  use decomp_2d_mpi
+
   public
 
 contains
@@ -11,7 +14,7 @@ module genepsi
   subroutine epsi_init(ep1)
 
     USE param, only : zero, one, dx, dz
-    USE decomp_2d, only : xstart, xend, xsize, mytype, nrank
+    USE decomp_2d, only : xstart, xend, xsize
     !USE decomp_2d_io
     USE variables, only : yp, ny
 
@@ -28,7 +31,7 @@ subroutine epsi_init(ep1)
     if (nrank==0) then
       inquire(file="geometry", exist=dir_exists)
       if (.not.dir_exists) then
-        call system("mkdir geometry 2> /dev/null")
+        call execute_command_line("mkdir geometry 2> /dev/null")
       end if
     end if
     !###################################################################
@@ -47,7 +50,6 @@ subroutine geomcomplex(epsi, nxi, nxf, ny, nyi, nyf, nzi, nzf, dx, yp, dz, remp)
 
     USE param, ONLY : itype, itype_cyl, itype_hill, itype_channel,&
                       itype_sandbox, itype_pipe
-    USE decomp_2d, ONLY : mytype
     USE cyl, ONLY : geomcomplex_cyl
     USE hill, ONLY : geomcomplex_hill
     USE channel, ONLY : geomcomplex_channel
@@ -121,11 +123,11 @@ subroutine genepsi3d(ep1)
     if (nrank==0) then
       inquire(file="data", exist=dir_exists)
       if (.not.dir_exists) then
-        call system("mkdir data 2> /dev/null")
+        call execute_command_line("mkdir data 2> /dev/null")
       end if
       inquire(file="data/geometry", exist=dir_exists)
       if (.not.dir_exists) then
-        call system("mkdir data/geometry 2> /dev/null")
+        call execute_command_line("mkdir data/geometry 2> /dev/null")
       end if
     end if
     !###################################################################
diff --git a/src/ibm.f90 b/src/ibm.f90
index 7f05a985d..8509f272a 100644
--- a/src/ibm.f90
+++ b/src/ibm.f90
@@ -4,6 +4,9 @@
 
 module ibm
 
+  use decomp_2d_constants
+  use decomp_2d_mpi, only : nrank
+
   public
 
 contains
@@ -48,7 +51,7 @@ end subroutine corgp_IBM
   !############################################################################
   subroutine body(ux1,uy1,uz1,ep1)
     use param, only : zero, one, dx, dz
-    use decomp_2d, only : xstart, xend, xsize, mytype, nrank
+    use decomp_2d, only : xstart, xend, xsize
     !use decomp_2d_io
     use variables, only : ny
     implicit none
@@ -1026,7 +1029,7 @@ end subroutine ana_x_cyl
 SUBROUTINE analitic_x(j,x_pos,ana_res,k)
 
   USE param, ONLY : itype, itype_cyl
-  USE decomp_2d, ONLY : mytype
+  USE decomp_2d_constants, ONLY : mytype
 !  USE cyl, ONLY : geomcomplex_cyl
 
   IMPLICIT NONE
@@ -1046,7 +1049,7 @@ END SUBROUTINE analitic_x
 SUBROUTINE analitic_y(i,y_pos,ana_res,k)
 
   USE param, ONLY : itype, itype_cyl
-  USE decomp_2d, ONLY : mytype
+  USE decomp_2d_constants, ONLY : mytype
 !  USE cyl, ONLY : geomcomplex_cyl
 
   IMPLICIT NONE
diff --git a/src/implicit.f90 b/src/implicit.f90
index ec5320f3d..55ff9b474 100644
--- a/src/implicit.f90
+++ b/src/implicit.f90
@@ -4,7 +4,7 @@
 
 module ludecomp
 
-  use decomp_2d, only : mytype
+  use decomp_2d_constants, only : mytype
 
   implicit none
 
@@ -36,7 +36,7 @@ module ludecomp
   subroutine ludecomp7_12(aam,bbm,ccm,ddm,eem,qqm,ggm,hhm,ssm,rrm,vvm,wwm,zzm,ny)
 !
 !*******************************************************************
-    use decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     USE param
     implicit none
     
@@ -83,7 +83,7 @@ end subroutine ludecomp7_12
   subroutine ludecomp7_0(aam,bbm,ccm,ddm,eem,qqm,ggm,hhm,ssm,rrm,vvm,wwm,zzm,l1m,l2m,l3m,u1m,u2m,u3m,ny)
 !
 !*******************************************************************
-    use decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     USE param
     
     implicit none
@@ -207,7 +207,7 @@ end subroutine ludecomp7_0
   subroutine ludecomp9_12(aam,bbm,ccm,ddm,eem,qqm,ggm,hhm,ssm,rrm,vvm,wwm,zzm,ttm,uum,sssm,zzzm,ny)
 !
 !*******************************************************************
-    use decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     use param
 
     implicit none
@@ -268,7 +268,7 @@ end subroutine ludecomp9_12
   subroutine ludecomp9_0(aam,bbm,ccm,ddm,eem,qqm,ggm,hhm,ssm,rrm,vvm,wwm,zzm,l1m,l2m,l3m,u1m,u2m,u3m,ny)
 !
 !*******************************************************************
-    use decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     use param
     USE MPI
 
@@ -302,7 +302,7 @@ end module ludecomp
 !
 module matinv
 
-  use decomp_2d, only : mytype
+  use decomp_2d_constants, only : mytype
 
   implicit none
 
@@ -332,7 +332,7 @@ module matinv
   subroutine septinv_12(xsol,bbb,ggm,hhm,ssm,rrm,vvm,wwm,zzm,nx,ny,nz)
     !
     !********************************************************************
-    use decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
 
     implicit none
 
@@ -390,7 +390,7 @@ end subroutine septinv_12
   subroutine septinv_0(xsol,bbb,ggm,hhm,ssm,rrm,vvm,wwm,zzm,l1m,l2m,l3m,u1m,u2m,u3m,nx,ny,nz)
 !    
 !********************************************************************
-    use decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
 
     implicit none
 
@@ -457,7 +457,7 @@ end subroutine septinv_0
   subroutine nonainv_12(xSol,bbb,ggm,hhm,ssm,sssm,ttm,zzzm,zzm,wwm,vvm,nx,ny,nz)
 !
 !********************************************************************
-    use decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
 
     implicit none
 
@@ -505,7 +505,7 @@ end subroutine nonainv_12
   subroutine nonainv_0(xsol,bbb,ggm,hhm,ssm,rrm,vvm,wwm,zzm,l1m,l2m,l3m,u1m,u2m,u3m,nx,ny,nz)
     !
     !********************************************************************
-    use decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     USE MPI
     
     implicit none
@@ -520,6 +520,8 @@ subroutine nonainv_0(xsol,bbb,ggm,hhm,ssm,rrm,vvm,wwm,zzm,l1m,l2m,l3m,u1m,u2m,u3
     real(mytype),dimension(ny), intent(in) :: l1m,l2m,l3m
     real(mytype),dimension(ny), intent(in) :: u1m,u2m,u3m
 
+    xsol = 0._mytype
+
     write(*,*) 'NOT READY YET! SIMULATION IS STOPPED!'
     call MPI_ABORT(MPI_COMM_WORLD,code,ierror); stop
 
@@ -551,6 +553,7 @@ subroutine  inttimp (var1,dvar1,npaire,isc,forcing1)
   USE variables
   USE var, ONLY: ta1, ta2, tb2, tc2, td2
   USE decomp_2d
+  use decomp_2d_mpi
   use derivY
   use matinv
 
@@ -2224,7 +2227,7 @@ end subroutine init_implicit
 !
 subroutine init_implicit_coef(tab1d, tab2d)
 
-  use decomp_2d, only : mytype
+  use decomp_2d_constants, only : mytype
   use variables, only : ny, numscalar
 
   implicit none
diff --git a/src/les_models.f90 b/src/les_models.f90
index faf0919bc..44bdf46b0 100644
--- a/src/les_models.f90
+++ b/src/les_models.f90
@@ -4,6 +4,9 @@
 
 module les
 
+  use decomp_2d_constants
+  use decomp_2d_mpi
+  use decomp_2d
   use visu, only : gen_filename, output2D
 
   character(len=*), parameter :: io_turb = "turb-io", &
@@ -21,7 +24,6 @@ subroutine init_explicit_les
 
     USE param
     USE variables
-    USE decomp_2d
     use decomp_2d_io, only : decomp_2d_init_io, decomp_2d_register_variable, decomp_2d_open_io, decomp_2d_write_mode
 
     implicit none
@@ -102,7 +104,6 @@ subroutine compute_SGS(sgsx1,sgsy1,sgsz1,ux1,uy1,uz1,phi1,ep1)
 
     USE param
     USE variables
-    USE decomp_2d
     USE decomp_2d_io
     use var, only: nut1
     USE abl, only: wall_sgs_slip, wall_sgs_noslip
@@ -173,7 +174,6 @@ subroutine smag(nut1,ux1,uy1,uz1)
     use MPI
     USE param
     USE variables
-    USE decomp_2d
     USE decomp_2d_io
     USE var, only : ta1,tb1,tc1,td1,te1,tf1,tg1,th1,ti1,di1
     USE var, only : ux2,uy2,uz2,ta2,tb2,tc2,td2,te2,tf2,tg2,th2,ti2,di2
@@ -339,7 +339,6 @@ subroutine dynsmag(nut1,ux1,uy1,uz1,ep1)
 
     USE param
     USE variables
-    USE decomp_2d
     USE decomp_2d_io
     USE MPI
     USE var, only : ta1,tb1,tc1,td1,te1,tf1,tg1,th1,ti1,di1
@@ -917,7 +916,6 @@ subroutine wale(nut1,ux1,uy1,uz1)
 
   USE param
   USE variables
-  USE decomp_2d
   USE decomp_2d_io
   USE var, only : ta1,tb1,tc1,td1,te1,tf1,tg1,th1,ti1,di1
   USE var, only : ux2,uy2,uz2,ta2,tb2,tc2,td2,te2,tf2,tg2,th2,ti2,di2
@@ -1099,7 +1097,6 @@ subroutine sgs_mom_nonconservative(sgsx1,sgsy1,sgsz1,ux1,uy1,uz1,nut1,ep1)
 
     USE param
     USE variables
-    USE decomp_2d
     USE var, only : ta1,tb1,tc1,td1,te1,tf1,tg1,th1,ti1,di1
     USE var, only : ux2,uy2,uz2,ta2,tb2,tc2,td2,te2,tf2,tg2,th2,ti2,tj2,di2
     USE var, only : ux3,uy3,uz3,ta3,tb3,tc3,td3,te3,tf3,tg3,th3,ti3,di3
@@ -1254,7 +1251,6 @@ subroutine sgs_scalar_nonconservative(sgsphi1,nut1,phi1,is)
 
     USE param
     USE variables
-    USE decomp_2d
 
     USE var, only: di1,tb1,di2,tb2,di3,tb3,tc1,tc2,tc3
     USE abl, only: wall_sgs_slip_scalar
@@ -1329,7 +1325,6 @@ subroutine sgs_mom_conservative(sgsx1,sgsy1,sgsz1,ux1,uy1,uz1,nut1)
 
     USE param
     USE variables
-    USE decomp_2d
     use MPI
     USE var, only : ta1,tb1,tc1,di1
     USE var, only : ta2,tb2,tc2,di2
diff --git a/src/module_param.f90 b/src/module_param.f90
index 3b97a8e89..be0b9da04 100644
--- a/src/module_param.f90
+++ b/src/module_param.f90
@@ -5,7 +5,7 @@
 module variables
   !USE param
   !USE var
-  use decomp_2d, only : mytype
+  use decomp_2d_constants, only : mytype
 
   ! Boundary conditions : ncl = 2 --> Dirichlet
   ! Boundary conditions : ncl = 1 --> Free-slip
@@ -129,7 +129,7 @@ module variables
 
   ABSTRACT INTERFACE
      SUBROUTINE DERIVATIVE_X(t,u,r,s,ff,fs,fw,nx,ny,nz,npaire,lind)
-       use decomp_2d, only : mytype
+       use decomp_2d_constants, only : mytype
        integer :: nx,ny,nz,npaire
        real(mytype), dimension(nx,ny,nz) :: t,u,r
        real(mytype), dimension(ny,nz):: s
@@ -137,7 +137,7 @@ SUBROUTINE DERIVATIVE_X(t,u,r,s,ff,fs,fw,nx,ny,nz,npaire,lind)
        real(mytype) :: lind
      END SUBROUTINE DERIVATIVE_X
      SUBROUTINE DERIVATIVE_Y(t,u,r,s,ff,fs,fw,pp,nx,ny,nz,npaire,lind)
-       use decomp_2d, only : mytype
+       use decomp_2d_constants, only : mytype
        integer :: nx,ny,nz,npaire
        real(mytype), dimension(nx,ny,nz) :: t,u,r
        real(mytype), dimension(nx,nz):: s
@@ -145,7 +145,7 @@ SUBROUTINE DERIVATIVE_Y(t,u,r,s,ff,fs,fw,pp,nx,ny,nz,npaire,lind)
        real(mytype) :: lind
      END SUBROUTINE DERIVATIVE_Y
      SUBROUTINE DERIVATIVE_YY(t,u,r,s,ff,fs,fw,nx,ny,nz,npaire,lind)
-       use decomp_2d, only : mytype
+       use decomp_2d_constants, only : mytype
        integer :: nx,ny,nz,npaire
        real(mytype), dimension(nx,ny,nz) :: t,u,r
        real(mytype), dimension(nx,nz):: s
@@ -153,7 +153,7 @@ SUBROUTINE DERIVATIVE_YY(t,u,r,s,ff,fs,fw,nx,ny,nz,npaire,lind)
        real(mytype) :: lind
      END SUBROUTINE DERIVATIVE_YY
      SUBROUTINE DERIVATIVE_Z(t,u,r,s,ff,fs,fw,nx,ny,nz,npaire,lind)
-       use decomp_2d, only : mytype
+       use decomp_2d_constants, only : mytype
        integer :: nx,ny,nz,npaire
        real(mytype), dimension(nx,ny,nz) :: t,u,r
        real(mytype), dimension(nx,ny):: s
@@ -181,7 +181,7 @@ END SUBROUTINE DERIVATIVE_Z
 
   ABSTRACT INTERFACE
      SUBROUTINE FILTER_X(t,u,r,s,ff,fs,fw,nx,ny,nz,npaire,lind)
-       use decomp_2d, only : mytype
+       use decomp_2d_constants, only : mytype
        integer :: nx,ny,nz,npaire
        real(mytype), dimension(nx,ny,nz) :: t,u,r
        real(mytype), dimension(ny,nz):: s
@@ -189,7 +189,7 @@ SUBROUTINE FILTER_X(t,u,r,s,ff,fs,fw,nx,ny,nz,npaire,lind)
        real(mytype) :: lind
      END SUBROUTINE FILTER_X
      SUBROUTINE FILTER_Y(t,u,r,s,ff,fs,fw,nx,ny,nz,npaire,lind)
-       use decomp_2d, only : mytype
+       use decomp_2d_constants, only : mytype
        integer :: nx,ny,nz,npaire
        real(mytype), dimension(nx,ny,nz) :: t,u,r
        real(mytype), dimension(nx,nz):: s
@@ -197,7 +197,7 @@ SUBROUTINE FILTER_Y(t,u,r,s,ff,fs,fw,nx,ny,nz,npaire,lind)
        real(mytype) :: lind
      END SUBROUTINE FILTER_Y
      SUBROUTINE FILTER_Z(t,u,r,s,ff,fs,fw,nx,ny,nz,npaire,lind)
-       use decomp_2d, only : mytype
+       use decomp_2d_constants, only : mytype
        integer :: nx,ny,nz,npaire
        real(mytype), dimension(nx,ny,nz) :: t,u,r
        real(mytype), dimension(nx,ny):: s
@@ -260,7 +260,7 @@ end module variables
 !############################################################################
 module param
 
-  use decomp_2d, only : mytype
+  use decomp_2d_constants, only : mytype
 
   integer :: nclx1,nclxn,ncly1,nclyn,nclz1,nclzn
   integer :: nclxS1,nclxSn,nclyS1,nclySn,nclzS1,nclzSn
@@ -501,7 +501,7 @@ end module param
 !############################################################################
 module complex_geometry
 
-  use decomp_2d,only : mytype
+  use decomp_2d_constants,only : mytype
   use variables,only : nx,ny,nz,nxm,nym,nzm
 
   integer     ,allocatable,dimension(:,:)   :: nobjx,nobjy,nobjz
@@ -513,7 +513,7 @@ end module complex_geometry
 !############################################################################
 module derivX
 
-  use decomp_2d, only : mytype
+  use decomp_2d_constants, only : mytype
 
   real(mytype) :: alcaix6,acix6,bcix6
   real(mytype) :: ailcaix6,aicix6,bicix6,cicix6,dicix6
@@ -533,7 +533,7 @@ end module derivX
 !############################################################################
 module derivY
 
-  use decomp_2d, only : mytype
+  use decomp_2d_constants, only : mytype
 
   real(mytype) :: alcaiy6,aciy6,bciy6
   real(mytype) :: ailcaiy6,aiciy6,biciy6,ciciy6,diciy6
@@ -553,7 +553,7 @@ end module derivY
 !############################################################################
 module derivZ
 
-  use decomp_2d, only : mytype
+  use decomp_2d_constants, only : mytype
 
   real(mytype) :: alcaiz6,aciz6,bciz6
   real(mytype) :: ailcaiz6,aiciz6,biciz6,ciciz6,diciz6
@@ -574,7 +574,7 @@ end module derivZ
 !############################################################################
 ! Describes the parameters for the discrete filters in X-Pencil
 module parfiX
-  use decomp_2d, only : mytype
+  use decomp_2d_constants, only : mytype
   real(mytype) :: fial1x, fia1x, fib1x, fic1x, fid1x, fie1x, fif1x  ! Coefficients for filter at boundary point 1
   real(mytype) :: fial2x, fia2x, fib2x, fic2x, fid2x, fie2x, fif2x  ! Coefficients for filter at boundary point 2
   real(mytype) :: fial3x, fia3x, fib3x, fic3x, fid3x, fie3x, fif3x  ! Coefficients for filter at boundary point 3
@@ -587,7 +587,7 @@ end module parfiX
 !############################################################################
 module parfiY
 
-  use decomp_2d, only : mytype
+  use decomp_2d_constants, only : mytype
   real(mytype) :: fial1y, fia1y, fib1y, fic1y, fid1y, fie1y, fif1y ! Coefficients for filter at boundary point 1
   real(mytype) :: fial2y, fia2y, fib2y, fic2y, fid2y, fie2y, fif2y ! Coefficients for filter at boundary point 2
   real(mytype) :: fial3y, fia3y, fib3y, fic3y, fid3y, fie3y, fif3y ! Coefficients for filter at boundary point 3
@@ -600,7 +600,7 @@ end module parfiY
 !############################################################################
 module parfiZ
 
-  use decomp_2d, only : mytype
+  use decomp_2d_constants, only : mytype
   real(mytype) :: fial1z, fia1z, fib1z, fic1z, fid1z, fie1z, fif1z ! Coefficients for filter at boundary point 1
   real(mytype) :: fial2z, fia2z, fib2z, fic2z, fid2z, fie2z, fif2z ! Coefficients for filter at boundary point 2
   real(mytype) :: fial3z, fia3z, fib3z, fic3z, fid3z, fie3z, fif3z ! Coefficients for filter at boundary point 3
@@ -617,13 +617,14 @@ end module simulation_stats
 !############################################################################
 !############################################################################
 module ibm_param
-  use decomp_2d, only : mytype
+  use decomp_2d_constants, only : mytype
   real(mytype) :: cex,cey,cez,ra,rai,rao,ubcx,ubcy,ubcz,rads, c_air
   real(mytype) :: chord,thickness,omega
   integer :: inana ! Analytical BC as Input
   integer :: imove
 end module ibm_param
 !############################################################################
+!############################################################################
 module dbg_schemes
 
   use decomp_2d
@@ -641,14 +642,14 @@ module dbg_schemes
             sqrt_prec,  abs_prec
 
 contains
- 
+
 
   !##################################################################
   !********************************************************************
   ! Math functions for Single/double precision
   !-------------------------------------------
   function sin_prec(x) result(y)
-    USE decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     real(mytype), intent(in) :: x
     real(mytype) :: y
 #ifdef DOUBLE_PREC
@@ -659,7 +660,7 @@ function sin_prec(x) result(y)
   end function sin_prec
   !-------------------------------------------
   function cos_prec(x) result(y)
-    USE decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     real(mytype), intent(in) :: x
     real(mytype) :: y
 #ifdef DOUBLE_PREC
@@ -670,7 +671,7 @@ function cos_prec(x) result(y)
   end function cos_prec
   !-------------------------------------------
   function tan_prec(x) result(y)
-    USE decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     real(mytype), intent(in) :: x
     real(mytype) :: y
 #ifdef DOUBLE_PREC
@@ -681,7 +682,7 @@ function tan_prec(x) result(y)
   end function tan_prec
   !-------------------------------------------
   function asin_prec(x) result(y)
-    USE decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     real(mytype), intent(in) :: x
     real(mytype) :: y
 #ifdef DOUBLE_PREC
@@ -692,7 +693,7 @@ function asin_prec(x) result(y)
   end function asin_prec
   !-------------------------------------------
   function acos_prec(x) result(y)
-    USE decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     real(mytype), intent(in) :: x
     real(mytype) :: y
 #ifdef DOUBLE_PREC
@@ -703,7 +704,7 @@ function acos_prec(x) result(y)
   end function acos_prec
   !-------------------------------------------
   function atan_prec(x) result(y)
-    USE decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     real(mytype), intent(in) :: x
     real(mytype) :: y
 #ifdef DOUBLE_PREC
@@ -714,7 +715,7 @@ function atan_prec(x) result(y)
   end function atan_prec
   !-------------------------------------------
   function sinh_prec(x) result(y)
-    USE decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     real(mytype), intent(in) :: x
     real(mytype) :: y
 #ifdef DOUBLE_PREC
@@ -725,7 +726,7 @@ function sinh_prec(x) result(y)
   end function sinh_prec
   !-------------------------------------------
   function cosh_prec(x) result(y)
-    USE decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     real(mytype), intent(in) :: x
     real(mytype) :: y
 #ifdef DOUBLE_PREC
@@ -736,7 +737,7 @@ function cosh_prec(x) result(y)
   end function cosh_prec
   !-------------------------------------------
   function tanh_prec(x) result(y)
-    USE decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     real(mytype), intent(in) :: x
     real(mytype) :: y
 #ifdef DOUBLE_PREC
@@ -747,7 +748,7 @@ function tanh_prec(x) result(y)
   end function tanh_prec
   !-------------------------------------------
   function exp_prec(x) result(y)
-    USE decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     real(mytype), intent(in) :: x
     real(mytype) :: y
 #ifdef DOUBLE_PREC
@@ -758,7 +759,7 @@ function exp_prec(x) result(y)
   end function exp_prec
   !-------------------------------------------
   function log_prec(x) result(y)
-    USE decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     real(mytype), intent(in) :: x
     real(mytype) :: y
 #ifdef DOUBLE_PREC
@@ -769,7 +770,7 @@ function log_prec(x) result(y)
   end function log_prec
   !-------------------------------------------
   function log10_prec(x) result(y)
-    USE decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     real(mytype), intent(in) :: x
     real(mytype) :: y
 #ifdef DOUBLE_PREC
@@ -780,7 +781,7 @@ function log10_prec(x) result(y)
   end function log10_prec
   !-------------------------------------------
   function sqrt_prec(x) result(y)
-    USE decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     real(mytype), intent(in) :: x
     real(mytype) :: y
 #ifdef DOUBLE_PREC
@@ -791,7 +792,7 @@ function sqrt_prec(x) result(y)
   end function sqrt_prec
   !-------------------------------------------
   function abs_prec(x) result(y)
-    USE decomp_2d, only : mytype
+    use decomp_2d_constants, only : mytype
     real(mytype), intent(in) :: x
     real(mytype) :: y
 #ifdef DOUBLE_PREC
@@ -800,5 +801,4 @@ function abs_prec(x) result(y)
     y = abs(x)
 #endif
   end function abs_prec
-  end module dbg_schemes
-
+end module dbg_schemes
diff --git a/src/navier.f90 b/src/navier.f90
index 60d8ac8da..a0910ecfe 100644
--- a/src/navier.f90
+++ b/src/navier.f90
@@ -4,6 +4,10 @@
 
 module navier
 
+  use decomp_2d_constants
+  use decomp_2d_mpi
+  use decomp_2d
+
   implicit none
 
   private
@@ -22,7 +26,6 @@ module navier
   !############################################################################
   SUBROUTINE solve_poisson(pp3, px1, py1, pz1, rho1, ux1, uy1, uz1, ep1, drho1, divu3)
 
-    USE decomp_2d, ONLY : mytype, xsize, zsize, ph1, nrank, real_type
     USE decomp_2d_poisson, ONLY : poisson
     USE var, ONLY : nzmsize
     USE var, ONLY : dv3
@@ -150,7 +153,6 @@ END SUBROUTINE solve_poisson
   !############################################################################
   SUBROUTINE lmn_t_to_rho_trans(drho1, dtemp1, rho1, dphi1, phi1)
 
-    USE decomp_2d
     USE param, ONLY : zero
     USE param, ONLY : imultispecies, massfrac, mol_weight
     USE param, ONLY : ntime
@@ -204,7 +206,6 @@ SUBROUTINE lmn_t_to_rho_trans(drho1, dtemp1, rho1, dphi1, phi1)
   !############################################################################
   subroutine cor_vel (ux,uy,uz,px,py,pz)
 
-    USE decomp_2d
     USE variables
     USE param
     USE mpi
@@ -257,7 +258,6 @@ end subroutine cor_vel
   subroutine divergence (pp3,rho1,ux1,uy1,uz1,ep1,drho1,divu3,nlock)
 
     USE param
-    USE decomp_2d
     USE variables
     USE var, ONLY: ta1, tb1, tc1, pp1, pgy1, pgz1, di1, &
          duxdxp2, uyp2, uzp2, duydypi2, upi2, ta2, dipp2, &
@@ -384,7 +384,6 @@ end subroutine divergence
   subroutine gradp(px1,py1,pz1,pp3)
 
     USE param
-    USE decomp_2d
     USE variables
     USE MPI
     USE var, only: pp1,pgy1,pgz1,di1,pp2,ppi2,pgy2,pgz2,pgzi2,dip2,&
@@ -500,7 +499,6 @@ end subroutine gradp
   !############################################################################
   subroutine pre_correc(ux,uy,uz,ep)
 
-    USE decomp_2d
     USE variables
     USE param
     USE var
@@ -762,7 +760,6 @@ end subroutine pre_correc
   !! Convert to/from conserved/primary variables
   SUBROUTINE primary_to_conserved(rho1, var1)
 
-    USE decomp_2d, ONLY : mytype, xsize
     USE param, ONLY : nrhotime
 
     IMPLICIT NONE
@@ -777,7 +774,6 @@ SUBROUTINE primary_to_conserved(rho1, var1)
   !############################################################################
   SUBROUTINE velocity_to_momentum (rho1, ux1, uy1, uz1)
 
-    USE decomp_2d, ONLY : mytype, xsize
     USE param, ONLY : nrhotime
     USE var, ONLY : ilmn
 
@@ -799,7 +795,6 @@ SUBROUTINE velocity_to_momentum (rho1, ux1, uy1, uz1)
   !############################################################################
   SUBROUTINE conserved_to_primary(rho1, var1)
 
-    USE decomp_2d, ONLY : mytype, xsize
     USE param, ONLY : nrhotime
 
     IMPLICIT NONE
@@ -814,7 +809,6 @@ SUBROUTINE conserved_to_primary(rho1, var1)
   !############################################################################
   SUBROUTINE momentum_to_velocity (rho1, ux1, uy1, uz1)
 
-    USE decomp_2d, ONLY : mytype, xsize
     USE param, ONLY : nrhotime
     USE var, ONLY : ilmn
 
@@ -837,8 +831,6 @@ SUBROUTINE momentum_to_velocity (rho1, ux1, uy1, uz1)
   !! Calculate velocity-divergence constraint
   SUBROUTINE calc_divu_constraint(divu3, rho1, phi1)
 
-    USE decomp_2d, ONLY : mytype, xsize, ysize, zsize
-    USE decomp_2d, ONLY : transpose_x_to_y, transpose_y_to_z
     USE param, ONLY : nrhotime, zero, ilmn, pressure0, imultispecies, massfrac, mol_weight
     USE param, ONLY : ibirman_eos
     USE param, ONLY : xnu, prandtl
@@ -978,7 +970,6 @@ SUBROUTINE calc_divu_constraint(divu3, rho1, phi1)
   ! Calculate extrapolation drhodt 
   SUBROUTINE extrapol_drhodt(drhodt1_next, rho1, drho1)
 
-    USE decomp_2d, ONLY : mytype, xsize, nrank
     USE param, ONLY : ntime, nrhotime, itime, itimescheme, itr, dt, gdt, irestart
     USE param, ONLY : half, three, four
     USE param, ONLY : ibirman_eos
@@ -1039,8 +1030,6 @@ SUBROUTINE extrapol_drhodt(drhodt1_next, rho1, drho1)
 
   SUBROUTINE birman_drhodt_corr(drhodt1_next, rho1)
 
-    USE decomp_2d, ONLY : mytype, xsize, ysize, zsize
-    USE decomp_2d, ONLY : transpose_x_to_y, transpose_y_to_z, transpose_z_to_y, transpose_y_to_x
     USE variables, ONLY : derxx, deryy, derzz
     USE param, ONLY : nrhotime
     USE param, ONLY : xnu, prandtl
@@ -1088,7 +1077,6 @@ SUBROUTINE birman_drhodt_corr(drhodt1_next, rho1)
   SUBROUTINE test_varcoeff(converged, divup3norm, pp3, dv3, atol, rtol, poissiter)
 
     USE MPI
-    USE decomp_2d, ONLY: mytype, ph1, real_type, nrank
     USE var, ONLY : nzmsize
     USE param, ONLY : npress, itime
     USE variables, ONLY : nxm, nym, nzm, ilist
@@ -1160,8 +1148,6 @@ SUBROUTINE calc_varcoeff_rhs(pp3, rho1, px1, py1, pz1, dv3, drho1, ep1, divu3, r
 
     USE MPI
 
-    USE decomp_2d
-
     USE param, ONLY : nrhotime, ntime
     USE param, ONLY : one
 
@@ -1213,7 +1199,6 @@ subroutine tbl_flrt (ux1,uy1,uz1)
   !
   !********************************************************************
 
-    USE decomp_2d
     USE decomp_2d_poisson
     USE variables
     USE param
@@ -1305,7 +1290,6 @@ subroutine pipe_bulk(ux,uy,uz,ep)
   !********************************************************************
 
     use param, only: one
-    use decomp_2d, only: mytype, xsize
     use variables, only: numscalar
     use var, only: ta1, phi1
 
@@ -1337,7 +1321,6 @@ subroutine pipe_bulk_u(ux,uy,uz,ep,ub_constant,ncount)
   !
   !********************************************************************
 
-    use decomp_2d
     use variables
     use param
     use var
@@ -1408,7 +1391,6 @@ subroutine pipe_bulk_phi(phi,ux,ep,is,phib_constant,ncount)
   !
   !********************************************************************
 
-    use decomp_2d
     use decomp_2d_poisson
     use variables
     use param
@@ -1495,7 +1477,6 @@ subroutine pipe_volume_avg(var,qm,ep,ncount)
 
     use param
     use variables
-    use decomp_2d
     use MPI
     use ibm_param, only: rai
 
diff --git a/src/parameters.f90 b/src/parameters.f90
index c1e5365bc..3573f7e9b 100644
--- a/src/parameters.f90
+++ b/src/parameters.f90
@@ -20,6 +20,7 @@ subroutine parameter(input_i3d)
   use variables
   use complex_geometry
   use decomp_2d
+  use decomp_2d_mpi
   use ibm_param
   use dbg_schemes, only: sin_prec, cos_prec
 
@@ -313,7 +314,7 @@ subroutine parameter(input_i3d)
   !###########################################################################
   ! Log-output
   !###########################################################################
-  if (nrank==0) call system('mkdir data out probes 2> /dev/null')
+  if (nrank==0) call execute_command_line('mkdir data out probes 2> /dev/null')
 
 #ifdef DEBG
   if (nrank == 0) write(*,*) '# parameter input.i3d done'
@@ -532,7 +533,7 @@ subroutine parameter(input_i3d)
         else
            write(*,*)  "LMN boundedness    : Not enforced"
         endif
-        write(*,"(' dens1 and dens2    : ',F6.2' ',F6.2)") dens1, dens2
+        write(*,"(' dens1 and dens2    : ',F6.2,' ',F6.2)") dens1, dens2
         write(*,"(' Prandtl number Re  : ',F15.8)") prandtl
      endif
      if (angle.ne.0.) write(*,"(' Solid rotation     : ',F6.2)") angle
diff --git a/src/poisson.f90 b/src/poisson.f90
index af0df6876..7c62ad126 100644
--- a/src/poisson.f90
+++ b/src/poisson.f90
@@ -4,6 +4,7 @@
 
 module decomp_2d_poisson
 
+  use decomp_2d_constants
   use decomp_2d
   use decomp_2d_fft
 
@@ -51,7 +52,7 @@ module decomp_2d_poisson
 
   abstract interface
      subroutine poisson_xxx(rhs)
-       use decomp_2d, only : mytype
+       use decomp_2d_constants, only : mytype
        real(mytype), dimension(:,:,:), intent(inout) :: rhs
      end subroutine poisson_xxx
   end interface
@@ -466,7 +467,7 @@ subroutine poisson_100(rhs)
     ! normalisation
     cw1 = cw1 / real(nx, kind=mytype) /real(ny, kind=mytype) &
          / real(nz, kind=mytype)
-#ifdef DEBUG
+#ifdef DEBG
     do k = sp%xst(3), sp%xen(3)
        do j = sp%xst(2), sp%xen(2)
           do i = sp%xst(1), sp%xen(1)
@@ -488,7 +489,7 @@ subroutine poisson_100(rhs)
              tmp2 = iy(cw1(i,j,k))
              cw1(i,j,k) = cx(tmp1 * bz(k) + tmp2 * az(k), &
                              tmp2 * bz(k) - tmp1 * az(k))
-#ifdef DEBUG
+#ifdef DEBG
              if (abs_prec(cw1(i,j,k)) > 1.0e-4) &
                   write(*,100) 'after z',i,j,k,cw1(i,j,k)
 #endif
@@ -505,7 +506,7 @@ subroutine poisson_100(rhs)
              cw1(i,j,k) = cx(tmp1 * by(j) + tmp2 * ay(j), &
                              tmp2 * by(j) - tmp1 * ay(j))
              if (j > (ny/2+1)) cw1(i,j,k) = -cw1(i,j,k)
-#ifdef DEBUG
+#ifdef DEBG
              if (abs_prec(cw1(i,j,k)) > 1.0e-4) &
                   write(*,100) 'after y',i,j,k,cw1(i,j,k)
 #endif
@@ -535,7 +536,7 @@ subroutine poisson_100(rhs)
           end do
        end do
     end do
-#ifdef DEBUG
+#ifdef DEBG
     do k = sp%xst(3), sp%xen(3)
        do j = sp%xst(2), sp%xen(2)
           do i = sp%xst(1), sp%xen(1)
@@ -566,7 +567,7 @@ subroutine poisson_100(rhs)
              if ((abs_prec(tmp1) >= epsilon).and.(abs_prec(tmp2) >= epsilon)) then
                 cw1b(i,j,k)=cx(rl(cw1b(i,j,k)) / (-tmp1), iy(cw1b(i,j,k)) / (-tmp2))
              end if
-#ifdef DEBUG
+#ifdef DEBG
              if (abs_prec(cw1b(i,j,k)) > 1.0e-4) &
                   write(*,100) 'AFTER',i,j,k,cw1b(i,j,k)
 #endif
@@ -598,7 +599,7 @@ subroutine poisson_100(rhs)
           end do
        end do
     end do
-#ifdef DEBUG
+#ifdef DEBG
     do k = sp%xst(3),sp%xen(3)
        do j = sp%xst(2),sp%xen(2)
           do i = sp%xst(1),sp%xen(1)
@@ -619,7 +620,7 @@ subroutine poisson_100(rhs)
              cw1(i,j,k) = cx(tmp1 * by(j) - tmp2 * ay(j), &
                              tmp2 * by(j) + tmp1 * ay(j))
              if (j > (ny/2+1)) cw1(i,j,k) = -cw1(i,j,k)
-#ifdef DEBUG
+#ifdef DEBG
              if (abs_prec(cw1(i,j,k)) > 1.0e-4) &
                   write(*,100) 'AFTER Y',i,j,k,cw1(i,j,k)
 #endif
@@ -635,7 +636,7 @@ subroutine poisson_100(rhs)
              tmp2 = iy(cw1(i,j,k))
              cw1(i,j,k) = cx(tmp1 * bz(k) - tmp2 * az(k), &
                              tmp2 * bz(k) + tmp1 * az(k))
-#ifdef DEBUG
+#ifdef DEBG
              if (abs_prec(cw1(i,j,k)) > 1.0e-4) &
                   write(*,100) 'END',i,j,k,cw1(i,j,k)
 #endif
@@ -722,7 +723,7 @@ subroutine poisson_010(rhs)
     ! normalisation
     cw1 = cw1 / real(nx, kind=mytype) /real(ny, kind=mytype) &
          / real(nz, kind=mytype)
-#ifdef DEBUG
+#ifdef DEBG
     do k = sp%xst(3), sp%xen(3)
        do j = sp%xst(2), sp%xen(2)
           do i = sp%xst(1), sp%xen(1)
@@ -744,7 +745,7 @@ subroutine poisson_010(rhs)
              tmp2 = iy(cw1(i,j,k))
              cw1(i,j,k) = cx(tmp1 * bz(k) + tmp2 * az(k), &
                              tmp2 * bz(k) - tmp1 * az(k))
-#ifdef DEBUG
+#ifdef DEBG
              if (abs_prec(cw1(i,j,k)) > 1.0e-4) &
                   write(*,100) 'after z',i,j,k,cw1(i,j,k)
 #endif
@@ -761,7 +762,7 @@ subroutine poisson_010(rhs)
              cw1(i,j,k) = cx(tmp1 * bx(i) + tmp2 * ax(i), &
                              tmp2 * bx(i) - tmp1 * ax(i))
              if (i.gt.(nx/2+1)) cw1(i,j,k)=-cw1(i,j,k)
-#ifdef DEBUG
+#ifdef DEBG
              if (abs_prec(cw1(i,j,k)) > 1.0e-4) &
                   write(*,100) 'after x',i,j,k,cw1(i,j,k)
 #endif
@@ -794,7 +795,7 @@ subroutine poisson_010(rhs)
           end do
        end do
     end do
-#ifdef DEBUG
+#ifdef DEBG
     do k = sp%yst(3), sp%yen(3)
        do j = sp%yst(2), sp%yen(2)
           do i = sp%yst(1), sp%yen(1)
@@ -921,7 +922,7 @@ subroutine poisson_010(rhs)
           endif
        enddo
     enddo
-#ifdef DEBUG
+#ifdef DEBG
     do k = sp%yst(3), sp%yen(3)
        do j = sp%yst(2), sp%yen(2)
           do i = sp%yst(1), sp%yen(1)
@@ -960,7 +961,7 @@ subroutine poisson_010(rhs)
 
     ! Back to X-pencil
     call transpose_y_to_x(cw2,cw1,sp)
-#ifdef DEBUG
+#ifdef DEBG
     do k = sp%xst(3),sp%xen(3)
        do j = sp%xst(2),sp%xen(2)
           do i = sp%xst(1),sp%xen(1)
@@ -981,7 +982,7 @@ subroutine poisson_010(rhs)
              cw1(i,j,k) = cx(tmp1 * bx(i) - tmp2 * ax(i), &
                              tmp2 * bx(i) + tmp1 * ax(i))
              if (i > (nx/2 + 1)) cw1(i,j,k) = -cw1(i,j,k)
-#ifdef DEBUG
+#ifdef DEBG
              if (abs_prec(cw1(i,j,k)) > 1.0e-4) &
                   write(*,100) 'AFTER X',i,j,k,cw1(i,j,k)
 #endif
@@ -997,7 +998,7 @@ subroutine poisson_010(rhs)
              tmp2 = iy(cw1(i,j,k))
              cw1(i,j,k) = cx(tmp1 * bz(k) - tmp2 * az(k), &
                              tmp2 * bz(k) + tmp1 * az(k))
-#ifdef DEBUG
+#ifdef DEBG
              if (abs_prec(cw1(i,j,k)) > 1.0e-4) &
                   write(*,100) 'END',i,j,k,cw1(i,j,k)
 #endif
@@ -1186,7 +1187,7 @@ subroutine poisson_11x(rhs)
 
     ! back to X-pencil
     call transpose_y_to_x(cw2b,cw1,sp)
-#ifdef DEBUG
+#ifdef DEBG
     dep=maxval(abs(cw1))
     call MPI_ALLREDUCE(dep,dep1,1,real_type,MPI_SUM,MPI_COMM_WORLD,code)
     if (nrank == 0) write(*,*)'## Poisson11X Back to X cw1 ', dep1
@@ -1214,7 +1215,7 @@ subroutine poisson_11x(rhs)
           end do
        end do
     end do
-#ifdef DEBUG
+#ifdef DEBG
     dep=maxval(abs(cw1b))
     call MPI_ALLREDUCE(dep,dep1,1,real_type,MPI_SUM,MPI_COMM_WORLD,code)
     if (nrank == 0) write(*,*)'## Poisson11X Back to X cw1b ', cw1b
@@ -1244,7 +1245,7 @@ subroutine poisson_11x(rhs)
              end do
           end do
        end do
-#ifdef DEBUG
+#ifdef DEBG
        dep=maxval(abs(cw1b))
        call MPI_ALLREDUCE(dep,dep1,1,real_type,MPI_SUM,MPI_COMM_WORLD,code)
        if (nrank == 0) write(*,*)'## Poisson11X Solve Pois istret 0 ', dep1
@@ -1315,7 +1316,7 @@ subroutine poisson_11x(rhs)
                 enddo
              enddo
           enddo
-#ifdef DEBUG
+#ifdef DEBG
           dep=maxval(abs(cw2b))
           call MPI_ALLREDUCE(dep,dep1,1,real_type,MPI_SUM,MPI_COMM_WORLD,code)
           if (nrank == 0) write(*,*)'## Poisson11X Solve Pois istret < 3 ', dep1
@@ -1340,7 +1341,7 @@ subroutine poisson_11x(rhs)
              enddo
           enddo
        endif
-#ifdef DEBUG
+#ifdef DEBG
        dep=maxval(abs(cw2b))
        call MPI_ALLREDUCE(dep,dep1,1,real_type,MPI_SUM,MPI_COMM_WORLD,code)
        if (nrank == 0) write(*,*)'## Poisson11X Solve Pois istret = 3 ', dep1
@@ -1349,7 +1350,7 @@ subroutine poisson_11x(rhs)
        call transpose_y_to_x(cw2b,cw1b,sp)
     endif
 
-#ifdef DEBUG
+#ifdef DEBG
     dep=maxval(abs(cw1b))
     call MPI_ALLREDUCE(dep,dep1,1,real_type,MPI_SUM,MPI_COMM_WORLD,code)
     if (nrank == 0) write(*,*)'## Poisson11X Solve Pois AFTER ', dep1
@@ -1378,7 +1379,7 @@ subroutine poisson_11x(rhs)
           end do
        end do
     end do
-#ifdef DEBUG
+#ifdef DEBG
     dep=maxval(abs(cw1))
     call MPI_ALLREDUCE(dep,dep1,1,real_type,MPI_SUM,MPI_COMM_WORLD,code)
     if (nrank == 0) write(*,*)'## Poisson11X Solve Pois POSTPR X ', dep1
@@ -1408,7 +1409,7 @@ subroutine poisson_11x(rhs)
           end do
        end do
     end do
-#ifdef DEBUG
+#ifdef DEBG
     dep=maxval(abs(cw2b))
     call MPI_ALLREDUCE(dep,dep1,1,real_type,MPI_SUM,MPI_COMM_WORLD,code)
     if (nrank == 0) write(*,*)'## Poisson11X Solve Pois POSTPR Y ', dep1
@@ -1427,7 +1428,7 @@ subroutine poisson_11x(rhs)
           end do
        end do
     end do
-#ifdef DEBUG
+#ifdef DEBG
     dep=maxval(abs(cw1))
     call MPI_ALLREDUCE(dep,dep1,1,real_type,MPI_SUM,MPI_COMM_WORLD,code)
     if (nrank == 0) write(*,*)'## Poisson11X Solve Pois POSTPR Z ', dep1
@@ -1435,7 +1436,7 @@ subroutine poisson_11x(rhs)
 
     ! compute c2r transform, back to physical space
     call decomp_2d_fft_3d(cw1,rhs)
-#ifdef DEBUG
+#ifdef DEBG
     dep=maxval(abs(rhs))
     call MPI_ALLREDUCE(dep,dep1,1,real_type,MPI_SUM,MPI_COMM_WORLD,code)
     if (nrank == 0) write(*,*)'## Poisson11X Solve Pois Back Phy RHS ', dep1
diff --git a/src/probes.f90 b/src/probes.f90
index 7de100069..e70e9e07f 100644
--- a/src/probes.f90
+++ b/src/probes.f90
@@ -13,9 +13,11 @@
 
 module probes
 
-  USE decomp_2d, only : ph1, nrank, mytype
+  USE decomp_2d, only : ph1
+  use decomp_2d_mpi, only : nrank
+  use decomp_2d_constants, only : mytype
   USE decomp_2d, only : xstart, xend, ystart, yend, zstart, zend
-  USE decomp_2d, only : decomp_2d_abort
+  USE decomp_2d_mpi, only : decomp_2d_abort
 
   IMPLICIT NONE
 
@@ -73,7 +75,7 @@ subroutine setup_probes()
   !
   subroutine init_probes()
 
-    USE decomp_2d, only : real_type
+    USE decomp_2d_constants, only : real_type
     USE MPI
     USE param, only : dx, dy, dz, nclx, ncly, nclz, xlx, yly, zlz, istret, one, half
     USE param, only : irestart, ifirst
diff --git a/src/schemes.f90 b/src/schemes.f90
index 1891baf82..d60689fdd 100644
--- a/src/schemes.f90
+++ b/src/schemes.f90
@@ -8,6 +8,7 @@ subroutine schemes()
   !
   !********************************************************************
 
+  use decomp_2d_mpi
   USE param
   USE derivX
   USE derivY
@@ -209,7 +210,7 @@ subroutine prepare (b,c,f,s,w,n)
   !
   !*******************************************************************
 
-  use decomp_2d, only : mytype
+  use decomp_2d_constants, only : mytype
   use param, only : one
 
   implicit none
@@ -241,7 +242,8 @@ subroutine first_derivative(alfa1,af1,bf1,cf1,df1,alfa2,af2,alfan,afn,bfn,&
   !
   !*******************************************************************
 
-  use decomp_2d, only : mytype, nrank
+  use decomp_2d_constants, only : mytype
+  use decomp_2d_mpi, only : nrank
   use param
   use MPI
 
@@ -402,7 +404,8 @@ subroutine second_derivative(alsa1,as1,bs1,&
      sf,ss,sw,sfp,ssp,swp,d2,n,ncl1,ncln)
   !*******************************************************************
 
-  use decomp_2d, only : mytype, nrank
+  use decomp_2d_constants, only : mytype
+  use decomp_2d_mpi, only : nrank
   use param
   use MPI
   use variables, only : nu0nu,cnu
@@ -662,7 +665,7 @@ subroutine interpolation(dx,nxm,nx,nclx1,nclxn,&
   !
   !*******************************************************************
 
-  use decomp_2d, only : mytype
+  use decomp_2d_constants, only : mytype
   use param, only : zero, half, one, two, three, four, nine, ten
   use param, only : ipinter, ifirstder
 
diff --git a/src/statistics.f90 b/src/statistics.f90
index 0ca5800e3..13d8c8c9a 100644
--- a/src/statistics.f90
+++ b/src/statistics.f90
@@ -4,6 +4,10 @@
 
 module stats
 
+  use decomp_2d_constants
+  use decomp_2d_mpi
+  use decomp_2d
+
   implicit none
 
   character(len=*), parameter :: io_statistics = "statistics-io", &
@@ -18,7 +22,6 @@ module stats
 
   subroutine init_statistic_adios2
 
-    use decomp_2d, only : mytype
     use decomp_2d_io, only : decomp_2d_register_variable, decomp_2d_init_io
 
     use var, only : numscalar
@@ -147,7 +150,6 @@ subroutine read_or_write_all_stats(flag_read)
 
     use param, only : iscalar, itime
     use variables, only : numscalar
-    use decomp_2d, only : nrank
     use decomp_2d_io, only : decomp_2d_write_mode, decomp_2d_read_mode, &
          decomp_2d_open_io, decomp_2d_close_io, decomp_2d_start_io, decomp_2d_end_io
     use var, only : pmean
@@ -240,7 +242,7 @@ end subroutine read_or_write_all_stats
   !
   subroutine read_or_write_one_stat(flag_read, filename, array)
 
-    use decomp_2d, only : mytype, xstS, xenS
+    use decomp_2d, only : xstS, xenS
     use decomp_2d_io, only : decomp_2d_read_one, decomp_2d_write_one
 
     implicit none
@@ -267,7 +269,6 @@ subroutine overall_statistic(ux1,uy1,uz1,phi1,pp3,ep1)
 
     use param
     use variables
-    use decomp_2d
     use decomp_2d_io
     use tools, only : rescale_pressure
 
@@ -352,7 +353,6 @@ end subroutine overall_statistic
   !
   elemental real(mytype) function one_minus_ep1(var, ep1)
 
-    use decomp_2d, only : mytype
     use param, only : iibm, one
 
     implicit none
@@ -373,7 +373,7 @@ end function one_minus_ep1
   !
   subroutine update_average_scalar(um, ux, ep)
 
-    use decomp_2d, only : mytype, xsize, xstS, xenS, fine_to_coarseS
+    use decomp_2d, only : xsize, xstS, xenS, fine_to_coarseS
     use param, only : itime, initstat,istatfreq
     use var, only : di1, tmean
 
@@ -396,7 +396,7 @@ end subroutine update_average_scalar
   !
   subroutine update_average_vector(um, vm, wm, ux, uy, uz, ep)
 
-    use decomp_2d, only : mytype, xsize, xstS, xenS
+    use decomp_2d, only : xsize, xstS, xenS
 
     implicit none
 
@@ -415,7 +415,7 @@ end subroutine update_average_vector
   !
   subroutine update_variance_vector(uum, vvm, wwm, uvm, uwm, vwm, ux, uy, uz, ep)
 
-    use decomp_2d, only : mytype, xsize, xstS, xenS
+    use decomp_2d, only : xsize, xstS, xenS
 
     implicit none
 
diff --git a/src/time_integrators.f90 b/src/time_integrators.f90
index 133a88db7..624bc12f1 100644
--- a/src/time_integrators.f90
+++ b/src/time_integrators.f90
@@ -4,6 +4,10 @@
 
 module time_integrators
 
+  use decomp_2d_constants
+  use decomp_2d_mpi
+  use decomp_2d
+
   implicit none
 
   private
@@ -16,7 +20,6 @@ subroutine intt(var1,dvar1,npaire,isc,forcing1)
     use MPI
     use param
     use variables
-    use decomp_2d
     use ydiff_implicit, only : inttimp
 
     implicit none
@@ -195,7 +198,6 @@ end subroutine intt
 
   SUBROUTINE int_time(rho1, ux1, uy1, uz1, phi1, drho1, dux1, duy1, duz1, dphi1)
 
-    use decomp_2d, only : mytype, xsize, nrank, real_type
     use param, only : zero, one
     use param, only : ntime, nrhotime, ilmn, iscalar, ilmn_solve_temp,itimescheme
     use param, only : iimplicit, sc_even
@@ -323,7 +325,6 @@ subroutine int_time_momentum(ux1, uy1, uz1, dux1, duy1, duz1)
     USE param
     USE variables
     USE var, ONLY: px1, py1, pz1
-    USE decomp_2d
 
     implicit none
 
@@ -359,7 +360,6 @@ subroutine int_time_continuity(rho1, drho1)
 
     USE param
     USE variables
-    USE decomp_2d
 
     implicit none
 
@@ -425,7 +425,6 @@ subroutine int_time_temperature(rho1, drho1, dphi1, phi1)
 
     USE param
     USE variables
-    USE decomp_2d
 
     USE navier, ONLY : lmn_t_to_rho_trans
     USE var, ONLY : tc1, tb1
diff --git a/src/tools.f90 b/src/tools.f90
index 7372bcec9..924eeb68e 100644
--- a/src/tools.f90
+++ b/src/tools.f90
@@ -4,6 +4,10 @@
 
 module tools
 
+  use decomp_2d_constants
+  use decomp_2d_mpi
+  use decomp_2d
+
   implicit none
 
   logical, save :: adios2_restart_initialised = .false.
@@ -27,7 +31,6 @@ module tools
   !##################################################################
   subroutine test_scalar_min_max(phi)
 
-    use decomp_2d
     use variables
     use param
     use var
@@ -78,7 +81,6 @@ end subroutine test_scalar_min_max
   !##################################################################
   subroutine test_speed_min_max(ux,uy,uz)
 
-    use decomp_2d
     use variables
     use param
     use var
@@ -139,7 +141,6 @@ end subroutine test_speed_min_max
   !##################################################################
   subroutine simu_stats(iwhen)
 
-    use decomp_2d
     use simulation_stats
     use var
     use MPI
@@ -204,7 +205,6 @@ end subroutine simu_stats
   !##############################################################################
   subroutine restart(ux1,uy1,uz1,dux1,duy1,duz1,ep1,pp3,phi1,dphi1,px1,py1,pz1,rho1,drho1,mu1,iresflg)
 
-    use decomp_2d
     use decomp_2d_io
     use variables
     use param
@@ -477,7 +477,6 @@ end subroutine restart
   
   subroutine init_restart_adios2()
 
-    use decomp_2d, only : mytype, phG
     use decomp_2d_io, only : decomp_2d_register_variable, decomp_2d_init_io
     use variables, only : numscalar
     use param, only : ilmn, nrhotime, ntime
@@ -546,7 +545,6 @@ end subroutine init_restart_adios2
   !############################################################################
   subroutine apply_spatial_filter(ux1,uy1,uz1,phi1)
 
-    use decomp_2d
     use param
     use var, only: uxf1,uyf1,uzf1,uxf2,uyf2,uzf2,uxf3,uyf3,uzf3,di1,di2,di3,phif1,phif2,phif3
     use variables
@@ -626,7 +624,6 @@ end subroutine apply_spatial_filter
   !############################################################################
   subroutine init_inflow_outflow()
 
-    use decomp_2d, only : mytype
     use decomp_2d_io, only : decomp_2d_init_io, decomp_2d_register_variable
 
     use param, only : ntimesteps
@@ -647,7 +644,6 @@ end subroutine init_inflow_outflow
   !############################################################################
   subroutine read_inflow(ux1,uy1,uz1,ifileinflow)
 
-    use decomp_2d
     use decomp_2d_io
     use var, only: ux_inflow, uy_inflow, uz_inflow
     use param
@@ -685,7 +681,6 @@ end subroutine read_inflow
   !############################################################################
   subroutine append_outflow(ux,uy,uz,timestep)
  
-    use decomp_2d
     use decomp_2d_io
     use var, only: ux_recoutflow, uy_recoutflow, uz_recoutflow, ilist
     use param
@@ -712,7 +707,6 @@ end subroutine append_outflow
   !############################################################################
   subroutine write_outflow(ifileoutflow)
 
-    use decomp_2d
     use decomp_2d_io
     use param
     use var, only: ux_recoutflow, uy_recoutflow, uz_recoutflow
@@ -755,7 +749,6 @@ subroutine compute_cfldiff()
      use param, only : xnu,dt,dx,dy,dz,istret
      use param, only : cfl_diff_sum, cfl_diff_x, cfl_diff_y, cfl_diff_z
      use variables, only : dyp
-     use decomp_2d, only : nrank
 
      implicit none
 
@@ -789,7 +782,6 @@ end subroutine compute_cfldiff
   !##################################################################
   subroutine compute_cfl(ux,uy,uz)
     use param, only : dx,dy,dz,dt,istret
-    use decomp_2d, only : nrank, mytype, xsize, xstart, xend, real_type
     use mpi
     use variables, only : dyp
 
@@ -852,7 +844,6 @@ end subroutine compute_cfl
   !##################################################################
   elemental subroutine rescale_pressure(pre1)
 
-    use decomp_2d, only : mytype
     use param, only : itimescheme, gdt
     implicit none
 
@@ -930,7 +921,6 @@ end subroutine mean_plane_z
   subroutine rename(oldname, newname, opt_rank)
 
     use MPI
-    use decomp_2d, only : nrank, decomp_2d_abort
     use decomp_2d_io, only : gen_iodir_name
     
     character(len=*), intent(in) :: oldname
@@ -975,7 +965,6 @@ end subroutine rename
   subroutine delete_filedir(name, opt_rank)
 
     use MPI
-    use decomp_2d, only : nrank, decomp_2d_abort
     
     character(len=*), intent(in) :: name
     integer, intent(in), optional :: opt_rank
@@ -1027,7 +1016,6 @@ end subroutine delete_filedir
   logical function validate_restart(refname, testname, opt_rank)
 
     use MPI
-    use decomp_2d, only : nrank, decomp_2d_abort
     use decomp_2d_io, only : gen_iodir_name
     
     character(len=*), intent(in) :: refname
@@ -1100,6 +1088,7 @@ end module tools
 !##################################################################
 subroutine cfl_compute(uxmax,uymax,uzmax)
 
+  use decomp_2d_constants
   use param
   use variables
   use var
@@ -1156,6 +1145,7 @@ end subroutine cfl_compute
 subroutine stretching()
 
   use decomp_2d
+  use decomp_2d_constants
   !use decomp_2d_poisson
   use variables
   use param
@@ -1344,6 +1334,7 @@ end subroutine stretching
 subroutine inversion5_v1(aaa_in,eee,spI)
 
   use decomp_2d
+  use decomp_2d_constants
   !use decomp_2d_poisson
   use variables
   use param
@@ -1487,6 +1478,7 @@ end subroutine inversion5_v1
 subroutine inversion5_v2(aaa,eee,spI)
 
   use decomp_2d
+  use decomp_2d_constants
   !use decomp_2d_poisson
   use variables
   use param
@@ -1628,6 +1620,8 @@ subroutine tripping(tb,ta)
   use param
   use variables
   use decomp_2d
+  use decomp_2d_constants
+  use decomp_2d_mpi
   use mpi
   use dbg_schemes, only: sqrt_prec, sin_prec, exp_prec
 
@@ -1748,6 +1742,8 @@ subroutine tbl_tripping(tb,ta)
   use param
   use variables
   use decomp_2d
+  use decomp_2d_constants
+  use decomp_2d_mpi
   use mpi
   use dbg_schemes, only: sqrt_prec, exp_prec, sin_prec
 
@@ -1918,6 +1914,7 @@ end function cx
 subroutine calc_temp_eos(temp, rho, phi, mweight, xlen, ylen, zlen)
 
   use decomp_2d
+  use decomp_2d_constants
   use param, only : pressure0, imultispecies
   use var, only : numscalar
 
@@ -1946,6 +1943,7 @@ subroutine calc_temp_eos(temp, rho, phi, mweight, xlen, ylen, zlen)
 subroutine calc_rho_eos(rho, temp, phi, mweight, xlen, ylen, zlen)
 
   use decomp_2d
+  use decomp_2d_constants
   use param, only : pressure0, imultispecies
   use var, only : numscalar
 
@@ -1974,6 +1972,7 @@ subroutine calc_rho_eos(rho, temp, phi, mweight, xlen, ylen, zlen)
 subroutine calc_mweight(mweight, phi, xlen, ylen, zlen)
 
   use decomp_2d
+  use decomp_2d_constants
   use param, only : zero, one
   use param, only : massfrac, mol_weight
   use var, only : numscalar
@@ -2003,6 +2002,8 @@ subroutine test_min_max(name,text,array_tmp,i_size_array_tmp)
   use param
   use variables
   use decomp_2d
+  use decomp_2d_constants
+  use decomp_2d_mpi
   use MPI
 
   implicit none
@@ -2030,7 +2031,7 @@ subroutine test_min_max(name,text,array_tmp,i_size_array_tmp)
      write(*,*) trim(text)//' Tot ',name,tot_tot
      write(*,*) trim(text)//' Min ',name,min_tot
      write(*,*) " "
-     call flush(6)
+     flush(6)
   endif
 
   return
diff --git a/src/transeq.f90 b/src/transeq.f90
index c392679aa..fea8059a3 100644
--- a/src/transeq.f90
+++ b/src/transeq.f90
@@ -4,6 +4,9 @@
 
 module transeq
 
+  use decomp_2d_constants
+  use decomp_2d_mpi
+
   private
   public :: calculate_transeq_rhs
 
@@ -16,7 +19,7 @@ module transeq
   !############################################################################
   subroutine calculate_transeq_rhs(drho1,dux1,duy1,duz1,dphi1,rho1,ux1,uy1,uz1,ep1,phi1,divu3)
 
-    use decomp_2d, only : mytype, xsize, zsize
+    use decomp_2d, only : xsize, zsize
     use variables, only : numscalar
     use param, only : ntime, ilmn, nrhotime, ilmn_solve_temp
 
@@ -1176,7 +1179,7 @@ end subroutine temperature_rhs_eq
   !############################################################################
   subroutine continuity_rhs_eq(drho1, rho1, ux1, divu3)
 
-    use decomp_2d, only : mytype, xsize, ysize, zsize
+    use decomp_2d, only : xsize, ysize, zsize
     use decomp_2d, only : transpose_z_to_y, transpose_y_to_x
     use param, only : ntime, nrhotime, ibirman_eos, zero
     use param, only : xnu, prandtl
diff --git a/src/variables.f90 b/src/variables.f90
index 18b9d9a80..fa549a7d8 100644
--- a/src/variables.f90
+++ b/src/variables.f90
@@ -4,7 +4,9 @@
 
 module var
 
+  use decomp_2d_constants
   use decomp_2d
+  use decomp_2d_mpi
   USE variables
   USE param
   USE complex_geometry
diff --git a/src/visu.f90 b/src/visu.f90
index 8e0ca9cbe..336738f0c 100644
--- a/src/visu.f90
+++ b/src/visu.f90
@@ -4,6 +4,10 @@
 
 module visu
   
+  use decomp_2d_constants
+  use decomp_2d_mpi
+  use decomp_2d
+
   implicit none
 
   ! True to activate the XDMF output
@@ -37,7 +41,6 @@ subroutine visu_init()
     use param, only : ilmn, iscalar, ilast, ifirst, ioutput, istret
     use variables, only : numscalar, prec, nvisu
     use param, only : dx, dy, dz
-    use decomp_2d, only : nrank, mytype, xszV, yszV, zszV, xsize, ysize, zsize
     use decomp_2d_io, only : decomp_2d_init_io, decomp_2d_open_io, decomp_2d_append_mode
     use decomp_2d_io, only : decomp_2d_register_variable
 
@@ -166,9 +169,6 @@ end subroutine visu_finalise
   !
   subroutine write_snapshot(rho1, ux1, uy1, uz1, pp3, phi1, ep1, itime, num)
 
-    use decomp_2d, only : transpose_z_to_y, transpose_y_to_x
-    use decomp_2d, only : mytype, xsize, ysize, zsize
-    use decomp_2d, only : nrank
     use decomp_2d_io, only : decomp_2d_start_io
 
     use param, only : nrhotime, ilmn, iscalar, ioutput, irestart
@@ -271,7 +271,6 @@ end subroutine write_snapshot
 
   subroutine end_snapshot(itime, num)
 
-    use decomp_2d, only : nrank
     use decomp_2d_io, only : decomp_2d_end_io
     use param, only : istret, xlx, yly, zlz
     use variables, only : nx, ny, nz, beta
@@ -339,7 +338,6 @@ subroutine write_xdmf_header(pathname, filename, num)
 
     use variables, only : nvisu, yp
     use param, only : dx,dy,dz,istret
-    use decomp_2d, only : mytype, nrank, xszV, yszV, zszV, ystV
 
     implicit none
 
@@ -427,7 +425,6 @@ end subroutine write_xdmf_header
 
   subroutine write_xdmf_topo()
 
-    use decomp_2d, only : xszV, yszV, zszV
     use param, only : istret
     
     implicit none
@@ -462,8 +459,6 @@ end subroutine write_xdmf_topo
   !
   subroutine write_xdmf_footer()
 
-    use decomp_2d, only : nrank
-
     implicit none
 
     if (nrank.eq.0) then
@@ -488,8 +483,6 @@ subroutine write_field(f1, pathname, filename, num, skip_ibm, flush)
     use var, only : zero, one
     use var, only : uvisu
     use param, only : iibm
-    use decomp_2d, only : mytype, xsize, xszV, yszV, zszV
-    use decomp_2d, only : nrank, fine_to_coarseV
     use decomp_2d_io, only : decomp_2d_write_one, decomp_2d_write_plane
 
     implicit none
diff --git a/src/xcompact3d.f90 b/src/xcompact3d.f90
index 95beed8b2..67e8a2583 100644
--- a/src/xcompact3d.f90
+++ b/src/xcompact3d.f90
@@ -268,6 +268,7 @@ subroutine init_xcompact3d()
 subroutine finalise_xcompact3d()
 
   use MPI
+  use decomp_2d_mpi
   use decomp_2d
   use decomp_2d_io, only : decomp_2d_io_finalise
 
@@ -306,7 +307,7 @@ subroutine finalise_xcompact3d()
 
 subroutine check_transients()
 
-  use decomp_2d, only : mytype
+  use decomp_2d_constants, only : mytype
   use mpi
   use var
   
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
new file mode 100644
index 000000000..ab16d1a40
--- /dev/null
+++ b/test/CMakeLists.txt
@@ -0,0 +1,8 @@
+install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples)
+
+# if testing active create a working dir for testing
+set(test_dir "${PROJECT_BINARY_DIR}/RunTests")
+file(MAKE_DIRECTORY ${test_dir})
+# TGV is from here
+add_subdirectory(data/Taylor-Green-Vortex)
+
diff --git a/test/data/Taylor-Green-Vortex/CMakeLists.txt b/test/data/Taylor-Green-Vortex/CMakeLists.txt
new file mode 100644
index 000000000..6101ab1f8
--- /dev/null
+++ b/test/data/Taylor-Green-Vortex/CMakeLists.txt
@@ -0,0 +1,14 @@
+# TGV
+set(case "TGV")
+set(case_dir "${test_dir}/${case}")
+file(MAKE_DIRECTORY ${case_dir})
+set(input_file "reference_input.i3d")
+install(DIRECTORY DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES ${input_file} DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES reference_time_evol.dat DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+install(FILES ../../compare_TGV_time_evolution.py DESTINATION ${CMAKE_INSTALL_PREFIX}/examples/${case})
+# If testing active add test for TGV case
+file(COPY ${input_file} DESTINATION ${case_dir})
+file(COPY reference_time_evol.dat DESTINATION ${case_dir})
+file(COPY ../../compare_TGV_time_evolution.py DESTINATION ${case_dir})
+add_test(NAME ${case} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:xcompact3d> ${input_file} WORKING_DIRECTORY ${case_dir})