Merge branch 'parallel-read-torus-from-exo' into 'master'

etphipp · etphipp · commit 7c6f74d456cc · 2024-11-13T21:36:32.000Z
Parallel torus read

See merge request etphipp/genten!53
diff --git a/python/physics_utils/decompose_torus.py b/python/physics_utils/decompose_torus.py
@@ -0,0 +1,45 @@
+# example usage:
+#   decomp --processors [num_procs] --subdir exo_files exo_files/test_larger_kappa_perp.exo
+#   mpiexec -n [num_procs] python3 decompose_torus.py exo_files/test_larger_kappa_perp.exo z
+
+import os
+import sys
+import exodus3 as ex
+import numpy as np
+import pygenten as gt
+import pyttb
+sys.path.append('..')
+import torus_to_tensor
+
+if __name__ == "__main__":
+   assert len(sys.argv) >= 3, "usage: decompose_torus.py <exodus_base_name> <axis_of_rotation>"
+   num_procs = gt.num_procs()
+   base_filename = sys.argv[1]
+   axis = sys.argv[2]
+   tol = 1.0e-10
+   if len(sys.argv) >= 4:
+     tol = np.double(sys.argv[3])
+
+   num_procs_per_poloidal_plane = num_procs
+   np_tensor, global_blocking, parallel_map = torus_to_tensor.torus_to_tensor(base_filename, axis, num_procs_per_poloidal_plane, tol)
+   shape = np_tensor.shape
+   print("Dimensions on proc " + str(gt.proc_rank()) + ":")
+   print("  Num toroidal:  "+str(shape[0]))
+   print("  Num poloidal:  "+str(shape[1]))
+   print("  Num variables: "+str(shape[2]))
+   print("  Num times:     "+str(shape[3]))
+
+   ttb_tensor = pyttb.tensor.from_data(np_tensor)
+   gt_tensor = gt.make_gt_tensor(ttb_tensor)
+
+   gt_dist_tensor, gt_dtc = gt.distribute_tensor(gt_tensor, global_blocking, parallel_map)
+   del(ttb_tensor)
+   del(gt_tensor)
+   u,perf = gt.cp_als(gt_dist_tensor, dtc=gt_dtc, rank=16, maxiters=200, tol=1e-4, seed=12345, dist_guess_method="parallel")
+   del(u)
+   del(perf)
+   del(gt_dist_tensor)
+
+
+
+
diff --git a/python/physics_utils/test_torus_to_tensor.py b/python/physics_utils/test_torus_to_tensor.py
@@ -4,9 +4,8 @@
 import sys
 import exodus3 as ex
 import numpy as np
-import _pygenten as gt
+import pygenten as gt
 import _phys_utils as pu
-sys.path.append('..')
 import torus_to_tensor
 import unittest
 
diff --git a/python/physics_utils/torus_to_tensor.py b/python/physics_utils/torus_to_tensor.py
@@ -24,14 +24,15 @@
 import sys
 import exodus3 as ex
 import numpy as np
-import _pygenten as gt
-import _phys_utils as pu
+import pygenten as gt
+import pygenten._phys_utils as pu
 
 def torus_to_tensor(base_filename, axis, num_procs_per_poloidal_plane,tol=1.0e-10):
     # error check the processor decomposition
     num_procs = gt.num_procs()
-    if num_procs%num_procs_per_poloidal_plane > 0:
-      raise Exception('Invalid num_procs_per_poloidal_plane (' + str(num_procs_per_poloidal_plane) + '): must divide num MPI ranks (' + str(num_procs) + ') evenly')
+    if num_procs_per_poloidal_plane > 0:
+      if num_procs%num_procs_per_poloidal_plane > 0:
+        raise Exception('Invalid num_procs_per_poloidal_plane (' + str(num_procs_per_poloidal_plane) + '): must divide num MPI ranks (' + str(num_procs) + ') evenly')
 
     # get filename for this proc
     rank = gt.proc_rank()
@@ -74,19 +75,21 @@ def torus_to_tensor(base_filename, axis, num_procs_per_poloidal_plane,tol=1.0e-1
     total_ref_nodes = pu.global_go_sum(num_ref_nodes)
     total_nodes     = pu.global_go_sum(num_nodes)
     total_thetas    = len(unique_thetas)
+    num_theta_procs = -1
     if total_nodes != total_ref_nodes*total_thetas:
       msg = 'Toroidal decomposition failure: total_nodes != total_ref_nodes*total_thetas'
       msg = msg + ' (' + str(total_nodes) + ' != ' + str(total_ref_nodes) + '*' + str(total_thetas) + ')'
       raise Exception(msg)
-    if num_procs_per_poloidal_plane > total_ref_nodes:
-      msg = 'Invalid num_procs_per_poloidal_plane (' + str(num_procs_per_poloidal_plane) + '): '
-      msg = msg + 'must be less than number of nodes per poloidal plane (' + str(total_ref_nodes) +')'
-      raise Exception(msg)
-    num_procs_per_theta = num_procs//num_procs_per_poloidal_plane
-    if num_procs_per_theta > total_thetas:
-      msg = 'Invalid num_procs_per_theta (num_procs/num_procs_per_poloidal_plane = ' + str(num_procs_per_theta) + '): '
-      msg = msg + 'must be less than number of poloidal planes in the mesh (' + str(total_thetas) +')'
-      raise Exception(msg)
+    if num_procs_per_poloidal_plane > 0:  # non-positive values leave num_theta_procs at -1
+      if num_procs_per_poloidal_plane > total_ref_nodes:
+        msg = 'Invalid num_procs_per_poloidal_plane (' + str(num_procs_per_poloidal_plane) + '): '
+        msg = msg + 'must be less than number of nodes per poloidal plane (' + str(total_ref_nodes) +')'
+        raise Exception(msg)
+      num_theta_procs = num_procs//num_procs_per_poloidal_plane
+      if num_theta_procs > total_thetas:
+        msg = 'Invalid num_theta_procs (num_procs/num_procs_per_poloidal_plane = ' + str(num_theta_procs) + '): '
+        msg = msg + 'must be less than number of poloidal planes in the mesh (' + str(total_thetas) +')'
+        raise Exception(msg)
 
     # get the associated gids and r and a values
     ref_gids = np.zeros(num_ref_nodes, dtype=np.longlong)
@@ -124,11 +127,20 @@ def torus_to_tensor(base_filename, axis, num_procs_per_poloidal_plane,tol=1.0e-1
       cids[i] = tids[i]*total_ref_nodes + rids[i]
 
     # determine which cids should be on which procs based on the user defined decomposition
-    redistributed_cids = distribute_composite_ids_across_procs(num_procs_per_theta,total_thetas,num_procs_per_poloidal_plane,total_ref_nodes)
+    redistributed_cids = -1*np.ones(0, dtype=np.longlong)
+    global_blocking = []
+    parallel_map = []
+    if num_procs_per_poloidal_plane > 0:
+      redistributed_cids, global_blocking = distribute_composite_ids_across_procs(num_theta_procs,total_thetas,num_procs_per_poloidal_plane,total_ref_nodes)
+    else:
+      redistributed_cids = distribute_composite_ids_to_root(num_theta_procs,total_thetas,num_procs_per_poloidal_plane,total_ref_nodes)
     redistributed_num_nodes = len(redistributed_cids)
 
     # use cids to redistribute data across procs
     redistributed_node_data = pu.redistribute_data_across_procs(cids,node_data,redistributed_cids)
+    tensor = np.zeros((0,0,0,0), dtype=np.double)
+    if len(redistributed_node_data) == 0:
+      return tensor, global_blocking, parallel_map  # same 3-tuple as the final return so callers can unpack
 
     # back out tids and rids from redistributed cids
     redistributed_tids = -1*np.ones(redistributed_num_nodes, dtype=np.longlong)
@@ -149,7 +161,18 @@ def torus_to_tensor(base_filename, axis, num_procs_per_poloidal_plane,tol=1.0e-1
       for v in range(num_vars):
         for t in range(num_times):
           tensor[redistributed_tids[i]-start_tid, redistributed_rids[i]-start_rid, v, t] = redistributed_node_data[i,t*num_vars+v]
-    return tensor 
+
+    # add var and time dimensions to global blocking
+    if num_procs_per_poloidal_plane > 0:
+      vt_blocking = np.zeros((2,num_procs+1),dtype=np.longlong)
+      vt_blocking[0,1] = num_vars
+      vt_blocking[1,1] = num_times
+      global_blocking = np.vstack([global_blocking,vt_blocking])
+      parallel_map = np.ones((4),dtype=np.longlong)
+      parallel_map[0] = num_theta_procs
+      parallel_map[1] = num_procs_per_poloidal_plane
+
+    return tensor, global_blocking, parallel_map 
 
 def get_cylindrical_coordinates(x, y, z, axis, tol):
    """
@@ -326,6 +349,32 @@ def distribute_composite_ids_across_procs(num_procs_x,total_x,num_procs_y,total_
    for i in range(num_target_xids):
      for j in range(num_target_yids):
        target_cids[i*num_target_yids+j] = (start_xid+i)*total_y + (start_yid+j)
+
+   # create the global blocking array needed for dist tensor context
+   global_blocking = np.zeros((2, num_procs+1), dtype=np.longlong)
+   for i in range(num_procs_x):
+     global_blocking[0,i+1] = global_blocking[0,i] + total_x//num_procs_x
+     if i < total_x%num_procs_x:
+       global_blocking[0,i+1] += 1
+   for i in range(num_procs_y):
+     global_blocking[1,i+1] = global_blocking[1,i] + total_y//num_procs_y
+     if i < total_y%num_procs_y:
+       global_blocking[1,i+1] += 1
+
+   return target_cids, global_blocking
+
+def distribute_composite_ids_to_root(num_procs_x,total_x,num_procs_y,total_y):
+   """
+   Return the composite ids owned by this rank when everything is assigned to
+   rank 0: every id 0 .. total_x*total_y-1 on rank 0, an empty array elsewhere.
+   num_procs_x and num_procs_y are accepted but not used.
+   """
+
+   # ranks other than 0 keep this empty array
+   target_cids = -1*np.ones(0, dtype=np.longlong)
+   if gt.proc_rank() == 0:
+      # np.arange excludes its stop value, so stop at total_x*total_y to include the last id
+      target_cids = np.arange(0, total_x*total_y, 1, dtype=np.longlong)
    return target_cids
 
 if __name__ == "__main__":
@@ -340,7 +389,7 @@ def distribute_composite_ids_across_procs(num_procs_x,total_x,num_procs_y,total_
    tol = 1.0e-10
    if len(sys.argv) >= 5:
      tol = np.double(sys.argv[4])
-   tensor = torus_to_tensor(base_filename, axis, num_procs_per_poloidal_plane, tol)
+   tensor, global_blocking, parallel_map = torus_to_tensor(base_filename, axis, num_procs_per_poloidal_plane, tol)
    gt.finalizeGenten()
 
 
diff --git a/python/pygenten/utils.py b/python/pygenten/utils.py
@@ -4,6 +4,7 @@
 
 import pygenten._pygenten as gt
 import json
+import numpy
 
 def make_algparams(args):
     """
@@ -73,13 +74,13 @@ def read_and_distribute_tensor(filename, file_type=None, format=None, shape=None
         return Xs,dtc
     return Xd,dtc
 
-def distribute_tensor(X, **kwargs):
+def distribute_tensor(X, global_blocking=numpy.empty(0), parallel_map=numpy.empty(0), **kwargs):
     """
     Distribute the given tensor in parallel and return the result.
 
     Also returns the distributed tensor context.
     """
     a,rem = make_algparams(kwargs)
     dtc = gt.DistTensorContext()
-    XX = dtc.distributeTensor(X, a)
+    XX = dtc.distributeTensor(X, a, global_blocking, parallel_map)
     return XX,dtc
diff --git a/python/src/Genten_Pybind11_classes.cpp b/python/src/Genten_Pybind11_classes.cpp
@@ -585,7 +585,7 @@ void pygenten_tensor(py::module &m){
      Constructor that returns an empty tensor.)");
     cl.def(py::init([](const py::buffer& b, const bool copy=true) {
         // Initialize a Genten::Tensor from a numpy array
-        py::buffer_info info = b.request();
+	py::buffer_info info = b.request();
 
         if (info.format != py::format_descriptor<ttb_real>::format())
           throw std::runtime_error("Incompatible format: expected a ttb_real array!");
@@ -983,11 +983,43 @@ void pygenten_sptensor(py::module &m){
       }, R"(
     Distribute a given sparse tensor in parallel and return the distributed
     tensor.)");
-    cl.def("distributeTensor", [](Genten::DTC& dtc, const Genten::Tensor& X, const Genten::AlgParams& algParams) {
-        return dtc.distributeTensor(X, algParams);
+    cl.def("distributeTensor", [](Genten::DTC& dtc,
+                                  const Genten::Tensor& X,
+                                  const Genten::AlgParams& algParams,
+                                  const py::array_t<ttb_indx>& global_blocking_in,
+                                  const py::array_t<ttb_indx>& parallel_map_in) {
+	if(global_blocking_in.size() == 0)
+	{
+	  // the tensor hasn't already been distributed
+          return dtc.distributeTensor(X, algParams);
+	}
+	else
+	{
+	  // the tensor is already distributed and we can pass its global blocking after translating it into  a vector of vectors
+      	  using Genten::small_vector;
+          std::vector<small_vector<ttb_indx>> global_blocking;
+          for(py::ssize_t i = 0; i < global_blocking_in.shape(0); i++)
+	  {
+             small_vector<ttb_indx> variable_blocking;
+	     variable_blocking.push_back(global_blocking_in.at(i,0));
+	     for(py::ssize_t j = 1; j < global_blocking_in.shape(1); j++)
+	       if(global_blocking_in.at(i,j) > 0)
+	         variable_blocking.push_back(global_blocking_in.at(i,j));
+             global_blocking.push_back(variable_blocking);
+	  }
+          small_vector<ttb_indx> parallel_map(parallel_map_in.shape(0));
+          for(py::ssize_t i = 0; i < parallel_map_in.shape(0); i++)
+	  {
+            parallel_map[i] = parallel_map_in.at(i);
+          }
+          return dtc.distributeTensor(X, global_blocking, parallel_map, algParams);
+	}
       }, R"(
     Distribute a given dense tensor in parallel and return the distributed
-    tensor.)");
+    tensor.)", py::arg("X"),
+               py::arg("algParams"),
+               py::arg("global_blocking_in") = py::array_t<ttb_indx>(0),
+               py::arg("parallel_map_in") = py::array_t<ttb_indx>(0));
     cl.def("importToRoot", [](const Genten::DTC& dtc, const Genten::Sptensor& u) {
         return dtc.importToRoot<Genten::DefaultHostExecutionSpace>(u);
       }, R"(
diff --git a/src/Genten_DistTensorContext.cpp b/src/Genten_DistTensorContext.cpp
@@ -607,7 +607,7 @@ distributeTensorImpl(const Sptensor& X, const AlgParams& algParams)
 template <typename ExecSpace>
 TensorT<ExecSpace>
 DistTensorContext<ExecSpace>::
-distributeTensorImpl(const Tensor& X, const AlgParams& algParams)
+distributeTensorImpl(const Tensor& X, const AlgParams& algParams, const std::vector<small_vector<ttb_indx>>& global_blocking, const small_vector<ttb_indx>& parallel_map)
 {
   ttb_indx ndims = X.ndims();
   auto layout = X.getLayout();
@@ -633,6 +633,10 @@ distributeTensorImpl(const Tensor& X, const AlgParams& algParams)
   ndims = max_ndims;
 #endif
 
+  std::vector<ttb_real> Tvec;
+  ttb_indx nnz;
+  ttb_indx offset = 0;
+
   // Check if we have already distributed a tensor, in which case this one
   // needs to be of the same size
   if (global_dims_.size() > 0) {
@@ -650,44 +654,61 @@ distributeTensorImpl(const Tensor& X, const AlgParams& algParams)
       global_dims_[i] = X.ndims() > 0 ? X.size(i) : 0;
 
 #ifdef HAVE_DIST
-    std::vector<ttb_indx> max_global_dims = global_dims_;
-    MPI_Allreduce(MPI_IN_PLACE, max_global_dims.data(), ndims,
-                  DistContext::toMpiType<ttb_indx>(), MPI_MAX,
-                  DistContext::commWorld());
-    if (X.ndims() > 0 && max_global_dims != global_dims_)
-      Genten::error("Tensor dimensions are not consistent across processors!");
-    global_dims_ = max_global_dims;
+    if(global_blocking.size() == 0) {
+      std::vector<ttb_indx> max_global_dims = global_dims_;
+      MPI_Allreduce(MPI_IN_PLACE, max_global_dims.data(), ndims,
+                    DistContext::toMpiType<ttb_indx>(), MPI_MAX,
+                    DistContext::commWorld());
+      if (X.ndims() > 0 && max_global_dims != global_dims_)
+        Genten::error("Tensor dimensions are not consistent across processors!");
+      global_dims_ = max_global_dims;
+    }
 #endif
 
-    if (algParams.proc_grid.size() > 0) {
-      gt_assert(algParams.proc_grid.size() == ndims);
-      small_vector<ttb_indx> grid(ndims);
-      for (ttb_indx i=0; i<ndims; ++i)
-        grid[i] = algParams.proc_grid[i];
+    if(global_blocking.size() > 0) { 
+      global_blocking_ = global_blocking;
       pmap_ = std::shared_ptr<ProcessorMap>(new ProcessorMap(global_dims_,
-                                                             grid,
+                                                             parallel_map,
                                                              dist_method));
+      DistContext::Barrier();
+      nnz = pmap_->gridAllReduce(X.nnz(), ProcessorMap::Sum);
+      Tvec = std::vector<ttb_real>(X.getValues().size());
+      for(ttb_indx i = 0; i < X.getValues().size(); i++)
+        Tvec[i] = X.getValues()[i];
     }
     else
-      pmap_ = std::shared_ptr<ProcessorMap>(new ProcessorMap(global_dims_,
-                                                             dist_method));
+    {
+      if (algParams.proc_grid.size() > 0) {
+        gt_assert(algParams.proc_grid.size() == ndims);
+        small_vector<ttb_indx> grid(ndims);
+        for (ttb_indx i=0; i<ndims; ++i)
+          grid[i] = algParams.proc_grid[i];
+        pmap_ = std::shared_ptr<ProcessorMap>(new ProcessorMap(global_dims_,
+                                                               grid,
+                                                               dist_method));
+      }
+      else
+        pmap_ = std::shared_ptr<ProcessorMap>(new ProcessorMap(global_dims_,
+                                                               dist_method));
 
-    detail::printGrids(*pmap_);
+      global_blocking_ =
+        detail::generateUniformBlocking(global_dims_, pmap_->gridDims());
 
-    global_blocking_ =
-      detail::generateUniformBlocking(global_dims_, pmap_->gridDims());
+      DistContext::Barrier();
+
+      nnz = pmap_->gridAllReduce(X.nnz(), ProcessorMap::Max);
+      Tvec = detail::distributeTensorToVectorsDense(
+        X, nnz, pmap_->gridComm(), pmap_->gridRank(), pmap_->gridSize(), offset);
+    }
+    detail::printGrids(*pmap_);
 
     detail::printBlocking(*pmap_, global_blocking_);
     DistContext::Barrier();
   }
 
-  ttb_indx nnz = pmap_->gridAllReduce(X.nnz(), ProcessorMap::Max);
-  ttb_indx offset = 0;
-  auto Tvec = detail::distributeTensorToVectorsDense(
-    X, nnz, pmap_->gridComm(), pmap_->gridRank(), pmap_->gridSize(), offset);
-
+  const bool redistribute_needed = global_blocking.size() == 0;
   return distributeTensorData(Tvec, nnz, offset, global_dims_, global_blocking_,
-                              layout, *pmap_, algParams);
+                              layout, *pmap_, algParams, redistribute_needed);
 }
 
 template <typename ExecSpace>
diff --git a/src/Genten_DistTensorContext.hpp b/src/Genten_DistTensorContext.hpp